Example #1
def get_model_v2(albert_config, max_seq_length, init_checkpoint, learning_rate,
                 start_n_top, end_n_top, dropout, num_train_steps, num_warmup_steps):
    """Returns keras model"""

    squad_model = ALBertQAModel(
        albert_config, max_seq_length, init_checkpoint, start_n_top, end_n_top, dropout)

    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=learning_rate,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(initial_learning_rate=learning_rate,
                                  decay_schedule_fn=learning_rate_fn,
                                  warmup_steps=num_warmup_steps)

    if FLAGS.optimizer == "LAMB":
        optimizer_fn = LAMB
    else:
        optimizer_fn = AdamWeightDecay

    optimizer = optimizer_fn(
        learning_rate=learning_rate_fn,
        weight_decay_rate=FLAGS.weight_decay,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=FLAGS.adam_epsilon,
        exclude_from_weight_decay=['layer_norm', 'bias'])

    squad_model.optimizer = optimizer

    return squad_model
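
get_model_v2 wraps a PolynomialDecay schedule in a WarmUp object before handing it to the optimizer, a pattern the later examples repeat. WarmUp itself is not shown here; the following is a minimal sketch of how such a wrapper is commonly implemented with standard Keras APIs. The class name LinearWarmUp and the exact hand-off behavior at the warmup boundary are assumptions, not the WarmUp class used above.

import tensorflow as tf

class LinearWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Sketch: ramp the LR linearly, then follow the wrapped decay schedule."""

    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.decay_schedule_fn = decay_schedule_fn
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        warmup_lr = self.initial_learning_rate * (step / warmup_steps)
        # After warmup, delegate to the wrapped decay schedule.
        return tf.cond(step < warmup_steps,
                       lambda: warmup_lr,
                       lambda: self.decay_schedule_fn(step))

    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "warmup_steps": self.warmup_steps,
        }

# Usage mirroring the examples: decay over the full run, warm up for the first steps.
decay = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=3e-5, decay_steps=10000, end_learning_rate=0.0)
schedule = LinearWarmUp(3e-5, decay, warmup_steps=1000)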
Example #2
def get_model(albert_config, max_seq_length, num_labels, init_checkpoint,
              learning_rate, num_train_steps, num_warmup_steps,
              loss_multiplier):
    """Returns keras fuctional model"""
    float_type = tf.float32
    hidden_dropout_prob = FLAGS.classifier_dropout  # as per the original released code
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           dtype=tf.int32,
                                           name='input_word_ids')
    input_mask = tf.keras.layers.Input(shape=(max_seq_length, ),
                                       dtype=tf.int32,
                                       name='input_mask')
    input_type_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           dtype=tf.int32,
                                           name='input_type_ids')

    albert_layer = AlbertModel(config=albert_config, float_type=float_type)

    pooled_output, _ = albert_layer(input_word_ids, input_mask, input_type_ids)

    albert_model = tf.keras.Model(
        inputs=[input_word_ids, input_mask, input_type_ids],
        outputs=[pooled_output])

    albert_model.load_weights(init_checkpoint)

    initializer = tf.keras.initializers.TruncatedNormal(
        stddev=albert_config.initializer_range)

    output = tf.keras.layers.Dropout(rate=hidden_dropout_prob)(pooled_output)

    output = tf.keras.layers.Dense(num_labels,
                                   kernel_initializer=initializer,
                                   name='output',
                                   dtype=float_type)(output)
    model = tf.keras.Model(inputs={
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': input_type_ids
    },
                           outputs=output)

    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=learning_rate,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(initial_learning_rate=learning_rate,
                                  decay_schedule_fn=learning_rate_fn,
                                  warmup_steps=num_warmup_steps)
    if FLAGS.optimizer == "LAMB":
        optimizer_fn = LAMB
    else:
        optimizer_fn = AdamWeightDecay

    optimizer = optimizer_fn(learning_rate=learning_rate_fn,
                             weight_decay_rate=FLAGS.weight_decay,
                             beta_1=0.9,
                             beta_2=0.999,
                             epsilon=FLAGS.adam_epsilon,
                             exclude_from_weight_decay=['layer_norm', 'bias'])

    if FLAGS.task_name.lower() == 'sts':
        loss_fct = tf.keras.losses.MeanSquaredError()
        model.compile(optimizer=optimizer, loss=loss_fct, metrics=['mse'])
    else:
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True)
        model.compile(optimizer=optimizer, loss=loss_fct, metrics=['accuracy'])

    return model
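
get_model builds the classifier head functionally and compiles with a task-dependent loss: mean squared error for the STS regression task, sparse categorical cross-entropy over logits otherwise. Below is a condensed, self-contained sketch of the same pattern, with a toy embedding encoder standing in for AlbertModel; all names and sizes here are illustrative assumptions.

import tensorflow as tf

max_seq_length, vocab_size, num_labels = 128, 30000, 3  # illustrative values

input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name='input_word_ids')
# Toy stand-in for the ALBERT encoder: embed tokens and mean-pool them.
embeddings = tf.keras.layers.Embedding(vocab_size, 64)(input_word_ids)
pooled_output = tf.keras.layers.GlobalAveragePooling1D()(embeddings)

output = tf.keras.layers.Dropout(rate=0.1)(pooled_output)
output = tf.keras.layers.Dense(
    num_labels,
    kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    name='output')(output)

model = tf.keras.Model(inputs={'input_word_ids': input_word_ids}, outputs=output)
# Classification branch: the head emits logits, so from_logits=True.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])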
Example #3
def get_model_v1(albert_config, max_seq_length, init_checkpoint, learning_rate,
                 num_train_steps, num_warmup_steps):
    """Returns keras fuctional model"""
    float_type = tf.float32
    # hidden_dropout_prob = 0.9  # as per the original released code
    unique_ids = tf.keras.layers.Input(
        shape=(1,), dtype=tf.int32, name='unique_ids')
    input_word_ids = tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')

    albert_layer = AlbertModel(config=albert_config, float_type=float_type)

    _, sequence_output = albert_layer(
        input_word_ids, input_mask, input_type_ids)

    albert_model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids],
                                  outputs=[sequence_output])

    if init_checkpoint is not None:
        albert_model.load_weights(init_checkpoint)

    initializer = tf.keras.initializers.TruncatedNormal(
        stddev=albert_config.initializer_range)

    squad_logits_layer = ALBertSquadLogitsLayer(
        initializer=initializer, float_type=float_type, name='squad_logits')

    start_logits, end_logits = squad_logits_layer(sequence_output)

    squad_model = tf.keras.Model(
        inputs={
            'unique_ids': unique_ids,
            'input_ids': input_word_ids,
            'input_mask': input_mask,
            'segment_ids': input_type_ids,
        },
        outputs=[unique_ids, start_logits, end_logits],
        name='squad_model')

    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=learning_rate,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(initial_learning_rate=learning_rate,
                                  decay_schedule_fn=learning_rate_fn,
                                  warmup_steps=num_warmup_steps)

    if FLAGS.optimizer == "LAMB":
        optimizer_fn = LAMB
    else:
        optimizer_fn = AdamWeightDecay

    optimizer = optimizer_fn(
        learning_rate=learning_rate_fn,
        weight_decay_rate=FLAGS.weight_decay,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=FLAGS.adam_epsilon,
        exclude_from_weight_decay=['layer_norm', 'bias'])

    squad_model.optimizer = optimizer

    return squad_model
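
get_model_v1 relies on ALBertSquadLogitsLayer, which is not shown here; its usual role is to project each token's hidden state to a start logit and an end logit. A minimal sketch of that projection, under that assumption:

import tensorflow as tf

# Toy sequence output: (batch, seq_len, hidden) as produced by the encoder.
sequence_output = tf.random.normal((2, 128, 312))

# Project every position to two scores, then split into start/end logits.
logits = tf.keras.layers.Dense(2, name='squad_logits_sketch')(sequence_output)
start_logits, end_logits = tf.split(logits, num_or_size_splits=2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)  # (batch, seq_len)
end_logits = tf.squeeze(end_logits, axis=-1)      # (batch, seq_len)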
Example #4
def run_customized_training(strategy, albert_config, max_seq_length,
                            max_predictions_per_seq, model_dir,
                            steps_per_epoch, steps_per_loop, epochs,
                            initial_lr, warmup_steps, input_files,
                            train_batch_size):
    """Run BERT pretrain model training using low-level API."""

    train_input_fn = functools.partial(get_pretrain_input_data, input_files,
                                       max_seq_length, max_predictions_per_seq,
                                       train_batch_size, strategy)

    with strategy.scope():
        pretrain_model, core_model = albert_model.pretrain_model(
            albert_config, max_seq_length, max_predictions_per_seq)

        if FLAGS.init_checkpoint:
            logging.info(
                f"pre-trained weights loaded from {FLAGS.init_checkpoint}")
            pretrain_model.load_weights(FLAGS.init_checkpoint)

        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=initial_lr,
            decay_steps=int(steps_per_epoch * epochs),
            end_learning_rate=0.0)

        if warmup_steps:
            learning_rate_fn = WarmUp(initial_learning_rate=initial_lr,
                                      decay_schedule_fn=learning_rate_fn,
                                      warmup_steps=warmup_steps)
        if FLAGS.optimizer == "lamp":
            optimizer_fn = LAMB
        else:
            optimizer_fn = AdamWeightDecay

        optimizer = optimizer_fn(
            learning_rate=learning_rate_fn,
            weight_decay_rate=FLAGS.weight_decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=FLAGS.adam_epsilon,
            exclude_from_weight_decay=['layer_norm', 'bias'])
        pretrain_model.optimizer = optimizer

    trained_model = run_customized_training_loop(
        strategy=strategy,
        model=pretrain_model,
        loss_fn=get_loss_fn(loss_factor=1.0 / strategy.num_replicas_in_sync),
        model_dir=model_dir,
        train_input_fn=train_input_fn,
        steps_per_epoch=steps_per_epoch,
        steps_per_loop=steps_per_loop,
        epochs=epochs)

    # Creates the ALBERT core model outside the distribution strategy scope.
    _, core_model = albert_model.pretrain_model(albert_config, max_seq_length,
                                                max_predictions_per_seq)

    # Restores the core model from the training checkpoints and saves weights
    # that contain only the core model.
    checkpoint = tf.train.Checkpoint(model=core_model)
    latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
    assert latest_checkpoint_file
    logging.info('Checkpoint file %s found and restoring from '
                 'checkpoint', latest_checkpoint_file)
    status = checkpoint.restore(latest_checkpoint_file)
    status.assert_existing_objects_matched().expect_partial()
    core_model.save_weights(f"{model_dir}/tf2_model.h5")
    return trained_model
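
The tail of run_customized_training restores the core model from the latest object-based checkpoint and re-exports it as an HDF5 weights file. The same pattern in isolation, on a toy model; the paths and shapes are made up for the sketch.

import tensorflow as tf

model_dir = "/tmp/albert_pretrain_sketch"  # hypothetical directory
tf.io.gfile.makedirs(model_dir)

# Toy stand-in for the core model; write one object-based checkpoint.
core_model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
tf.train.Checkpoint(model=core_model).save(f"{model_dir}/ckpt")

# Later: rebuild the architecture, restore from the latest checkpoint,
# and save weights only.
restored = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
checkpoint = tf.train.Checkpoint(model=restored)
status = checkpoint.restore(tf.train.latest_checkpoint(model_dir))
status.assert_existing_objects_matched().expect_partial()
restored.save_weights(f"{model_dir}/tf2_model.h5")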
Example #5
def main(_):
    logging.set_verbosity(logging.INFO)

    if FLAGS.enable_xla:
        set_config_v2(FLAGS.enable_xla)

    processors = {
        "cola": classifier_data_lib.ColaProcessor,
        "sts": classifier_data_lib.StsbProcessor,
        "sst": classifier_data_lib.Sst2Processor,
        "mnli": classifier_data_lib.MnliProcessor,
        "qnli": classifier_data_lib.QnliProcessor,
        "qqp": classifier_data_lib.QqpProcessor,
        "rte": classifier_data_lib.RteProcessor,
        "mrpc": classifier_data_lib.MrpcProcessor,
        "wnli": classifier_data_lib.WnliProcessor,
        "xnli": classifier_data_lib.XnliProcessor,
    }
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    logging.info("processor is : ", FLAGS.task_name)

    strategy = None
    if FLAGS.strategy_type == "one":
        strategy = tf.distribute.OneDeviceStrategy("GPU:0")
    elif FLAGS.strategy_type == "mirror":
        strategy = tf.distribute.MirroredStrategy()
    else:
        raise ValueError(
            'The distribution strategy type is not supported: %s' %
            FLAGS.strategy_type)

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
        input_meta_data = json.loads(reader.read().decode('utf-8'))

        num_labels = input_meta_data["num_labels"]
        FLAGS.max_seq_length = input_meta_data["max_seq_length"]
        processor_type = input_meta_data['processor_type']

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    albert_config = AlbertConfig.from_json_file(FLAGS.albert_config_file)
    tinybert_config = TinybertConfig.from_json_file(FLAGS.tinybert_config_file)

    if FLAGS.max_seq_length > albert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the ALBERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, albert_config.max_position_embeddings))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    num_train_steps = None
    num_warmup_steps = None
    steps_per_epoch = None

    if FLAGS.do_train:
        len_train_examples = input_meta_data['train_data_size']
        steps_per_epoch = int(len_train_examples / FLAGS.train_batch_size)
        num_train_steps = int(steps_per_epoch * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    if FLAGS.do_eval:
        eval_input_fn = functools.partial(create_classifier_dataset,
                                          FLAGS.eval_data_path,
                                          seq_length=FLAGS.max_seq_length,
                                          batch_size=FLAGS.eval_batch_size,
                                          is_training=False,
                                          drop_remainder=False)
        len_eval_examples = input_meta_data['eval_data_size']
        eval_steps = int(len_eval_examples / FLAGS.eval_batch_size)

    loss_multiplier = 1.0 / strategy.num_replicas_in_sync

    model = None

    if FLAGS.do_train:
        logging.info("***** Running training *****")
        logging.info("  Num examples = %d", len_train_examples)
        logging.info("  Batch size = %d", FLAGS.train_batch_size)
        logging.info("  Num steps = %d", num_train_steps)

        # Bind four of the create function's arguments via functools.partial
        train_input_fn = functools.partial(create_classifier_dataset,
                                           FLAGS.train_data_path,
                                           seq_length=FLAGS.max_seq_length,
                                           batch_size=FLAGS.train_batch_size,
                                           drop_remainder=False)

        with strategy.scope():

            summary_dir = os.path.join(FLAGS.output_dir, 'summaries')
            summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
            checkpoint_path = os.path.join(FLAGS.output_dir, 'checkpoint')
            checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                checkpoint_path, save_weights_only=False)
            custom_callbacks = [summary_callback, checkpoint_callback]

            def metric_fn():
                if FLAGS.task_name.lower() == "sts":
                    return tf.keras.metrics.MeanSquaredError(dtype=tf.float32)
                else:
                    return tf.keras.metrics.SparseCategoricalAccuracy(
                        dtype=tf.float32)

            if FLAGS.custom_training_loop:
                if FLAGS.task_name.lower() == "sts":
                    loss_fn = get_loss_fn_v2(loss_factor=loss_multiplier)
                else:
                    loss_fn = get_loss_fn(num_labels,
                                          loss_factor=loss_multiplier)

                tinybert_config = TinybertConfig.from_json_file(
                    FLAGS.tinybert_config_file)

                train_model, albert, tinybert = tinybert_model.get_fine_tune_model(
                    tinybert_config, albert_config, FLAGS.max_seq_length)
                albert.summary()
                tinybert.summary()
                train_model.summary()

                model = train_model
                if FLAGS.optimizer == "LAMB":
                    optimizer_fn = LAMB
                else:
                    optimizer_fn = AdamWeightDecay

                learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
                    initial_learning_rate=FLAGS.learning_rate,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0)
                if num_warmup_steps:
                    learning_rate_fn = WarmUp(
                        initial_learning_rate=FLAGS.learning_rate,
                        decay_schedule_fn=learning_rate_fn,
                        warmup_steps=num_warmup_steps)

                optimizer = optimizer_fn(
                    learning_rate=learning_rate_fn,
                    weight_decay_rate=FLAGS.weight_decay,
                    beta_1=0.9,
                    beta_2=0.999,
                    epsilon=FLAGS.adam_epsilon,
                    exclude_from_weight_decay=['layer_norm', 'bias'])

                train_model.optimizer = optimizer

                run_customized_training_loop(
                    strategy=strategy,
                    models=[albert, tinybert, train_model],
                    model=train_model,
                    albert=albert,
                    tinybert=tinybert,
                    loss_fn=get_loss_fn_v3(loss_factor=1.0 /
                                           strategy.num_replicas_in_sync),
                    model_dir=FLAGS.output_dir,
                    train_input_fn=train_input_fn,
                    steps_per_epoch=steps_per_epoch,
                    epochs=FLAGS.num_train_epochs,
                    metric_fn=metric_fn,
                    custom_callbacks=custom_callbacks)
            else:
                model = get_model(albert_config=albert_config,
                                  max_seq_length=FLAGS.max_seq_length,
                                  num_labels=num_labels,
                                  init_checkpoint=FLAGS.init_checkpoint,
                                  learning_rate=FLAGS.learning_rate,
                                  num_train_steps=num_train_steps,
                                  num_warmup_steps=num_warmup_steps,
                                  loss_multiplier=loss_multiplier)
                model.summary()
                training_dataset = train_input_fn()
                evaluation_dataset = eval_input_fn()
                model.fit(x=training_dataset,
                          validation_data=evaluation_dataset,
                          epochs=FLAGS.num_train_epochs,
                          callbacks=custom_callbacks)

    if FLAGS.do_eval:
        if not model:
            raise ValueError("model not init")
        len_eval_examples = input_meta_data['eval_data_size']

        logging.info("***** Running evaluation *****")
        logging.info("  Num examples = %d", len_eval_examples)
        logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        evaluation_dataset = eval_input_fn()

        loss, accuracy = model.evaluate(evaluation_dataset)

        print(f"loss : {loss} , Accuracy : {accuracy}")

    if FLAGS.do_predict:

        logging.info("***** Running prediction*****")
        flags.mark_flag_as_required("input_data_dir")
        flags.mark_flag_as_required("predict_data_path")
        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file)

        predict_examples = processor.get_test_examples(FLAGS.input_data_dir)

        label_list = processor.get_labels()
        label_map = {i: label for i, label in enumerate(label_list)}

        classifier_data_lib.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            FLAGS.predict_data_path)

        predict_input_fn = functools.partial(create_classifier_dataset,
                                             FLAGS.predict_data_path,
                                             seq_length=FLAGS.max_seq_length,
                                             batch_size=FLAGS.eval_batch_size,
                                             is_training=False,
                                             drop_remainder=False)
        prediction_dataset = predict_input_fn()

        with strategy.scope():
            logits = model.predict(prediction_dataset)
            if FLAGS.task_name.lower() == "sts":
                predictions = logits
                probabilities = logits
            else:
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                probabilities = tf.nn.softmax(logits, axis=-1)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        output_submit_file = os.path.join(FLAGS.output_dir,
                                          "submit_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as pred_writer,\
            tf.io.gfile.GFile(output_submit_file, "w") as sub_writer:
            logging.info("***** Predict results *****")
            for (example, probability,
                 prediction) in zip(predict_examples, probabilities,
                                    predictions):
                output_line = "\t".join(
                    str(class_probability.numpy())
                    for class_probability in probability) + "\n"
                pred_writer.write(output_line)

                actual_label = label_map[int(prediction)]
                sub_writer.write(
                    six.ensure_str(example.guid) + "\t" + actual_label + "\n")
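
For classification tasks, the prediction branch converts raw logits to class ids with argmax and to per-class probabilities with softmax. The post-processing in isolation, with made-up logits:

import tensorflow as tf

# Fake logits for a batch of 2 examples over 3 classes.
logits = tf.constant([[2.0, 0.5, -1.0],
                      [0.1, 0.2, 3.0]])

predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)  # [0, 2]
probabilities = tf.nn.softmax(logits, axis=-1)                  # each row sums to 1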
Example #6
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help=
        "Bert pre-trained model selected in the list: bert-base-cased,bert-large-cased"
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    # Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev/test set.")
    parser.add_argument("--eval_on",
                        default="dev",
                        type=str,
                        help="Evaluation set, dev: Development, test: Test")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    # training strategy arguments
    parser.add_argument(
        "--multi_gpu",
        action='store_true',
        help="Set this flag to enable multi-gpu training using MirroredStrategy."
        "Single gpu training")
    parser.add_argument(
        "--gpus",
        default='0',
        type=str,
        help="Comma separated list of gpus devices."
        "For Single gpu pass the gpu id.Default '0' GPU"
        "For Multi gpu,if gpus not specified all the available gpus will be used"
    )

    args = parser.parse_args()

    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.do_train:
        tokenizer = FullTokenizer(os.path.join(args.bert_model, "vocab.txt"),
                                  args.do_lower_case)

    if args.multi_gpu:
        if len(args.gpus.split(',')) == 1:
            strategy = tf.distribute.MirroredStrategy()
        else:
            gpus = [f"/gpu:{gpu}" for gpu in args.gpus.split(',')]
            strategy = tf.distribute.MirroredStrategy(devices=gpus)
    else:
        gpu = args.gpus.split(',')[0]
        strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{gpu}")

    train_examples = None
    optimizer = None
    num_train_optimization_steps = 0
    ner = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) /
            args.train_batch_size) * args.num_train_epochs
        warmup_steps = int(args.warmup_proportion *
                           num_train_optimization_steps)
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=args.learning_rate,
            decay_steps=num_train_optimization_steps,
            end_learning_rate=0.0)
        if warmup_steps:
            learning_rate_fn = WarmUp(initial_learning_rate=args.learning_rate,
                                      decay_schedule_fn=learning_rate_fn,
                                      warmup_steps=warmup_steps)
        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=args.weight_decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=args.adam_epsilon,
            exclude_from_weight_decay=['layer_norm', 'bias'])

        with strategy.scope():
            ner = BertNer(args.bert_model, tf.float32, num_labels,
                          args.max_seq_length)
            loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
                reduction=tf.keras.losses.Reduction.NONE)

    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_ids for f in train_features], dtype=np.int32))
        all_input_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_mask for f in train_features], dtype=np.int32))
        all_segment_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.segment_ids for f in train_features],
                       dtype=np.int32))
        all_valid_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.valid_ids for f in train_features], dtype=np.int32))
        all_label_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_mask for f in train_features]))

        all_label_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_id for f in train_features], dtype=np.int32))

        # Dataset using tf.data
        train_data = tf.data.Dataset.zip(
            (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids,
             all_label_ids, all_label_mask))
        shuffled_train_data = train_data.shuffle(buffer_size=int(
            len(train_features) * 0.1),
                                                 seed=args.seed,
                                                 reshuffle_each_iteration=True)
        batched_train_data = shuffled_train_data.batch(args.train_batch_size)
        # Distributed dataset
        dist_dataset = strategy.experimental_distribute_dataset(
            batched_train_data)

        loss_metric = tf.keras.metrics.Mean()

        epoch_bar = master_bar(range(args.num_train_epochs))
        pb_max_len = math.ceil(
            float(len(train_features)) / float(args.train_batch_size))

        def train_step(input_ids, input_mask, segment_ids, valid_ids,
                       label_ids, label_mask):
            def step_fn(input_ids, input_mask, segment_ids, valid_ids,
                        label_ids, label_mask):

                with tf.GradientTape() as tape:
                    logits = ner(input_ids,
                                 input_mask,
                                 segment_ids,
                                 valid_ids,
                                 training=True)
                    label_mask = tf.reshape(label_mask, (-1, ))
                    logits = tf.reshape(logits, (-1, num_labels))
                    logits_masked = tf.boolean_mask(logits, label_mask)
                    label_ids = tf.reshape(label_ids, (-1, ))
                    label_ids_masked = tf.boolean_mask(label_ids, label_mask)
                    cross_entropy = loss_fct(label_ids_masked, logits_masked)
                    loss = tf.reduce_sum(cross_entropy) * (
                        1.0 / args.train_batch_size)
                grads = tape.gradient(loss, ner.trainable_variables)
                optimizer.apply_gradients(
                    list(zip(grads, ner.trainable_variables)))
                return cross_entropy

            per_example_losses = strategy.experimental_run_v2(
                step_fn,
                args=(input_ids, input_mask, segment_ids, valid_ids, label_ids,
                      label_mask))
            mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                        per_example_losses,
                                        axis=0)
            return mean_loss

        for epoch in epoch_bar:
            with strategy.scope():
                for (input_ids, input_mask, segment_ids, valid_ids, label_ids,
                     label_mask) in progress_bar(dist_dataset,
                                                 total=pb_max_len,
                                                 parent=epoch_bar):
                    loss = train_step(input_ids, input_mask, segment_ids,
                                      valid_ids, label_ids, label_mask)
                    loss_metric(loss)
                    epoch_bar.child.comment = f'loss : {loss_metric.result()}'
            loss_metric.reset_states()

        # model weight save
        ner.save_weights(os.path.join(args.output_dir, "model.h5"))
        # copy vocab to output_dir
        shutil.copyfile(os.path.join(args.bert_model, "vocab.txt"),
                        os.path.join(args.output_dir, "vocab.txt"))
        # copy bert config to output_dir
        shutil.copyfile(os.path.join(args.bert_model, "bert_config.json"),
                        os.path.join(args.output_dir, "bert_config.json"))
        # save label_map and max_seq_length of trained model
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": num_labels,
            "label_map": label_map
        }
        json.dump(model_config,
                  open(os.path.join(args.output_dir, "model_config.json"),
                       "w"),
                  indent=4)

    if args.do_eval:
        # load tokenizer
        tokenizer = FullTokenizer(os.path.join(args.output_dir, "vocab.txt"),
                                  args.do_lower_case)
        # model build hack : fix
        config = json.load(
            open(os.path.join(args.output_dir, "bert_config.json")))
        ner = BertNer(config, tf.float32, num_labels, args.max_seq_length)
        ids = tf.ones((1, 128), dtype=tf.int32)
        _ = ner(ids, ids, ids, ids, training=False)
        ner.load_weights(os.path.join(args.output_dir, "model.h5"))

        # load the test or development set based on args.eval_on
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)

        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evalution *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_ids for f in eval_features], dtype=np.int32))
        all_input_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_mask for f in eval_features], dtype=np.int32))
        all_segment_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.segment_ids for f in eval_features], dtype=np.int32))
        all_valid_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.valid_ids for f in eval_features], dtype=np.int32))

        all_label_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_id for f in eval_features], dtype=np.int32))

        eval_data = tf.data.Dataset.zip(
            (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids,
             all_label_ids))
        batched_eval_data = eval_data.batch(args.eval_batch_size)

        loss_metric = tf.keras.metrics.Mean()
        epoch_bar = master_bar(range(1))
        pb_max_len = math.ceil(
            float(len(eval_features)) / float(args.eval_batch_size))

        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for epoch in epoch_bar:
            for (input_ids, input_mask, segment_ids, valid_ids,
                 label_ids) in progress_bar(batched_eval_data,
                                            total=pb_max_len,
                                            parent=epoch_bar):
                logits = ner(input_ids,
                             input_mask,
                             segment_ids,
                             valid_ids,
                             training=False)
                logits = tf.argmax(logits, axis=2)
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, m in enumerate(label):
                        if j == 0:
                            continue
                        elif label_ids[i][j].numpy() == len(label_map):
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map[label_ids[i][j].numpy()])
                            temp_2.append(label_map[logits[i][j].numpy()])
        report = classification_report(y_true, y_pred, digits=4)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
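
The train_step above wraps a per-replica step_fn in strategy.experimental_run_v2, which newer TensorFlow releases expose as strategy.run. A self-contained sketch of the same pattern on a toy model; the data, model, and loss scaling are illustrative assumptions.

import tensorflow as tf

strategy = tf.distribute.get_strategy()  # default (single-device) strategy for the sketch

with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
    optimizer = tf.keras.optimizers.Adam(1e-3)
    loss_fct = tf.keras.losses.MeanSquaredError(
        reduction=tf.keras.losses.Reduction.NONE)

global_batch_size = 8
features = tf.random.normal((global_batch_size, 4))
labels = tf.random.normal((global_batch_size, 1))

def step_fn(x, y):
    with tf.GradientTape() as tape:
        per_example_loss = loss_fct(y, model(x, training=True))
        # Scale by the global batch size so gradients are averaged correctly
        # across replicas.
        loss = tf.reduce_sum(per_example_loss) / global_batch_size
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return per_example_loss

per_example_losses = strategy.run(step_fn, args=(features, labels))
mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)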
Example #7
def run_customized_training(strategy, albert_config, tinybert_config,
                            max_seq_length, max_predictions_per_seq, model_dir,
                            steps_per_epoch, steps_per_loop, epochs,
                            initial_lr, warmup_steps, input_files,
                            train_batch_size, use_mlm_loss):
    """Run BERT pretrain model training using low-level API."""

    train_input_fn = functools.partial(get_pretrain_input_data, input_files,
                                       max_seq_length, max_predictions_per_seq,
                                       train_batch_size, strategy)

    with strategy.scope():
        # albert, albert_encoder = albert_model.pretrain_model(
        #     albert_config, max_seq_length, max_predictions_per_seq)
        train_model, albert, tinybert = tinybert_model.train_tinybert_model(
            tinybert_config, albert_config, max_seq_length,
            max_predictions_per_seq)
        albert.summary()
        tinybert.summary()
        train_model.summary()

        if FLAGS.init_checkpoint:
            logging.info(
                f"model pre-trained weights loaded from {FLAGS.init_checkpoint}"
            )
            train_model.load_weights(FLAGS.init_checkpoint)

        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=initial_lr,
            decay_steps=int(steps_per_epoch * epochs),
            end_learning_rate=0.0)

        if warmup_steps:
            learning_rate_fn = WarmUp(initial_learning_rate=initial_lr,
                                      decay_schedule_fn=learning_rate_fn,
                                      warmup_steps=warmup_steps)
        if FLAGS.optimizer == "lamp":
            optimizer_fn = LAMB
        else:
            optimizer_fn = AdamWeightDecay

        optimizer = optimizer_fn(
            learning_rate=learning_rate_fn,
            weight_decay_rate=FLAGS.weight_decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=FLAGS.adam_epsilon,
            exclude_from_weight_decay=['layer_norm', 'bias'])
        train_model.optimizer = optimizer
    # Note: model_dir here is shared by albert and tinybert and needs to be changed
    if FLAGS.do_train:
        trained_model = run_customized_training_loop(
            strategy=strategy,
            models=[albert, tinybert, train_model],
            model=train_model,
            albert=albert,
            tinybert=tinybert,
            start_wtih_trained_model=FLAGS.start_with_train_model,
            loss_fn=get_loss_fn(loss_factor=1.0 /
                                strategy.num_replicas_in_sync),
            model_dir=model_dir,
            train_input_fn=train_input_fn,
            steps_per_epoch=steps_per_epoch,
            steps_per_loop=steps_per_loop,
            epochs=epochs,
        )
    # Recreates the training model (albert teacher + tinybert student) outside
    # the distribution strategy scope.
    training, albert, tinybert = tinybert_model.train_tinybert_model(
        tinybert_config, albert_config, max_seq_length,
        max_predictions_per_seq)

    # Restores the core model from the training checkpoints and saves weights
    # that contain only the core model.
    # During training the model is saved as ckpt files; after training it is
    # read back from the ckpt and stored as h5 files.
    # Locate the albert model files.
    checkpoint_model = tf.train.Checkpoint(model=training)
    latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
    assert latest_checkpoint_file
    logging.info('Checkpoint file %s found and restoring from '
                 'checkpoint', latest_checkpoint_file)
    status = checkpoint_model.restore(latest_checkpoint_file)
    status.assert_existing_objects_matched().expect_partial()
    # Locate the tinybert model files.
    # checkpoint_tinybert = tf.train.Checkpoint(model=tinybert)
    # latest_tinybert_checkpoint_file = tf.train.latest_checkpoint(tinybert_model_dir)
    # assert latest_tinybert_checkpoint_file
    # logging.info('Checkpoint_Tinybert file %s found and restoring from '
    #              'checkpoint', latest_tinybert_checkpoint_file)
    # status_tinybert = checkpoint_albert.restore(latest_tinybert_checkpoint_file)
    # status_tinybert.assert_existing_objects_matched().expect_partial()
    # Create the directory for the saved weight files.
    if not os.path.exists(model_dir + '/models/'):
        os.makedirs(model_dir + '/models/')
    albert.save_weights(f"{model_dir}/models/albert_model.h5")
    tinybert.save_weights(f"{model_dir}/models/tinybert_model.h5")
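
The distillation losses (get_loss_fn, get_loss_fn_v3) used with these TinyBERT models are not shown in the examples. Purely as an assumption about their shape, a common formulation matches the student's hidden states to the teacher's with mean squared error and its logits to softened teacher logits with cross-entropy:

import tensorflow as tf

def distillation_loss_sketch(teacher_hidden, student_hidden,
                             teacher_logits, student_logits,
                             temperature=1.0):
    """Illustrative TinyBERT-style loss: hidden-state MSE + soft-label cross-entropy."""
    hidden_loss = tf.reduce_mean(tf.square(teacher_hidden - student_hidden))
    soft_targets = tf.nn.softmax(teacher_logits / temperature, axis=-1)
    soft_ce = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=soft_targets, logits=student_logits / temperature))
    return hidden_loss + soft_ce

# Toy tensors with matching shapes (batch=2, seq=4, hidden=8; 3 classes).
t_h = tf.random.normal((2, 4, 8))
s_h = tf.random.normal((2, 4, 8))
t_l = tf.random.normal((2, 3))
s_l = tf.random.normal((2, 3))
loss = distillation_loss_sketch(t_h, s_h, t_l, s_l, temperature=2.0)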