Example #1
    def test_bigbird_base_uncased(self):
        model = BigBirdModel.from_pretrained(
            'bigbird-base-uncased', attn_dropout=0.0, hidden_dropout_prob=0.0)
        self.config = copy.deepcopy(model.config)
        self.config['seq_len'] = 512
        self.config['batch_size'] = 3

        rand_mask_idx_list, input_ids, _ = create_input_data(self.config, 102)
        input_ids = paddle.to_tensor(input_ids)
        rand_mask_idx_list = paddle.to_tensor(rand_mask_idx_list)
        output = model(input_ids, rand_mask_idx_list=rand_mask_idx_list)

        # The model returns (sequence_output, pooled_output); check their shapes.
        expected_seq_shape = (self.config['batch_size'], self.config['seq_len'],
                              self.config['hidden_size'])
        expected_pooled_shape = (self.config['batch_size'],
                                 self.config['hidden_size'])
        self.check_output_equal(output[0].numpy().shape, expected_seq_shape)
        self.check_output_equal(output[1].numpy().shape, expected_pooled_shape)

        expected_seq_slice = np.array([[0.06685783, 0.01576832, -0.14448889],
                                       [0.16531630, 0.00974050, -0.15113291],
                                       [0.08514148, -0.01252885, -0.12458798]])
        # There is an output difference of about 1e-4 between CPU and GPU.
        self.check_output_equal(
            output[0].numpy()[0, 0:3, 0:3], expected_seq_slice, atol=1e-4)

        expected_pooled_slice = np.array(
            [[0.78695089, 0.87273526, -0.88046724],
             [0.66016346, 0.74889791, -0.76608104],
             [0.15944470, 0.25242448, -0.34336662]])
        self.check_output_equal(
            output[1].numpy()[0:3, 0:3], expected_pooled_slice, atol=1e-4)
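The test above relies on two helpers that the snippet does not show: create_input_data, which builds random token ids together with the per-layer random-attention indices BigBird expects, and check_output_equal, a tolerance-aware assertion on the base test class. A minimal sketch of what they could look like follows; the signatures and config keys are assumptions, written against paddlenlp.transformers.create_bigbird_rand_mask_idx_list.

import unittest

import numpy as np
from paddlenlp.transformers import create_bigbird_rand_mask_idx_list


class CommonTest(unittest.TestCase):
    # Sketch of the assumed base class providing check_output_equal.
    def check_output_equal(self, out, expected, rtol=1e-5, atol=1e-8):
        # Shapes and value slices are both compared as arrays with a tolerance.
        self.assertTrue(
            np.allclose(np.array(out), np.array(expected), rtol=rtol, atol=atol))


def create_input_data(config, seed=None):
    # Random token ids plus the random-attention index list for every layer;
    # the third return value is unused by the tests above.
    if seed is not None:
        np.random.seed(seed)
    rand_mask_idx_list = create_bigbird_rand_mask_idx_list(
        config['num_layers'], config['seq_len'], config['seq_len'],
        config['nhead'], config['block_size'], config['window_size'],
        config['num_global_blocks'], config['num_rand_blocks'], seed)
    input_ids = np.random.randint(
        low=0, high=config['vocab_size'],
        size=(config['batch_size'], config['seq_len']))
    return rand_mask_idx_list, input_ids, None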
Example #2
    def test_forward(self):
        bigbird = BigBirdModel(**self.config)
        model = self.TEST_MODEL_CLASS(bigbird)
        input_ids = paddle.to_tensor(self.input_ids)
        rand_mask_idx_list = paddle.to_tensor(self.rand_mask_idx_list)
        output = model(input_ids, rand_mask_idx_list=rand_mask_idx_list)
        self.check_output_equal(self.expected_shape, output.numpy().shape)
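Examples #2–#4 assume a fixture prepared in setUp: a model config, random inputs, and the expected output shapes. Reusing the CommonTest and create_input_data sketches above, a possible setUp for a classification head looks like this; BigBirdForSequenceClassification as TEST_MODEL_CLASS and the concrete sizes are assumptions.

import copy

from paddlenlp.transformers import BigBirdModel, BigBirdForSequenceClassification


class TestBigBirdForSequenceClassification(CommonTest):
    TEST_MODEL_CLASS = BigBirdForSequenceClassification

    def setUp(self):
        # Start from the registered 'bigbird-base-uncased' hyperparameters,
        # shrunk so that BigBirdModel(**self.config) stays cheap to build.
        self.config = copy.deepcopy(
            BigBirdModel.pretrained_init_configuration['bigbird-base-uncased'])
        self.config['num_layers'] = 2
        batch_size, seq_len = 2, 512
        data_config = dict(self.config, batch_size=batch_size, seq_len=seq_len)
        self.rand_mask_idx_list, self.input_ids, _ = create_input_data(
            data_config, seed=102)
        # The classification head emits one logit per class for each example.
        self.expected_shape = (batch_size, 2)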
Example #3
    def test_forward(self):
        bigbird = BigBirdModel(**self.config)
        model = self.TEST_MODEL_CLASS(bigbird)
        input_ids = paddle.to_tensor(self.input_ids)
        rand_mask_idx_list = paddle.to_tensor(self.rand_mask_idx_list)
        masked_positions = paddle.to_tensor(self.masked_lm_positions)
        output = model(input_ids,
                       rand_mask_idx_list=rand_mask_idx_list,
                       masked_positions=masked_positions)
        self.check_output_equal(output[0].numpy().shape,
                                self.expected_pred_shape)
        self.check_output_equal(output[1].numpy().shape,
                                self.expected_seq_shape)
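Example #3 additionally feeds masked token positions. PaddleNLP masked-LM heads gather masked tokens via flattened indices (batch_index * seq_len + position), so the fixture could build self.masked_lm_positions roughly as below; the helper name, the flattening convention, and the shape comments are illustrative assumptions.

def create_masked_lm_positions(batch_size, seq_len, masked_per_example, seed=None):
    # Flattened indices into the (batch_size * seq_len) token axis.
    if seed is not None:
        np.random.seed(seed)
    positions = []
    for b in range(batch_size):
        chosen = np.sort(
            np.random.choice(seq_len, masked_per_example, replace=False))
        positions.extend(b * seq_len + chosen)
    return np.array(positions, dtype='int64')


# Under these assumptions the two outputs checked above would have shapes:
#   expected_pred_shape = (batch_size * masked_per_example, vocab_size)
#   expected_seq_shape  = (batch_size, seq_len, hidden_size)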
Example #4
    def test_forward(self):
        bigbird = BigBirdModel(**self.config)
        model = self.TEST_MODEL_CLASS(bigbird)
        input_ids = paddle.to_tensor(self.input_ids)
        rand_mask_idx_list = paddle.to_tensor(self.rand_mask_idx_list)
        labels = paddle.to_tensor(self.labels)
        masked_lm_loss, prediction_scores, sequence_output = model(
            input_ids, rand_mask_idx_list=rand_mask_idx_list, labels=labels)
        self.check_output_equal(self.expected_shape1,
                                masked_lm_loss.numpy().shape)
        self.check_output_equal(self.expected_shape2,
                                prediction_scores.numpy().shape)
        self.check_output_equal(self.expected_shape3,
                                sequence_output.numpy().shape)
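Example #4 passes labels, so the model also returns a loss term. A hedged guess at the fixture behind it, assuming the prediction scores cover all positions when no masked_positions are given and that the loss reduces to a single element; the class name and omitted TEST_MODEL_CLASS binding are placeholders.

class TestBigBirdWithLMLoss(CommonTest):

    def setUp(self):
        self.config = copy.deepcopy(
            BigBirdModel.pretrained_init_configuration['bigbird-base-uncased'])
        self.config['num_layers'] = 2
        batch_size, seq_len = 2, 512
        data_config = dict(self.config, batch_size=batch_size, seq_len=seq_len)
        self.rand_mask_idx_list, self.input_ids, _ = create_input_data(
            data_config, seed=102)
        # Whole-sequence labels for the masked-LM loss (illustrative values).
        self.labels = np.random.randint(
            0, self.config['vocab_size'],
            size=(batch_size, seq_len)).astype('int64')
        self.expected_shape1 = (1,)  # reduced masked-LM loss
        self.expected_shape2 = (batch_size, seq_len, self.config['vocab_size'])
        self.expected_shape3 = (batch_size, seq_len, self.config['hidden_size'])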
Example #5
def do_train(args):
    # Initialization for the parallel environment
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()

    # Set the random seed for the training process
    set_seed(args)
    worker_init = WorkerInitObj(args.seed + worker_index)

    # Get the model class and tokenizer class
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define the pretraining model and the loss criterion
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models_list:
        model = BigBirdForPretraining(
            BigBirdModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path]))
    else:
        model = BigBirdForPretraining.from_pretrained(args.model_name_or_path)
    # Get the BigBird config used to generate the random attention mask indices
    config = getattr(model, BigBirdForPretraining.base_model_prefix).config
    criterion = BigBirdPretrainingCriterion(config["vocab_size"], args.use_nsp)
    if worker_num > 1:
        model = paddle.DataParallel(model)

    # Define the learning_rate scheduler and optimizer
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, args.max_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.epochs):
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
        ]
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            train_data_loader = create_dataloader(
                files[f_id], tokenizer, worker_init, args.batch_size,
                args.max_encoder_length, args.max_pred_length, config)
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                (input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
                 masked_lm_weights, next_sentence_labels,
                 masked_lm_scale) = batch[:7]
                # The remaining batch entries are the per-layer random-attention indices.
                rand_mask_idx_list = batch[7:]

                prediction_scores, seq_relationship_score = model(
                    input_ids=input_ids,
                    token_type_ids=segment_ids,
                    rand_mask_idx_list=rand_mask_idx_list,
                    masked_positions=masked_lm_positions)
                loss = criterion(prediction_scores, seq_relationship_score,
                                 masked_lm_ids, next_sentence_labels,
                                 masked_lm_scale, masked_lm_weights)
                if global_step % args.logging_steps == 0 and worker_index == 0:
                    logger.info(
                        "global step %d, epoch: %d, lr: %.10f, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, optimizer.get_lr(), loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need a better way to get the inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(output_dir, "model_state.pdopt"))
                if global_step >= args.max_steps:
                    del train_data_loader
                    return
            del train_data_loader
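do_train leans on a few helpers defined elsewhere in the script, notably MODEL_CLASSES, set_seed, and WorkerInitObj (create_dataloader is omitted here). The sketches below show one plausible shape for them, in the style of PaddleNLP pretraining scripts; treat the exact bodies as assumptions.

import random

import numpy as np
import paddle
from paddlenlp.transformers import BigBirdForPretraining, BigBirdTokenizer

# Maps the lower-cased --model_type flag to (model class, tokenizer class).
MODEL_CLASSES = {"bigbird": (BigBirdForPretraining, BigBirdTokenizer)}


def set_seed(args):
    # Offset the seed by the rank so each data-parallel worker differs.
    seed = args.seed + paddle.distributed.get_rank()
    random.seed(seed)
    np.random.seed(seed)
    paddle.seed(seed)


class WorkerInitObj(object):
    # Deterministically seeds every DataLoader worker process.
    def __init__(self, seed):
        self.seed = seed

    def __call__(self, worker_id):
        np.random.seed(self.seed + worker_id)
        random.seed(self.seed + worker_id)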