def do_train(args):
    """Fine-tune ErnieDocForQuestionAnswering on an extractive QA dataset.

    Sets up (optionally distributed) training, builds MRC iterators over the
    train/dev/test splits, trains with AdamWDL (layer-wise LR decay),
    periodically evaluates and checkpoints, tracks the best checkpoint by the
    averaged EM/F1 metric, and runs a final test pass.

    Args:
        args: parsed CLI namespace; fields read here include dataset,
            model_name_or_path, device, batch_size, max_seq_length, seed,
            dropout, epochs, learning_rate, warmup_proportion, weight_decay,
            layerwise_decay, memory_length, logging_steps, save_steps,
            max_steps and output_dir.
    """
    set_seed(args)
    DEV, TEST, TOKENIZER_CLASS = DATASET_INFO[args.dataset]
    tokenizer = TOKENIZER_CLASS.from_pretrained(args.model_name_or_path)
    train_ds, eval_ds, test_ds = load_dataset(args.dataset,
                                              splits=['train', DEV, TEST])

    paddle.set_device(args.device)
    trainer_num = paddle.distributed.get_world_size()
    if trainer_num > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            logger.info("init checkpoint from %s" % args.model_name_or_path)

    model = ErnieDocForQuestionAnswering.from_pretrained(
        args.model_name_or_path, dropout=args.dropout)
    model_config = model.ernie_doc.config
    if trainer_num > 1:
        model = paddle.DataParallel(model)

    # Each trainer consumes its own data shard (trainer_id=rank).
    train_ds_iter = MRCIterator(train_ds,
                                args.batch_size,
                                tokenizer,
                                trainer_num,
                                trainer_id=rank,
                                memory_len=model_config["memory_len"],
                                max_seq_length=args.max_seq_length,
                                random_seed=args.seed)
    eval_ds_iter = MRCIterator(eval_ds,
                               args.batch_size,
                               tokenizer,
                               trainer_num,
                               trainer_id=rank,
                               memory_len=model_config["memory_len"],
                               max_seq_length=args.max_seq_length,
                               mode="eval",
                               random_seed=args.seed)
    test_ds_iter = MRCIterator(test_ds,
                               args.batch_size,
                               tokenizer,
                               trainer_num,
                               trainer_id=rank,
                               memory_len=model_config["memory_len"],
                               max_seq_length=args.max_seq_length,
                               mode="test",
                               random_seed=args.seed)

    train_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                           return_list=True)
    train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device())
    eval_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    eval_dataloader.set_batch_generator(eval_ds_iter, paddle.get_device())
    test_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    test_dataloader.set_batch_generator(test_ds_iter, paddle.get_device())

    num_training_examples = train_ds_iter.get_num_examples()
    num_training_steps = args.epochs * num_training_examples \
        // args.batch_size // trainer_num
    logger.info("Device count: %d, trainer_id: %d" % (trainer_num, rank))
    logger.info("Num train examples: %d" % num_training_examples)
    logger.info("Max train steps: %d" % num_training_steps)
    logger.info("Num warmup steps: %d" %
                int(num_training_steps * args.warmup_proportion))

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Map parameter name -> attribute path; AdamWDL uses this to infer each
    # parameter's layer for layer-wise learning-rate decay.
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n

    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        n_layers=model_config["num_hidden_layers"],
                        layerwise_decay=args.layerwise_decay,
                        name_dict=name_dict)

    global_steps = 0
    # Factory for the recurrence memory carried across consecutive segments
    # of a long document (ERNIE-Doc style).
    create_memory = partial(init_memory, args.batch_size, args.memory_length,
                            model_config["hidden_size"],
                            model_config["num_hidden_layers"])
    criterion = CrossEntropyLossForQA()
    memories = create_memory()
    tic_train = time.time()
    best_avg_metric = -1
    for epoch in range(args.epochs):
        train_ds_iter.shuffle_sample()
        train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device())
        for step, batch in enumerate(train_dataloader, start=1):
            global_steps += 1
            input_ids, position_ids, token_type_ids, attn_mask, start_position, \
                end_position, qids, gather_idx, need_cal_loss = batch
            start_logits, end_logits, memories = model(input_ids, memories,
                                                       token_type_ids,
                                                       position_ids, attn_mask)
            # Keep only positions belonging to complete examples.
            start_logits, end_logits, qids, start_position, end_position = list(
                map(lambda x: paddle.gather(x, gather_idx), [
                    start_logits, end_logits, qids, start_position, end_position
                ]))
            # need_cal_loss masks out batches whose loss should not count.
            loss = criterion([start_logits, end_logits],
                             [start_position, end_position]) * need_cal_loss
            mean_loss = loss.mean()
            mean_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_steps % args.logging_steps == 0:
                logger.info(
                    "train: global step %d, epoch: %d, loss: %f, lr: %f, speed: %.2f step/s"
                    % (global_steps, epoch, mean_loss, lr_scheduler.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_steps % args.save_steps == 0:
                # Evaluate
                logger.info("Eval:")
                EM, F1, AVG = evaluate(args, model, criterion, EM_AND_F1(),
                                       eval_dataloader, create_memory(),
                                       tokenizer)
                if rank == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % (global_steps))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    if best_avg_metric < AVG:
                        # BUG FIX: record the new best score. Previously
                        # best_avg_metric stayed at -1 forever, so every
                        # evaluation overwrote "best_model" even when the
                        # metric regressed.
                        best_avg_metric = AVG
                        output_dir = os.path.join(args.output_dir, "best_model")
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
            if args.max_steps > 0 and global_steps >= args.max_steps:
                return
    logger.info("Test:")
    evaluate(args, model, criterion, EM_AND_F1(), test_dataloader,
             create_memory(), tokenizer)
    if rank == 0:
        output_dir = os.path.join(args.output_dir, "model_%d" % (global_steps))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = model._layers if isinstance(
            model, paddle.DataParallel) else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
def do_train(args):
    """Fine-tune ErnieDocForTokenClassification on a sequence-labeling task.

    Builds sharded SequenceLabeling iterators over the train/dev/test splits,
    trains with AdamWDL (layer-wise LR decay), periodically evaluates with a
    ChunkEvaluator, checkpoints every ``save_steps`` (keeping the best-F1
    model separately), and runs a final test pass.

    Args:
        args: parsed CLI namespace; fields read here include dataset,
            model_name_or_path, device, batch_size, max_seq_length, seed,
            epochs, learning_rate, warmup_proportion, weight_decay,
            layerwise_decay, memory_length, logging_steps, save_steps,
            max_steps and output_dir.
    """
    set_seed(args)
    tokenizer_class, eval_name, test_name, = DATASET_INFO[args.dataset]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    train_ds, eval_ds, test_ds = load_dataset(
        args.dataset, splits=["train", eval_name, test_name])

    num_classes = len(train_ds.label_list)
    # Last label id is treated as the non-entity id used to pad label
    # sequences — presumably the "O" tag; verify against the dataset's
    # label_list ordering.
    no_entity_id = num_classes - 1

    paddle.set_device(args.device)
    trainer_num = paddle.distributed.get_world_size()
    if trainer_num > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            logger.info("init checkpoint from %s" % args.model_name_or_path)

    model = ErnieDocForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    model_config = model.ernie_doc.config
    if trainer_num > 1:
        model = paddle.DataParallel(model)

    # Each trainer consumes its own data shard (trainer_id=rank).
    train_ds_iter = SequenceLabelingIterator(
        train_ds,
        args.batch_size,
        tokenizer,
        trainer_num,
        trainer_id=rank,
        memory_len=model_config["memory_len"],
        max_seq_length=args.max_seq_length,
        random_seed=args.seed,
        no_entity_id=no_entity_id)
    eval_ds_iter = SequenceLabelingIterator(
        eval_ds,
        args.batch_size,
        tokenizer,
        trainer_num,
        trainer_id=rank,
        memory_len=model_config["memory_len"],
        max_seq_length=args.max_seq_length,
        mode="eval",
        no_entity_id=no_entity_id)
    test_ds_iter = SequenceLabelingIterator(
        test_ds,
        args.batch_size,
        tokenizer,
        trainer_num,
        trainer_id=rank,
        memory_len=model_config["memory_len"],
        max_seq_length=args.max_seq_length,
        mode="test",
        no_entity_id=no_entity_id)

    train_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                           return_list=True)
    train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device())
    eval_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    eval_dataloader.set_batch_generator(eval_ds_iter, paddle.get_device())
    test_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    test_dataloader.set_batch_generator(test_ds_iter, paddle.get_device())

    num_training_examples = train_ds_iter.get_num_examples()
    num_training_steps = args.epochs * num_training_examples \
        // args.batch_size // trainer_num
    logger.info("Device count: %d, trainer_id: %d" % (trainer_num, rank))
    logger.info("Num train examples: %d" % num_training_examples)
    logger.info("Max train steps: %d" % num_training_steps)
    logger.info("Num warmup steps: %d" %
                int(num_training_steps * args.warmup_proportion))

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Construct dict mapping parameter name -> attribute path; AdamWDL uses
    # it to locate each parameter's layer for layer-wise LR decay.
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n

    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        n_layers=model_config["num_hidden_layers"],
                        layerwise_decay=args.layerwise_decay,
                        name_dict=name_dict)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = ChunkEvaluator(label_list=train_ds.label_list)

    global_steps = 0
    # Factory for the recurrence memory carried across consecutive segments
    # of a long document (ERNIE-Doc style).
    create_memory = partial(init_memory, args.batch_size, args.memory_length,
                            model_config["hidden_size"],
                            model_config["num_hidden_layers"])
    # Copy the memory
    memories = create_memory()
    tic_train = time.time()
    best_f1 = 0
    for epoch in range(args.epochs):
        train_ds_iter.shuffle_sample()
        train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device())
        for step, batch in enumerate(train_dataloader, start=1):
            global_steps += 1
            input_ids, position_ids, token_type_ids, attn_mask, labels, lengths, qids, \
                gather_idx, need_cal_loss = batch
            logits, memories = model(input_ids, memories, token_type_ids,
                                     position_ids, attn_mask)
            # Keep only the positions belonging to complete examples.
            logits, labels = list(
                map(lambda x: paddle.gather(x, gather_idx), [logits, labels]))
            # need_cal_loss masks out batches whose loss should not count.
            loss = criterion(logits, labels) * need_cal_loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_steps % args.logging_steps == 0:
                logger.info(
                    "train: global step %d, epoch: %d, loss: %f, lr: %f, speed: %.2f step/s"
                    % (global_steps, epoch, loss, lr_scheduler.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_steps % args.save_steps == 0:
                # Evaluate
                logger.info("Eval:")
                precision, recall, f1_score = evaluate(model, metric,
                                                       eval_dataloader,
                                                       create_memory())
                # Save (only on the rank-0 trainer).
                if rank == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % (global_steps))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    # Keep a separate copy of the best-F1 checkpoint.
                    if f1_score > best_f1:
                        logger.info("Save best model......")
                        best_f1 = f1_score
                        best_model_dir = os.path.join(args.output_dir,
                                                      "best_model")
                        if not os.path.exists(best_model_dir):
                            os.makedirs(best_model_dir)
                        model_to_save.save_pretrained(best_model_dir)
                        tokenizer.save_pretrained(best_model_dir)
            if args.max_steps > 0 and global_steps >= args.max_steps:
                return
    logger.info("Final test result:")
    eval_acc = evaluate(model, metric, test_dataloader, create_memory())
def do_train(args):
    """Fine-tune ERNIE-M for XNLI cross-lingual sequence classification.

    Supports two modes via ``args.task_type``: "cross-lingual-transfer"
    (train on English only) and "translate-train-all" (train on all
    languages combined). Trains with AdamWDL (layer-wise LR decay) and
    optional AMP, evaluates across all languages every ``save_steps``,
    and checkpoints on the rank-0 trainer.

    Args:
        args: parsed CLI namespace; fields read here include device,
            model_name_or_path, max_seq_length, task_type, batch_size,
            dropout, max_steps, num_train_epochs, warmup_steps,
            warmup_proportion, learning_rate, adam_epsilon, weight_decay,
            layerwise_decay, use_amp, scale_loss, logging_steps,
            save_steps and output_dir.

    Raises:
        ValueError: if ``args.task_type`` is not a supported mode.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    if args.task_type == "cross-lingual-transfer":
        train_ds = load_dataset("xnli", "en", splits="train")
        train_ds = train_ds.map(trans_func, lazy=True)
    elif args.task_type == "translate-train-all":
        all_train_ds = []
        for language in all_languages:
            train_ds = load_dataset("xnli", language, splits="train")
            all_train_ds.append(train_ds.map(trans_func, lazy=True))
        train_ds = XnliDataset(all_train_ds)
    else:
        # BUG FIX: previously an unsupported task_type left `train_ds`
        # unbound and the function later failed with a confusing NameError.
        raise ValueError(
            "Unsupported task_type %r; expected 'cross-lingual-transfer' "
            "or 'translate-train-all'." % args.task_type)
    train_batch_sampler = DistributedBatchSampler(train_ds,
                                                  batch_size=args.batch_size,
                                                  shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
            ),  # position_ids
        Pad(axis=0, pad_val=0, dtype="int64"),  # attention_mask
        Stack(dtype="int64")  # labels
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    num_classes = 3  # XNLI labels: entailment / neutral / contradiction
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes, dropout=args.dropout)
    n_layers = model.ernie_m.config['num_hidden_layers']
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Either cap training at max_steps or derive steps from epoch count.
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    # LinearDecayWithWarmup interprets an int as a step count and a float
    # as a proportion of total steps.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Map parameter name -> attribute path; AdamWDL uses this to infer each
    # parameter's layer for layer-wise learning-rate decay.
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n

    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=args.adam_epsilon,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        n_layers=n_layers,
                        layerwise_decay=args.layerwise_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        name_dict=name_dict)

    loss_fct = nn.CrossEntropyLoss()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
    metric = Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, position_ids, attention_mask, labels = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
                logits = model(input_ids, position_ids, attention_mask)
                loss = loss_fct(logits, labels)
            if args.use_amp:
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                # scaler.minimize unscales gradients and runs optimizer.step.
                scaler.minimize(optimizer, scaled_loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                # Evaluate on the test split of every XNLI language.
                for language in all_languages:
                    tic_eval = time.time()
                    test_data_loader = get_test_dataloader(
                        args, language, batchify_fn, trans_func)
                    evaluate(model, loss_fct, metric, test_data_loader,
                             language)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                if paddle.distributed.get_rank() == 0:
                    # NOTE(review): despite the ".pdparams" suffix this path
                    # is used as a directory for save_pretrained.
                    output_dir = os.path.join(
                        args.output_dir,
                        "ernie_m_ft_model_%d.pdparams" % (global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                break
        if global_step >= num_training_steps:
            break

    # Final checkpoint after training completes.
    if paddle.distributed.get_rank() == 0:
        output_dir = os.path.join(
            args.output_dir, "ernie_m_final_model_%d.pdparams" % global_step)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Need better way to get inner model of DataParallel
        model_to_save = model._layers if isinstance(
            model, paddle.DataParallel) else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)