def train(args, model, train_data_loader, dev_data_loader, metric, rank):
    num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
    max_train_steps = args.epochs * len(train_data_loader)
    if rank == 0:
        print("Num train examples: %d" % num_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Warmup proportion: %f" % args.warmup_proportion)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_train_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    loss_fn = DGULossFunction(args.task_name)

    load_ckpt(args, model, optimizer)

    step = 0
    best_metric = 0.0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for batch in train_data_loader:
            step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    print_logs(args, step, logits, labels, loss, total_time,
                               metric)
                    total_time = 0.0
                if step % args.save_steps == 0 or step == max_train_steps:
                    save_ckpt(model, optimizer, args.output_dir, step)
                    if args.do_eval:
                        print('\nEval begin...')
                        metric_out = evaluation(args, model, dev_data_loader,
                                                metric)
                        if metric_out > best_metric:
                            best_metric = metric_out
                            save_ckpt(model, optimizer, args.output_dir,
                                      'best')
                            print('Best model, step: %d\n' % step)
            batch_start_time = time.time()
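The weight-decay filter above recurs throughout these scripts, so a minimal self-contained sketch may help; the toy `Block` layer and its sizes are illustrative assumptions. The key detail is that `n` is the attribute path (where "bias" and "norm" appear), while `p.name` is the framework-level parameter name that `AdamW` later passes to `apply_decay_param_fun`.

import paddle

class Block(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(4, 4)
        self.norm = paddle.nn.LayerNorm(4)

    def forward(self, x):
        return self.norm(self.linear(x))

model = Block()
# Collect framework-level names (p.name) of parameters whose attribute
# path (n) contains neither "bias" nor "norm".
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    parameters=model.parameters(),
    weight_decay=0.01,
    # AdamW calls this back with each parameter's name; only names in
    # decay_params receive weight decay.
    apply_decay_param_fun=lambda x: x in decay_params)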
def do_train(args):
    if not args.eager_run:
        paddle.enable_static()
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    # Loads or initializes a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-generator"]))
        discriminator = ElectraDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-discriminator"]))
        model = model_class(generator, discriminator)
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint.
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            for file_id, file_name in model_class.resource_files_names.items():
                full_file_name = os.path.join(args.model_name_or_path,
                                              file_name)
                # TODO: load the model checkpoint file.
        else:
            raise ValueError(
                "Initializing a model requires a pretrained model identifier "
                "or the path to a checkpoint directory. The supported model "
                "identifiers are as follows: {}".format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ElectraPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config["vocab_size"],
        model.gen_weight, model.disc_weight)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    print("start load data : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    train_dataset = BookCorpus(data_path=args.input_dir,
                               tokenizer=tokenizer,
                               max_seq_length=args.max_seq_length,
                               mode='train')
    print("load data done, total : %s s" % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForElectra(tokenizer=tokenizer,
                                           max_seq_length=args.max_seq_length,
                                           mlm=True,
                                           mlm_probability=args.mask_prob)
    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.train_batch_size,
        mode='train',
        use_gpu=True if args.n_gpu else False,
        data_collator=data_collator)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    print("start train : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, raw_input_ids, gen_labels = batch
            gen_logits, disc_logits, disc_labels = model(
                input_ids=input_ids,
                raw_input_ids=raw_input_ids,
                gen_labels=gen_labels)
            loss = criterion(gen_logits, disc_logits, gen_labels, disc_labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need a better way to get the inner model of DataParallel.
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    paddle.save(
                        model_to_save.state_dict(),
                        os.path.join(output_dir, "model_state.pdparams"))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir, "model_state.pdopt"))
def do_train():
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Reads label_map.
    label_map_path = os.path.join(args.data_path, "predicate2id.json")
    if not (os.path.exists(label_map_path) and
            os.path.isfile(label_map_path)):
        sys.exit("{} does not exist or is not a file.".format(label_map_path))
    with open(label_map_path, 'r', encoding='utf8') as fp:
        label_map = json.load(fp)
    num_classes = (len(label_map.keys()) - 2) * 2 + 2

    # Loads the pretrained ERNIE model.
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=num_classes)
    model = paddle.DataParallel(model)
    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    criterion = BCELossForDuIE()

    # Loads dataset.
    train_dataset = DuIEDataset.from_file(
        os.path.join(args.data_path, 'train_data.json'), tokenizer,
        args.max_seq_length, True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    collator = DataCollator()
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=collator,
                                   return_list=True)

    eval_file_path = os.path.join(args.data_path, 'dev_data.json')
    test_dataset = DuIEDataset.from_file(eval_file_path, tokenizer,
                                         args.max_seq_length, True)
    test_batch_sampler = paddle.io.BatchSampler(test_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=False,
                                                drop_last=True)
    test_data_loader = DataLoader(dataset=test_dataset,
                                  batch_sampler=test_batch_sampler,
                                  collate_fn=collator,
                                  return_list=True)

    # Defines learning rate strategy.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_ratio)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 10000
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        print("\n=====start training of %d epochs=====" % epoch)
        tic_epoch = time.time()
        model.train()
        for step, batch in enumerate(train_data_loader):
            (input_ids, seq_lens, tok_to_orig_start_index,
             tok_to_orig_end_index, labels) = batch
            logits = model(input_ids=input_ids)
            mask = (input_ids != 0).logical_and(input_ids != 1).logical_and(
                input_ids != 2)
            loss = criterion(logits, labels, mask)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()

            if (global_step % logging_steps == 0 and
                    paddle.distributed.get_rank() == 0):
                print(
                    "epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s"
                    % (epoch, args.num_train_epochs, step, steps_by_epoch,
                       loss_item, logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if (global_step % save_steps == 0 and global_step != 0 and
                    paddle.distributed.get_rank() == 0):
                print("\n=====start evaluating ckpt of %d steps=====" %
                      global_step)
                precision, recall, f1 = evaluate(model, criterion,
                                                 test_data_loader,
                                                 eval_file_path, "eval")
                print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
                      (100 * precision, 100 * recall, 100 * f1))
                print("saving checkpoint model_%d.pdparams to %s " %
                      (global_step, args.output_dir))
                paddle.save(
                    model.state_dict(),
                    os.path.join(args.output_dir,
                                 "model_%d.pdparams" % global_step))
                model.train()  # back to train mode
            global_step += 1
        tic_epoch = time.time() - tic_epoch
        print("epoch time footprint: %d hour %d min %d sec" %
              (tic_epoch // 3600, (tic_epoch % 3600) // 60, tic_epoch % 60))

    # Does final evaluation.
    if paddle.distributed.get_rank() == 0:
        print("\n=====start evaluating last ckpt of %d steps=====" %
              global_step)
        precision, recall, f1 = evaluate(model, criterion, test_data_loader,
                                         eval_file_path, "eval")
        print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
              (100 * precision, 100 * recall, 100 * f1))
        paddle.save(
            model.state_dict(),
            os.path.join(args.output_dir,
                         "model_%d.pdparams" % global_step))
        print("\n=====training complete=====")
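A minimal sketch of the special-token mask computed above; the assumption (true for the ernie-1.0 vocab) is that ids 0, 1 and 2 are [PAD], [CLS] and [SEP], and the toy token ids are illustrative:

import paddle

# Token ids 0/1/2 ([PAD]/[CLS]/[SEP] in the ernie-1.0 vocab) are excluded
# from the loss; all real content tokens are kept.
input_ids = paddle.to_tensor([[1, 75, 213, 2, 0, 0]])
mask = (input_ids != 0).logical_and(input_ids != 1).logical_and(input_ids != 2)
print(mask.numpy())  # [[False  True  True False False False]]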
def run(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    if args.version_2_with_negative:
        train_examples = load_dataset('squad_v2', split='train')
        dev_examples = load_dataset('squad_v2', split='validation')
    else:
        train_examples = load_dataset('squad', split='train')
        dev_examples = load_dataset('squad', split='validation')
    set_seed(args)
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)
    column_names = train_examples.column_names
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_ds = train_examples.map(partial(prepare_train_features,
                                              tokenizer=tokenizer,
                                              args=args),
                                      batched=True,
                                      remove_columns=column_names,
                                      num_proc=4)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "attention_mask": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64")
        }): fn(samples)

        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                (input_ids, token_type_ids, attention_mask, start_positions,
                 end_positions) = batch
                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids,
                               attention_mask=attention_mask)
                loss = criterion(logits, (start_positions, end_positions))
                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch + 1, step + 1, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if rank == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need a better way to get the inner model of DataParallel.
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)
                    if global_step == num_training_steps:
                        break

    if args.do_predict and rank == 0:
        dev_ds = dev_examples.map(partial(prepare_validation_features,
                                          tokenizer=tokenizer,
                                          args=args),
                                  batched=True,
                                  remove_columns=column_names,
                                  num_proc=4)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.batch_size, shuffle=False)
        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "attention_mask": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)

        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)
        evaluate(model, dev_data_loader, dev_examples, args)
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds, dev_ds, test_ds = load_dataset("chnsenticorp",
                                             splits=["train", "dev", "test"])

    # If you want to use a BERT/RoBERTa/ELECTRA pretrained model instead:
    # model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_classes=2)
    # model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_classes=2)
    # model = ppnlp.transformers.ElectraForSequenceClassification.from_pretrained('chinese-electra-small', num_classes=2)
    model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
        'ernie-tiny', num_classes=len(train_ds.label_list))

    # If you use a BERT/RoBERTa/ELECTRA pretrained model, pick the matching tokenizer:
    # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese')
    # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext')
    # tokenizer = ppnlp.transformers.ElectraTokenizer.from_pretrained('chinese-electra-small')
    # ErnieTinyTokenizer is specific to the ernie-tiny pretrained model.
    tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
        'ernie-tiny')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)
    test_data_loader = create_dataloader(test_ds,
                                         mode='test',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % 100 == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
    if rank == 0:
        print('Evaluating on test data.')
        evaluate(model, criterion, metric, test_data_loader)
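A minimal sketch of how the `Tuple(Pad, Pad, Stack)` collate function used above batches variable-length samples; the toy ids and the pad value 0 are assumptions for illustration:

from paddlenlp.data import Pad, Stack, Tuple

batchify_fn = Tuple(
    Pad(axis=0, pad_val=0),  # input_ids: pad to the longest in the batch
    Pad(axis=0, pad_val=0),  # token_type_ids
    Stack(dtype="int64"),    # labels: stack as-is
)
samples = [
    ([1, 5, 8, 2], [0, 0, 0, 0], 1),
    ([1, 9, 2], [0, 0, 0], 0),
]
input_ids, token_type_ids, labels = batchify_fn(samples)
print(input_ids)  # the shorter sample is padded: [[1 5 8 2], [1 9 2 0]]
print(labels)     # [1 0]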
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) args.task_name = args.task_name.lower() metric_class = METRIC_CLASSES[args.task_name] args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] train_ds = load_dataset('glue', args.task_name, splits="train") tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial( convert_example, tokenizer=tokenizer, label_list=train_ds.label_list, max_seq_length=args.max_seq_length) train_ds = train_ds.map(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment Stack(dtype="int64" if train_ds.label_list else "float32") # label ): fn(samples) train_data_loader = DataLoader( dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) if args.task_name == "mnli": dev_ds_matched, dev_ds_mismatched = load_dataset( 'glue', args.task_name, splits=["dev_matched", "dev_mismatched"]) dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True) dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True) dev_batch_sampler_matched = paddle.io.BatchSampler( dev_ds_matched, batch_size=args.batch_size, shuffle=False) dev_data_loader_matched = DataLoader( dataset=dev_ds_matched, batch_sampler=dev_batch_sampler_matched, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_batch_sampler_mismatched = paddle.io.BatchSampler( dev_ds_mismatched, batch_size=args.batch_size, shuffle=False) dev_data_loader_mismatched = DataLoader( dataset=dev_ds_mismatched, batch_sampler=dev_batch_sampler_mismatched, collate_fn=batchify_fn, num_workers=0, return_list=True) else: dev_ds = load_dataset('glue', args.task_name, splits='dev') dev_ds = dev_ds.map(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader( dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) num_labels = 1 if train_ds.label_list == None else len(train_ds.label_list) model = model_class.from_pretrained( args.model_name_or_path, num_classes=num_labels) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) # Step1: Initialize a dictionary to save the weights from the origin BERT model. origin_weights = {} for name, param in model.named_parameters(): origin_weights[name] = param # Step2: Convert origin model to supernet. sp_config = supernet(expand_ratio=args.width_mult_list) model = Convert(sp_config).convert(model) # Use weights saved in the dictionary to initialize supernet. utils.set_state_dict(model, origin_weights) del origin_weights # Step3: Define teacher model. teacher_model = model_class.from_pretrained( args.model_name_or_path, num_classes=num_labels) # Step4: Config about distillation. mapping_layers = ['bert.embeddings'] for idx in range(model.bert.config['num_hidden_layers']): mapping_layers.append('bert.encoder.layers.{}'.format(idx)) default_distill_config = { 'lambda_distill': 0.1, 'teacher_model': teacher_model, 'mapping_layers': mapping_layers, } distill_config = DistillConfig(**default_distill_config) # Step5: Config in supernet training. 
    ofa_model = OFA(model,
                    distill_config=distill_config,
                    elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    if args.task_name == "mnli":
        dev_data_loader = (dev_data_loader_matched,
                           dev_data_loader_mismatched)

    # Step 6: Calculate the importance of neurons and heads,
    # and then reorder them according to that importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.bert.config['num_hidden_layers'],
        num_heads=model.bert.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=ofa_model.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in ofa_model.model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        # Step 7: Set the current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            for width_mult in args.width_mult_list:
                # Step 8: Broadcast the supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(
                    input_ids, segment_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                if args.task_name == 'sts-b':
                    logit_loss = 0.0
                else:
                    logit_loss = soft_cross_entropy(logits,
                                                    teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            ofa_model.model.clear_grad()

            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if args.task_name == "mnli":
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader_matched,
                             width_mult=100)
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader_mismatched,
                             width_mult=100)
                else:
                    evaluate(teacher_model,
                             criterion,
                             metric,
                             dev_data_loader,
                             width_mult=100)
                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    if args.task_name == "mnli":
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader_matched, width_mult)
                        evaluate(ofa_model, criterion, metric,
                                 dev_data_loader_mismatched, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))
                    else:
                        acc = evaluate(ofa_model, criterion, metric,
                                       dev_data_loader, width_mult)
                        print("eval done total : %s s" %
                              (time.time() - tic_eval))
                    if (not args.n_gpu > 1) or \
                            paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need a better way to get the inner model of DataParallel.
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) set_seed(args) train_ds, dev_ds, test_ds = load_dataset( 'dureader_yesno', splits=['train', 'dev', 'test']) trans_func = partial(convert_example, tokenizer=tokenizer) train_batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id), 'labels': Stack(dtype="int64") }): fn(samples) test_batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id), 'id': Stack() }): fn(samples) train_ds = train_ds.map(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_data_loader = DataLoader( dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=train_batchify_fn, return_list=True) dev_ds = dev_ds.map(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader( dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=train_batchify_fn, return_list=True) test_ds = test_ds.map(trans_func, lazy=True) test_batch_sampler = paddle.io.BatchSampler( test_ds, batch_size=args.batch_size, shuffle=False) test_data_loader = DataLoader( dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=test_batchify_fn, return_list=True) model = model_class.from_pretrained( args.model_name_or_path, num_classes=len(train_ds.label_list)) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) criterion = paddle.nn.loss.CrossEntropyLoss() metric = paddle.metric.Accuracy() global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, label = batch logits = model(input_ids=input_ids, token_type_ids=segment_ids) loss = criterion(logits, label) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 or global_step == num_training_steps: if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: evaluate(model, metric, dev_data_loader) output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model 
model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print('Saving checkpoint to:', output_dir) if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: predictions = predict(model, test_data_loader) with open('prediction.json', "w") as writer: writer.write( json.dumps( predictions, ensure_ascii=False, indent=4) + "\n")
def run(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    set_seed(args)

    train_examples, dev_examples, test_examples = load_dataset(
        'cmrc2018', split=["train", "validation", "test"])

    column_names = train_examples.column_names
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = AutoModelForQuestionAnswering.from_pretrained(
        args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the
        # overflows using a stride. This results in one example possibly giving
        # several features when a context is long, each of those features having
        # a context that overlaps a bit with the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's
        # prepare_train_features function. The main difference is that
        # HuggingFace uses ArrowTable as the basic data structure, while we use
        # a list of dictionaries instead.
        contexts = examples['context']
        questions = examples['question']

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # Since one example might give us several features if it has a long
        # context, we need a map from a feature to its corresponding example.
        # This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample")
        # The offset mappings will give us a map from token to character
        # position in the original context. This will help us compute the
        # start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is
            # the context and what is the question).
            sequence_ids = tokenized_examples['token_type_ids'][i]

            # One example can give several spans; this is the index of the
            # example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples['answers'][sample_index]
            # If no answers are given, set the cls_index as the answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1
                token_end_index -= 1

                # Detect if the answer is out of the span (in which case this
                # feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and
                        offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move token_start_index and token_end_index to
                    # the two ends of the answer.
                    # Note: we could go after the last offset if the answer is
                    # the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)

        return tokenized_examples

    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the
        # overflows using a stride. This results in one example possibly giving
        # several features when a context is long, each of those features having
        # a context that overlaps a bit with the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's
        # prepare_train_features function. The main difference is that
        # HuggingFace uses ArrowTable as the basic data structure, while we use
        # a list of dictionaries instead.
        contexts = examples['context']
        questions = examples['question']

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length,
                                       return_attention_mask=True)

        # Since one example might give us several features if it has a long
        # context, we need a map from a feature to its corresponding example.
        # This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample")

        # For evaluation, we will need to convert our predictions to substrings
        # of the context, so we keep the corresponding example_id, and we will
        # store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is
            # the context and what is the question).
            sequence_ids = tokenized_examples['token_type_ids'][i]
            context_index = 1

            # One example can give several spans; this is the index of the
            # example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set to None the offset_mapping entries that are not part of the
            # context, so it's easy to determine whether a token position is
            # part of the context or not.
tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if args.do_train: args.batch_size = int(args.batch_size / args.gradient_accumulation_steps) train_ds = train_examples.map(prepare_train_features, batched=True, remove_columns=column_names, num_proc=1) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_batchify_fn = lambda samples, fn=Dict( { "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), "start_positions": Stack(dtype="int64"), "end_positions": Stack(dtype="int64") }): fn(samples) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=train_batchify_fn, return_list=True) dev_ds = dev_examples.map(prepare_validation_features, batched=True, remove_columns=column_names, num_proc=1) dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.eval_batch_size, shuffle=False) dev_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id) }): fn(samples) dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=dev_batchify_fn, return_list=True) num_training_steps = int( args.max_steps / args.gradient_accumulation_steps) if args.max_steps > 0 else int( len(train_data_loader) * args.num_train_epochs / args.gradient_accumulation_steps) lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, token_type_ids, start_positions, end_positions = batch
                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids)
                loss = criterion(logits, (start_positions, end_positions))
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, loss,
                               args.logging_steps / (time.time() - tic_train)))
                        tic_train = time.time()
                    if global_step % args.save_steps == 0 or global_step == num_training_steps:
                        if rank == 0:
                            output_dir = os.path.join(
                                args.output_dir, "model_%d" % global_step)
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            # Need a better way to get the inner model of DataParallel.
                            model_to_save = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            model_to_save.save_pretrained(output_dir)
                            tokenizer.save_pretrained(output_dir)
                            print('Saving checkpoint to:', output_dir)
                        if global_step == num_training_steps:
                            break

        evaluate(model, dev_examples, dev_data_loader, args)

    if args.do_predict and rank == 0:
        test_ds = test_examples.map(prepare_validation_features,
                                    batched=True,
                                    remove_columns=column_names,
                                    num_proc=1)
        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)
        test_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)
        test_data_loader = DataLoader(dataset=test_ds,
                                      batch_sampler=test_batch_sampler,
                                      collate_fn=test_batchify_fn,
                                      return_list=True)

        evaluate(model, test_examples, test_data_loader, args, do_eval=False)
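A minimal sketch of the gradient-accumulation pattern used in the loop above, with a toy linear classifier; the accumulation factor of 2 and the shapes are assumptions for illustration:

import paddle
import paddle.nn.functional as F

accum_steps = 2  # assumed factor; mirrors args.gradient_accumulation_steps
model = paddle.nn.Linear(4, 2)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4,
                                   parameters=model.parameters())
for step in range(8):
    x = paddle.randn([8, 4])
    y = paddle.randint(0, 2, [8])
    loss = F.cross_entropy(model(x), y)
    # Scale so the accumulated gradient matches one large-batch step.
    (loss / accum_steps).backward()
    if (step + 1) % accum_steps == 0:
        optimizer.step()      # one optimizer update per accum_steps batches
        optimizer.clear_grad()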
def do_train(args):
    # Initialize the paddle and paddle fleet execution environment.
    paddle.enable_static()
    place = paddle.set_device(args.device)
    fleet.init(is_collective=True)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()

    # Create the random seed for the worker.
    set_seed(args.seed)
    worker_init = WorkerInitObj(args.seed + worker_index)

    # Define the input data in static mode.
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    data_holders = create_data_holder(args)
    [
        input_ids, segment_ids, input_mask, masked_lm_positions,
        masked_lm_labels, next_sentence_labels, masked_lm_scale
    ] = data_holders

    # Define the model structure in static mode.
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    config = model_class.pretrained_init_configuration[
        args.model_name_or_path]
    if config["vocab_size"] % 8 != 0:
        config["vocab_size"] += 8 - (config["vocab_size"] % 8)
    model = BertForPretraining(BertModel(**config))
    criterion = BertPretrainingCriterion(model.bert.config["vocab_size"])
    prediction_scores, seq_relationship_score = model(
        input_ids=input_ids,
        token_type_ids=segment_ids,
        attention_mask=input_mask,
        masked_positions=masked_lm_positions)
    loss = criterion(prediction_scores, seq_relationship_score,
                     masked_lm_labels, next_sentence_labels, masked_lm_scale)

    # Define the dynamic learning-rate scheduler and optimizer. Note that
    # args.max_steps is expected to be positive here, since train_data_loader
    # is only created inside the file loop below.
    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        multi_precision=args.use_pure_fp16)

    if worker_num == 1 and args.use_amp:
        custom_black_list = (['lookup_table', 'lookup_table_v2']
                             if args.use_pure_fp16 else None)
        amp_list = paddle.static.amp.AutoMixedPrecisionLists(
            custom_white_list=['softmax', 'layer_norm', 'gelu'],
            custom_black_list=custom_black_list)
        optimizer = paddle.static.amp.decorate(
            optimizer,
            amp_list,
            init_loss_scaling=args.scale_loss,
            use_dynamic_loss_scaling=True,
            use_pure_fp16=args.use_pure_fp16)

    if worker_num > 1:
        # Use the fleet API to compile the distributed optimizer.
        optimizer = dist_optimizer(args, optimizer)
    optimizer.minimize(loss)

    # Define the Executor for running the static model.
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    state_dict = model.state_dict()

    # Use the state dict to update the parameters.
    reset_state_dict = reset_program_state_dict(model, state_dict)
    paddle.static.set_program_state(main_program, reset_state_dict)

    if args.use_amp:
        optimizer.amp_init(place)

    if worker_num == 1:
        # Construct the compiled program.
        main_program = build_compiled_program(main_program, loss)

    pool = ThreadPoolExecutor(1)
    global_step = 0
    tic_train = time.time()
    epoch = 0
    while True:
        files = [
            os.path.join(args.input_dir, f)
            for f in os.listdir(args.input_dir)
            if os.path.isfile(os.path.join(args.input_dir, f)) and
            "training" in f
        ]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
        f_start_id = 0

        # Select one file for each worker and create the DataLoader for the file.
        data_file = select_dataset_file_for_each_worker(
            files, f_start_id, worker_num, worker_index)
        train_data_loader, _ = create_pretraining_dataset(
            data_file, args.max_predictions_per_seq, args, data_holders,
            worker_init, paddle.static.cuda_places())

        for f_id in range(f_start_id + 1, len(files)):
            data_file = select_dataset_file_for_each_worker(
                files, f_id, worker_num, worker_index)
            dataset_future = pool.submit(create_pretraining_dataset,
                                         data_file,
                                         args.max_predictions_per_seq, args,
                                         data_holders, worker_init,
                                         paddle.static.cuda_places())

            train_reader_cost = 0.0
            train_run_cost = 0.0
            total_samples = 0
            reader_start = time.time()
            for step, batch in enumerate(train_data_loader):
                train_reader_cost += time.time() - reader_start
                global_step += 1
                train_start = time.time()
                loss_return = exe.run(main_program,
                                      feed=batch,
                                      fetch_list=[loss])
                train_run_cost += time.time() - train_start
                total_samples += args.batch_size
                # In the 2.0 API, this must be called to update the learning rate.
                lr_scheduler.step()
                if global_step % args.logging_steps == 0:
                    print(
                        "global step: %d, epoch: %d, batch: %d, loss: %f, "
                        "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
                        % (global_step, epoch, step, loss_return[0],
                           train_reader_cost / args.logging_steps,
                           (train_reader_cost + train_run_cost) /
                           args.logging_steps,
                           total_samples / args.logging_steps,
                           total_samples /
                           (train_reader_cost + train_run_cost)))
                    train_reader_cost = 0.0
                    train_run_cost = 0.0
                    total_samples = 0
                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # TODO(fangzeyang): Update save_params to paddle.static.
                        paddle.fluid.io.save_params(exe, output_dir)
                        tokenizer.save_pretrained(output_dir)
                if global_step >= args.max_steps:
                    reader_start = time.time()
                    del train_data_loader
                    return
                reader_start = time.time()

            del train_data_loader
            train_data_loader, data_file = dataset_future.result(timeout=None)
        epoch += 1
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('clue', args.task_name, splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(
        train_ds.label_list)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    best_acc = 0.0
    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                acc = evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if acc > best_acc:
                    best_acc = acc
            if global_step >= num_training_steps:
                print("best_acc: ", best_acc)
                return
    print("best_acc: ", best_acc)
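A minimal sketch of `LinearDecayWithWarmup` as used throughout these scripts; the numbers are assumptions for illustration. The third argument accepts either an int (warmup steps) or a float (warmup proportion), which is why some scripts pass `warmup_steps` and others `warmup_proportion`:

from paddlenlp.transformers import LinearDecayWithWarmup

total_steps = 100
scheduler = LinearDecayWithWarmup(2e-5, total_steps, 0.1)  # 10 warmup steps
for _ in range(5):
    scheduler.step()
print(scheduler.get_lr())  # ~1e-5: halfway up the warmup ramp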
def do_train(args):
    if not args.eager_run:
        paddle.enable_static()
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    # Loads or initializes a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-generator"]))
        discriminator = ElectraDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-discriminator"]))
        model = model_class(generator, discriminator)
        args.init_from_ckpt = False
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint.
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            with open(os.path.join(args.model_name_or_path,
                                   "run_states.json"), 'r') as f:
                config_dict = json.load(f)
                model_name = config_dict["model_name"]
            if model_name in pretrained_models:
                generator = ElectraGenerator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + "-generator"]))
                discriminator = ElectraDiscriminator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + "-discriminator"]))
                model = model_class(generator, discriminator)
                model.set_state_dict(
                    paddle.load(
                        os.path.join(args.model_name_or_path,
                                     "model_state.pdparams")))
            else:
                raise ValueError(
                    "Initializing a model from a checkpoint requires a "
                    "model_name in the model config file. The supported "
                    "model_names are as follows: {}".format(
                        tokenizer_class.pretrained_init_configuration.keys()))
        else:
            raise ValueError(
                "Initializing a model requires a pretrained model identifier "
                "or the directory storing the model. If a directory is used, "
                "make sure init_from_ckpt is set to True. The supported "
                "model identifiers are as follows: {}".format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ElectraPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config["vocab_size"],
        model.gen_weight, model.disc_weight)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    print("start load data : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    train_dataset = BookCorpus(data_path=args.input_dir,
                               tokenizer=tokenizer,
                               max_seq_length=args.max_seq_length,
                               mode='train')
    print("load data done, total : %s s" % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForElectra(tokenizer=tokenizer,
                                           max_seq_length=args.max_seq_length,
                                           mlm=True,
                                           mlm_probability=args.mask_prob)
    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.train_batch_size,
        mode='train',
        use_gpu=args.device == "gpu",
        data_collator=data_collator)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    print("start train : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    trained_global_step = global_step = 0
    t_loss = paddle.to_tensor([0.0])
    log_loss = paddle.to_tensor([0.0])
    loss_list = []
    log_list = []
    tic_train = time.time()
    if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
        optimizer.set_state_dict(
            paddle.load(
                os.path.join(args.model_name_or_path, "model_state.pdopt")))
        trained_global_step = global_step = config_dict["global_step"]
        if trained_global_step < num_training_steps:
            print(
                "[ start train from checkpoint ] we have already trained %s steps, seeking next step : %s"
                % (trained_global_step, trained_global_step + 1))
        else:
            print(
                "[ start train from checkpoint ] we have already trained %s steps, but the total number of training steps is %s, please check the configuration!"
                % (trained_global_step, num_training_steps))
            exit(0)

    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            if trained_global_step > 0:
                trained_global_step -= 1
                continue
            global_step += 1
            input_ids, raw_input_ids, gen_labels = batch
            if args.use_amp:
                with paddle.amp.auto_cast():
                    gen_logits, disc_logits, disc_labels, attention_mask = model(
                        input_ids=input_ids,
                        raw_input_ids=raw_input_ids,
                        gen_labels=gen_labels)
                    loss = criterion(gen_logits, disc_logits, gen_labels,
                                     disc_labels, attention_mask)
                scaled = scaler.scale(loss)
                scaled.backward()
                t_loss += loss.detach()
                scaler.minimize(optimizer, scaled)
            else:
                gen_logits, disc_logits, disc_labels, attention_mask = model(
                    input_ids=input_ids,
                    raw_input_ids=raw_input_ids,
                    gen_labels=gen_labels)
                loss = criterion(gen_logits, disc_logits, gen_labels,
                                 disc_labels, attention_mask)
                loss.backward()
                t_loss += loss.detach()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                local_loss = (t_loss - log_loss) / args.logging_steps
                if paddle.distributed.get_world_size() > 1:
                    paddle.distributed.all_gather(loss_list, local_loss)
                    if paddle.distributed.get_rank() == 0:
                        log_str = (
                            "global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, "
                            "avg_loss: {4:.15f}, lr: {5:.10f}, speed: {6:.2f} s/it"
                        ).format(global_step, num_training_steps, epoch, step,
                                 float((paddle.stack(loss_list).sum() /
                                        len(loss_list)).numpy()),
                                 optimizer.get_lr(),
                                 (time.time() - tic_train) /
                                 args.logging_steps)
                        print(log_str)
                        log_list.append(log_str)
                    loss_list = []
                else:
                    log_str = (
                        "global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, "
                        "loss: {4:.15f}, lr: {5:.10f}, speed: {6:.2f} s/it"
                    ).format(global_step, num_training_steps, epoch, step,
                             float(local_loss.numpy()), optimizer.get_lr(),
                             (time.time() - tic_train) / args.logging_steps)
                    print(log_str)
                    log_list.append(log_str)
                log_loss = t_loss
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    config_to_save = copy.deepcopy(
                        model_to_save.discriminator.electra.config)
                    if 'self' in config_to_save:
                        del config_to_save['self']
                    run_states = {
                        "model_name": model_name
                        if args.init_from_ckpt else args.model_name_or_path,
                        "global_step": global_step,
                        "epoch": epoch,
                        "step": step,
                    }
                    with open(os.path.join(output_dir, "model_config.json"),
                              'w') as f:
                        json.dump(config_to_save, f)
                    with open(os.path.join(output_dir, "run_states.json"),
                              'w') as f:
                        json.dump(run_states, f)
                    paddle.save(
                        model_to_save.state_dict(),
                        os.path.join(output_dir, "model_state.pdparams"))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(
                        optimizer.state_dict(),
                        os.path.join(output_dir, "model_state.pdopt"))
                    if len(log_list) > 0:
                        with open(os.path.join(output_dir, "train.log"),
                                  'w') as f:
                            for log in log_list:
                                if len(log.strip()) > 0:
                                    f.write(log.strip() + '\n')
            if global_step >= num_training_steps:
                return
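A minimal sketch of the dynamic-graph AMP pattern above (`auto_cast` plus `GradScaler`); the toy model, shapes, and learning rate are assumptions for illustration:

import paddle

model = paddle.nn.Linear(4, 2)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4,
                                   parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.randn([8, 4])
with paddle.amp.auto_cast():        # run the forward pass in mixed precision
    loss = paddle.mean(model(x))
scaled = scaler.scale(loss)         # scale the loss to avoid fp16 underflow
scaled.backward()
scaler.minimize(optimizer, scaled)  # unscale gradients and apply the update
optimizer.clear_grad()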
def do_train(args): # Set the paddle execution environment paddle.enable_static() place = paddle.set_device(args.select_device) set_seed(args) # Create the main_program for the training and dev_program for the validation main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() dev_program = paddle.static.Program() # Get the configuration of tokenizer and model args.task_name = args.task_name.lower() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] dataset_class, metric_class = TASK_CLASSES[args.task_name] # Create the tokenizer and dataset tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) train_dataset = dataset_class.get_datasets(["train"]) trans_func = partial(convert_example, tokenizer=tokenizer, label_list=train_dataset.get_labels(), max_seq_length=args.max_seq_length) train_dataset = train_dataset.apply(trans_func, lazy=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment Stack(dtype="int64" if train_dataset.get_labels() else "float32") # label ): [data for i, data in enumerate(fn(samples))] train_batch_sampler = paddle.io.BatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True) feed_list_name = [] # Define the input data and create the train/dev data_loader with paddle.static.program_guard(main_program, startup_program): [input_ids, segment_ids, labels] = create_data_holder(args.task_name) train_data_loader = DataLoader(dataset=train_dataset, feed_list=[input_ids, segment_ids, labels], batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=False) if args.task_name == "mnli": dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets( ["dev_matched", "dev_mismatched"]) dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True) dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func, lazy=True) dev_batch_sampler_matched = paddle.io.BatchSampler( dev_dataset_matched, batch_size=args.batch_size, shuffle=False) dev_data_loader_matched = DataLoader( dataset=dev_dataset_matched, batch_sampler=dev_batch_sampler_matched, feed_list=[input_ids, segment_ids, labels], collate_fn=batchify_fn, num_workers=0, return_list=False) dev_batch_sampler_mismatched = paddle.io.BatchSampler( dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False) dev_data_loader_mismatched = DataLoader( dataset=dev_dataset_mismatched, feed_list=[input_ids, segment_ids, labels], batch_sampler=dev_batch_sampler_mismatched, collate_fn=batchify_fn, num_workers=0, return_list=False) else: dev_dataset = dataset_class.get_datasets(["dev"]) dev_dataset = dev_dataset.apply(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_dataset, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader( dataset=dev_dataset, feed_list=[input_ids, segment_ids, labels], batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=False) # Create the training-forward program, and clone it for the validation with paddle.static.program_guard(main_program, startup_program): num_class = 1 if train_dataset.get_labels() is None else len( train_dataset.get_labels()) model, pretrained_state_dict = model_class.from_pretrained( args.model_name_or_path, num_classes=num_class) loss_fct = paddle.nn.loss.CrossEntropyLoss( ) if train_dataset.get_labels() else paddle.nn.loss.MSELoss() logits =
model(input_ids, segment_ids) loss = loss_fct(logits, labels) dev_program = main_program.clone(for_test=True) # Create the training-backward program; this pass will not be # executed during validation num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs with paddle.static.program_guard(main_program, startup_program): lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) optimizer.minimize(loss) # Create the metric pass for the validation with paddle.static.program_guard(dev_program, startup_program): metric = metric_class() correct = metric.compute(logits, labels) # Initialize the fine-tuning parameters: load the parameters available in # the pre-trained model, and initialize those not in the pre-trained model # from a normal distribution. exe = paddle.static.Executor(place) exe.run(startup_program) state_dict = model.state_dict() reset_state_dict = reset_program_state_dict(args, model, state_dict, pretrained_state_dict) paddle.static.set_program_state(main_program, reset_state_dict) global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 loss_return = exe.run(main_program, feed=batch, fetch_list=[loss]) if global_step % args.logging_steps == 0: logger.info( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss_return[0], args.logging_steps / (time.time() - tic_train))) tic_train = time.time() lr_scheduler.step() if global_step % args.save_steps == 0: # Validation pass, record the loss and metric if args.task_name == "mnli": evaluate(exe, metric, loss, correct, dev_program, dev_data_loader_matched) evaluate(exe, metric, loss, correct, dev_program, dev_data_loader_mismatched) else: evaluate(exe, metric, loss, correct, dev_program, dev_data_loader) output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) paddle.fluid.io.save_params(exe, output_dir) tokenizer.save_pretrained(output_dir)
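# The static-graph script above relies on ordering: main_program is cloned
# for evaluation *before* optimizer.minimize(loss) appends the backward and
# update ops, so dev_program stays forward-only. A minimal, self-contained
# sketch of that ordering (toy network; names and sizes are illustrative):
import paddle

paddle.enable_static()
main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()
with paddle.static.program_guard(main_program, startup_program):
    x = paddle.static.data(name="x", shape=[None, 4], dtype="float32")
    y = paddle.static.data(name="y", shape=[None, 1], dtype="float32")
    pred = paddle.static.nn.fc(x, size=1)
    loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))
    dev_program = main_program.clone(for_test=True)  # clone before backward
    paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_program)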
def do_train(args): paddle.set_device(args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args.seed) label_normalize_json = os.path.join("./label_normalized", args.task_name + ".json") # Ernie Model model = ErnieForPretraining.from_pretrained(args.language_model) tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained( args.language_model) # Load the label-normalization map label_norm_dict = None with open(label_normalize_json, 'r', encoding="utf-8") as f: label_norm_dict = json.load(f) convert_example_fn = convert_example if args.task_name != "chid" else convert_chid_example evaluate_fn = do_evaluate if args.task_name != "chid" else do_evaluate_chid predict_fn = do_predict if args.task_name != "chid" else do_predict_chid # load dataset train_ds, public_test_ds, test_ds = load_dataset("fewclue", name=args.task_name, splits=("train_0", "test_public", "test")) # Task-related transform operations, e.g. number label -> text label, English -> Chinese transform_fn = partial(transform_fn_dict[args.task_name], label_normalize_dict=label_norm_dict, pattern_id=args.pattern_id) # Task-related transform operations, e.g. number label -> text label, English -> Chinese transform_test_fn = partial(transform_fn_dict[args.task_name], label_normalize_dict=label_norm_dict, is_test=True, pattern_id=args.pattern_id) # The few-shot learning strategy is defined by transform_fn. # Note: set lazy=False to transform each example in place immediately, # because transform_fn should be executed only once even when # train_ds is iterated multiple times train_ds = train_ds.map(transform_fn, lazy=False) public_test_ds = public_test_ds.map(transform_fn, lazy=False) test_ds = test_ds.map(transform_test_fn, lazy=False) # dataloader if args.task_name == "chid": # [src_ids, token_type_ids, masked_positions, masked_lm_labels, candidate_labels_ids] batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_ids Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type_ids Stack(dtype="int64"), # masked_positions Stack(dtype="int64"), # masked_lm_labels Stack(dtype="int64" ), # candidate_labels_ids [candidate_num, label_length] ): [data for data in fn(samples)] batchify_test_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_ids Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type_ids Stack(dtype="int64"), # masked_positions Stack(dtype="int64" ), # candidate_labels_ids [candidate_num, label_length] ): [data for data in fn(samples)] else: # [src_ids, token_type_ids, masked_positions, masked_lm_labels] batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_ids Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type_ids Stack(dtype="int64"), # masked_positions Stack(dtype="int64"), # masked_lm_labels ): [data for data in fn(samples)] batchify_test_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_ids Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type_ids Stack(dtype="int64"), # masked_positions ): [data for data in fn(samples)] trans_func = partial(convert_example_fn, tokenizer=tokenizer, max_seq_length=args.max_seq_length) trans_test_func = partial(convert_example_fn, tokenizer=tokenizer, max_seq_length=args.max_seq_length, is_test=True) train_data_loader = create_dataloader(train_ds, mode='train', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) public_test_data_loader =
create_dataloader(public_test_ds, mode='eval', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) test_data_loader = create_dataloader(test_ds, mode='eval', batch_size=args.batch_size, batchify_fn=batchify_test_fn, trans_fn=trans_test_func) num_training_steps = len(train_data_loader) * args.epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) # Load the model checkpoint if one is given if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) print("warmup from:{}".format(args.init_from_ckpt)) mlm_loss_fn = ErnieMLMCriterion() rdrop_loss = ppnlp.losses.RDropLoss() max_test_acc = 0.0 global_step = 0 tic_train = time.time() for epoch in range(1, args.epochs + 1): model.train() for step, batch in enumerate(train_data_loader, start=1): src_ids = batch[0] token_type_ids = batch[1] masked_positions = batch[2] masked_lm_labels = batch[3] max_len = src_ids.shape[1] new_masked_positions = [] for bs_index, mask_pos in enumerate(masked_positions.numpy()): for pos in mask_pos: new_masked_positions.append(bs_index * max_len + pos) new_masked_positions = paddle.to_tensor( np.array(new_masked_positions).astype('int32')) prediction_scores = model(input_ids=src_ids, token_type_ids=token_type_ids, masked_positions=new_masked_positions) if args.rdrop_coef > 0: prediction_scores_2 = model( input_ids=src_ids, token_type_ids=token_type_ids, masked_positions=new_masked_positions) ce_loss = ( mlm_loss_fn(prediction_scores, masked_lm_labels) + mlm_loss_fn(prediction_scores_2, masked_lm_labels)) * 0.5 kl_loss = rdrop_loss(prediction_scores, prediction_scores_2) loss = ce_loss + kl_loss * args.rdrop_coef else: loss = mlm_loss_fn(prediction_scores, masked_lm_labels) global_step += 1 if global_step % 10 == 0 and rank == 0: print( "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, 10 / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if rank == 0: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) save_param_path = os.path.join(save_dir, 'model_state.pdparams') paddle.save(model.state_dict(), save_param_path) tokenizer.save_pretrained(save_dir) test_accuracy, total_num = evaluate_fn(model, tokenizer, public_test_data_loader, label_norm_dict) print("epoch:{}, test_accuracy:{:.3f}, total_num:{}".format( epoch, test_accuracy, total_num))
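# The few-shot script above re-indexes per-example masked positions into
# the flattened [batch * seq_len] space expected by the masked-LM gather,
# i.e. flat_pos = example_index * max_len + pos. A toy NumPy sketch of
# that re-indexing (shapes are illustrative):
import numpy as np

masked_positions = np.array([[1, 3], [0, 2]])  # [batch, masks_per_example]
max_len = 5                                    # padded sequence length
flat_positions = [
    bs_index * max_len + int(pos)
    for bs_index, mask_pos in enumerate(masked_positions)
    for pos in mask_pos
]
print(flat_positions)  # [1, 3, 5, 7]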
def do_train(args): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) set_seed(args) train_ds, dev_ds, test_ds = ppnlp.datasets.DuReaderYesNo.get_datasets( ['train', 'dev', 'test']) trans_func = partial(convert_example, tokenizer=tokenizer, label_list=train_ds.get_labels(), max_seq_length=args.max_seq_length) train_ds = train_ds.apply(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment Stack(), # length Stack(dtype="int64"), # start_pos ): [data for i, data in enumerate(fn(samples)) if i != 2] train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, return_list=True) dev_ds = dev_ds.apply(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, return_list=True) test_trans_func = partial(convert_example, tokenizer=tokenizer, label_list=train_ds.get_labels(), max_seq_length=args.max_seq_length, is_test=True) test_ds = test_ds.apply(test_trans_func, lazy=True) test_batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=args.batch_size, shuffle=False) test_batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment Stack() # length ): fn(samples) test_data_loader = DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn, return_list=True) model = model_class.from_pretrained(args.model_name_or_path, num_classes=3) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_ds.examples) // args.batch_size * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) criterion = paddle.nn.loss.CrossEntropyLoss() metric = paddle.metric.Accuracy() global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, label = batch logits = model(input_ids=input_ids, token_type_ids=segment_ids) loss = criterion(logits, label) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_gradients() if global_step % args.save_steps == 0: if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not 
os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print('Saving checkpoint to:', output_dir) if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: evaluate(model, metric, dev_data_loader) if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: evaluate(model, metric, test_data_loader, True)
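# Several loops above repeat the same checkpointing idiom: only rank 0
# writes files, and the inner Layer is unwrapped from paddle.DataParallel
# first. A hedged sketch of that idiom as a helper (save_checkpoint is a
# hypothetical name, not from these scripts):
import os
import paddle

def save_checkpoint(model, tokenizer, output_dir):
    if paddle.distributed.get_rank() != 0:
        return  # avoid concurrent writes from other workers
    os.makedirs(output_dir, exist_ok=True)
    inner = model._layers if isinstance(model, paddle.DataParallel) else model
    inner.save_pretrained(output_dir)      # saves weights and model config
    tokenizer.save_pretrained(output_dir)  # saves vocab and tokenizer config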
def train(): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() model = ErnieForGeneration.from_pretrained(args.model_name_or_path) if "ernie-tiny" in args.model_name_or_path: tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path) elif "ernie" in args.model_name_or_path: tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path) elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path: tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path) elif "electra" in args.model_name_or_path: tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path) else: tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) if args.init_checkpoint: model_state = paddle.load(args.init_checkpoint) model.set_state_dict(model_state) train_dataset, dev_dataset = load_dataset( 'poetry', splits=('train', 'dev'), lazy=False) attn_id = tokenizer.vocab[ '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]'] tgt_type_id = model.sent_emb.weight.shape[0] - 1 trans_func = convert_example( tokenizer=tokenizer, attn_id=attn_id, tgt_type_id=tgt_type_id, max_encode_len=args.max_encode_len, max_decode_len=args.max_decode_len, noise_prob=args.noise_prob, use_random_noice=args.use_random_noice) train_dataset = train_dataset.map(trans_func) train_batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_pids Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # src_tids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_pids Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # tgt_tids Pad(axis=0, pad_val=tokenizer.pad_token_id), # attn_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_labels ): after_padding(fn(samples)) train_data_loader = DataLoader( dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_dataset = dev_dataset.map(trans_func) dev_data_loader = DataLoader( dataset=dev_dataset, batch_size=args.batch_size, collate_fn=batchify_fn, num_workers=0, return_list=True) label_num = model.word_emb.weight.shape[0] train_model = StackModel(model) if paddle.distributed.get_world_size() > 1: # All 'forward' outputs derived from the module parameters using in DataParallel # must participate in the calculation of losses and subsequent gradient calculations. # So we use StackModel here to make the model only output loss in its 'forward' function. train_model = paddle.DataParallel(train_model) max_steps = len(train_data_loader) * args.num_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=nn.ClipGradByGlobalNorm(1.0), apply_decay_param_fun=lambda x: x in decay_params) rouge1 = Rouge1() rouge2 = Rouge2() global_step = 1 tic_train = time.time() for epoch in range(args.num_epochs): for step, batch in enumerate(train_data_loader, start=1): (src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels, _) = batch # import pdb; pdb.set_trace() if args.label_smooth > 0.: tgt_labels = nn.functional.label_smooth( nn.functional.one_hot(tgt_labels, label_num), epsilon=args.label_smooth) tgt_pos = paddle.nonzero(attn_ids == attn_id) loss = train_model(src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels, tgt_pos) if global_step % args.logging_steps == 0: if paddle.distributed.get_rank() == 0: logger.info( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train), lr_scheduler.get_lr())) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0 and paddle.distributed.get_rank( ) == 0: evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2, attn_id, tgt_type_id, args) output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) global_step += 1
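# The generation loop above smooths hard targets before the soft-label
# loss. A minimal sketch of that one-hot + label_smooth step (toy labels
# and class count; values are illustrative):
import paddle
import paddle.nn.functional as F

labels = paddle.to_tensor([1, 3])         # hard integer targets
num_classes = 5
one_hot = F.one_hot(labels, num_classes)  # shape [2, 5]
smoothed = F.label_smooth(one_hot, epsilon=0.1)
# Each row still sums to 1; epsilon of the mass is spread uniformly.
print(smoothed.numpy())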
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank()) args.model_type = args.model_type.lower() # For teacher teacher_model_class, tokenizer_class = MODEL_CLASSES[ args.teacher_model_type] tokenizer = tokenizer_class.from_pretrained( args.teacher_model_name_or_path) # For student model_class, _ = MODEL_CLASSES[args.model_type] if args.num_layers == 6: ppminilm = PPMiniLMModel(vocab_size=tokenizer.vocab_size, num_hidden_layers=6, hidden_act='relu', intermediate_size=3072, hidden_size=768) # layer: 6 elif args.num_layers == 4: ppminilm = PPMiniLMModel(vocab_size=tokenizer.vocab_size, num_hidden_layers=4, hidden_act='relu', intermediate_size=1024, hidden_size=256, num_attention_heads=16) # layer: 4 else: ppminilm = PPMiniLMModel(vocab_size=tokenizer.vocab_size, num_hidden_layers=2, hidden_act='relu', hidden_size=128, intermediate_size=512) # layer: 2 student = model_class(ppminilm) teacher = teacher_model_class.from_pretrained( args.teacher_model_name_or_path) pad_token_id = 0 if paddle.distributed.get_world_size() > 1: student = paddle.DataParallel(student, find_unused_parameters=True) teacher = paddle.DataParallel(teacher, find_unused_parameters=True) num_training_steps = args.max_steps warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in student.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=0.9, beta2=0.999, epsilon=args.adam_epsilon, parameters=student.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, grad_clip=paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)) if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) pool = ThreadPoolExecutor(1) teacher = to_distill(teacher, return_qkv=True, layer_index=args.teacher_layer_index) student = to_distill(student, return_qkv=True, layer_index=args.student_layer_index) global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) ] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 shared_file_list = {} if paddle.distributed.get_world_size() > num_files: remainder = paddle.distributed.get_world_size() % num_files data_file = files[ (f_start_id * paddle.distributed.get_world_size() + paddle.distributed.get_rank() + remainder * f_start_id) % num_files] else: data_file = files[ (f_start_id * paddle.distributed.get_world_size() + paddle.distributed.get_rank()) % num_files] previous_file = data_file train_data_loader, _ = create_pretraining_dataset( data_file, shared_file_list, args, worker_init, tokenizer) # TODO(guosheng): better way to process single file single_file = True if f_start_id + 1 == len(files) else False for f_id in range(f_start_id, len(files)): if not single_file and f_id == f_start_id: continue if paddle.distributed.get_world_size() > num_files: data_file = files[(f_id * paddle.distributed.get_world_size() + paddle.distributed.get_rank() + remainder * f_id) % 
num_files] else: data_file = files[(f_id * paddle.distributed.get_world_size() + paddle.distributed.get_rank()) % num_files] previous_file = data_file dataset_future = pool.submit(create_pretraining_dataset, data_file, shared_file_list, args, worker_init, tokenizer) kl_loss_fct = paddle.nn.KLDivLoss('sum') train_cost_avg = TimeCostAverage() total_samples = 0 batch_start = time.time() for step, batch in enumerate(train_data_loader): global_step += 1 input_ids = batch[0] attention_mask = paddle.unsqueeze( (input_ids == pad_token_id).astype( paddle.get_default_dtype()) * -1e4, axis=[1, 2]) with paddle.amp.auto_cast( args.use_amp, custom_white_list=["layer_norm", "gelu", "softmax"]): student(input_ids) with paddle.no_grad(): teacher(input_ids) # Q-Q relation q_t, q_s = teacher.outputs.q, student.outputs.q batch_size = q_t.shape[0] pad_seq_len = q_t.shape[2] loss_q = calc_multi_relation_loss(kl_loss_fct, q_s, q_t, attention_mask, args.num_relation_heads, args.alpha, args.beta) del q_t, q_s # K-K relation k_t, k_s = teacher.outputs.k, student.outputs.k loss_k = calc_multi_relation_loss(kl_loss_fct, k_s, k_t, attention_mask, args.num_relation_heads, args.alpha, args.beta) del k_t, k_s # V-V relation v_t, v_s = teacher.outputs.v, student.outputs.v loss_v = calc_multi_relation_loss(kl_loss_fct, v_s, v_t, attention_mask, args.num_relation_heads, args.alpha, args.beta) del v_t, v_s loss = loss_q + loss_k + loss_v loss /= args.num_relation_heads * pad_seq_len * batch_size if args.use_amp: scaler.scale(loss).backward() scaler.minimize(optimizer, loss) else: loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() total_samples += args.batch_size train_run_cost = time.time() - batch_start train_cost_avg.record(train_run_cost) if global_step % args.logging_steps == 0: logger.info( "global step: %d, epoch: %d, batch: %d, loss: %f, " "lr: %f, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" % (global_step, epoch, step, loss, optimizer.get_lr(), train_cost_avg.get_average(), total_samples / args.logging_steps, total_samples / (args.logging_steps * train_cost_avg.get_average()))) total_samples = 0 train_cost_avg.reset() if global_step % args.save_steps == 0 or global_step == num_training_steps: if paddle.distributed.get_rank() == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = student._layers if isinstance( student, paddle.DataParallel) else student model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) paddle.save( optimizer.state_dict(), os.path.join(output_dir, "model_state.pdopt")) if global_step >= args.max_steps: del train_data_loader return batch_start = time.time() del train_data_loader train_data_loader, data_file = dataset_future.result(timeout=None)
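# The distillation loop above turns pad positions into a large negative
# additive bias so softmax assigns them near-zero attention weight; the
# unsqueeze makes the mask broadcast over [batch, heads, query, key]. A
# toy sketch of that mask construction:
import paddle

pad_token_id = 0
input_ids = paddle.to_tensor([[5, 7, 0, 0]])
attention_mask = paddle.unsqueeze(
    (input_ids == pad_token_id).astype("float32") * -1e4,
    axis=[1, 2])  # shape [1, 1, 1, 4]
print(attention_mask.numpy())  # zeros at real tokens, -10000 at padding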
def train(): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() model = ErnieForGeneration.from_pretrained(args.model_name_or_path) if "ernie-tiny" in args.model_name_or_path: tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path) elif "ernie" in args.model_name_or_path: tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path) elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path: tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path) elif "electra" in args.model_name_or_path: tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path) else: tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) if args.init_checkpoint: model_state = paddle.load(args.init_checkpoint) model.set_state_dict(model_state) train_dataset, dev_dataset = Poetry.get_datasets(['train', 'dev']) attn_id = tokenizer.vocab[ '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]'] tgt_type_id = model.sent_emb.weight.shape[0] - 1 trans_func = convert_example(tokenizer=tokenizer, attn_id=attn_id, tgt_type_id=tgt_type_id, max_encode_len=args.max_encode_len, max_decode_len=args.max_decode_len, noise_prob=args.noise_prob, use_random_noice=args.use_random_noice) train_dataset = train_dataset.apply(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_pids Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_sids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_pids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_sids Pad(axis=0, pad_val=tokenizer.pad_token_id), # attn_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_labels ): after_padding(fn(samples)) train_data_loader = DataLoader(dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_dataset = dev_dataset.apply(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_dataset, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_dataset, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) label_num = model.word_emb.weight.shape[0] if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) max_steps = len(train_data_loader) * args.num_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps, args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=nn.ClipGradByGlobalNorm(1.0), apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) rouge1 = Rouge1() rouge2 = Rouge2() global_step = 1 tic_train = time.time() for epoch in range(args.num_epochs): for step, batch in enumerate(train_data_loader, start=1): (src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels, _) = batch # import pdb; pdb.set_trace() _, __, info = model(src_ids, sent_ids=src_sids, pos_ids=src_pids, attn_bias=mask_src_2_src, encode_only=True) cached_k, cached_v = info['caches'] _, 
__, info = model(tgt_ids, sent_ids=tgt_sids, pos_ids=tgt_pids, attn_bias=mask_tgt_2_srctgt, past_cache=(cached_k, cached_v), encode_only=True) cached_k2, cached_v2 = info['caches'] past_cache_k = [ paddle.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2) ] past_cache_v = [ paddle.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2) ] if args.label_smooth > 0.: tgt_labels = nn.functional.label_smooth( nn.functional.one_hot(tgt_labels, label_num), epsilon=args.label_smooth) loss, _, __ = model(attn_ids, sent_ids=tgt_sids, pos_ids=tgt_pids, attn_bias=mask_attn_2_srctgtattn, past_cache=(past_cache_k, past_cache_v), tgt_labels=tgt_labels, tgt_pos=paddle.nonzero(attn_ids == attn_id)) if global_step % args.logging_steps == 0: if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: logger.info( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train), lr_scheduler.get_lr())) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_gradients() if global_step % args.save_steps == 0 and ( (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0): evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2, attn_id, tgt_type_id, args) output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) global_step += 1
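# The two-pass encoding above concatenates per-layer key/value caches
# along the sequence axis so the final attention pass sees both source
# and target context. A toy sketch of that concatenation (single layer;
# shapes are illustrative, not the model's real dimensions):
import paddle

cached_k = [paddle.randn([2, 4, 8])]   # one layer: [batch, src_len, hidden]
cached_k2 = [paddle.randn([2, 3, 8])]  # one layer: [batch, tgt_len, hidden]
past_cache_k = [
    paddle.concat([k, k2], axis=1) for k, k2 in zip(cached_k, cached_k2)
]
print(past_cache_k[0].shape)  # [2, 7, 8]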
def do_train(args): paddle.enable_static() if not args.eager_run else None paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args.seed) worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank()) model_class, tokenizer_class = MODEL_CLASSES['ernie-health'] # Loads or initializes a model. pretrained_models = list( tokenizer_class.pretrained_init_configuration.keys()) if args.model_name_or_path in pretrained_models: tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) generator = ElectraGenerator( ElectraModel(**model_class.pretrained_init_configuration[ args.model_name_or_path + '-generator'])) discriminator = ErnieHealthDiscriminator( ElectraModel(**model_class.pretrained_init_configuration[ args.model_name_or_path + '-discriminator'])) model = model_class(generator, discriminator) args.init_from_ckpt = False else: if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt: # Load checkpoint tokenizer = tokenizer_class.from_pretrained( args.model_name_or_path) with open(os.path.join(args.model_name_or_path, 'run_states.json'), 'r') as f: config_dict = json.load(f) model_name = config_dict['model_name'] if model_name in pretrained_models: generator = ElectraGenerator( ElectraModel(**model_class.pretrained_init_configuration[ model_name + '-generator'])) discriminator = ErnieHealthDiscriminator( ElectraModel(**model_class.pretrained_init_configuration[ model_name + '-discriminator'])) model = model_class(generator, discriminator) model.set_state_dict( paddle.load( os.path.join(args.model_name_or_path, 'model_state.pdparams'))) else: raise ValueError( 'Initializing a model from a checkpoint requires model_name ' 'in the model config file. The supported model names ' 'are as follows: {}'.format( tokenizer_class.pretrained_init_configuration.keys())) else: raise ValueError( 'Initializing a model requires a model identifier or the ' 'directory storing the model. If using an identifier, the supported model ' 'identifiers are as follows: {}; if using a directory, ' 'make sure init_from_ckpt is set to True'.format( model_class.pretrained_init_configuration.keys())) criterion = ErnieHealthPretrainingCriterion( getattr(model.generator, ElectraGenerator.base_model_prefix).config['vocab_size'], model.gen_weight) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) # Loads dataset. tic_load_data = time.time() logger.info('start load data : %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))) train_dataset = MedicalCorpus(data_path=args.input_dir, tokenizer=tokenizer) logger.info('load data done, total : %s s' % (time.time() - tic_load_data)) # Reads data and generates mini-batches. data_collator = DataCollatorForErnieHealth( tokenizer=tokenizer, max_seq_length=args.max_seq_length, mlm_prob=args.mlm_prob) train_data_loader = create_dataloader( train_dataset, batch_size=args.batch_size, mode='train', use_gpu=True if args.device in 'gpu' else False, data_collator=data_collator) num_training_steps = args.max_steps if args.max_steps > 0 else ( len(train_data_loader) * args.num_epochs) args.num_epochs = (num_training_steps - 1) // len(train_data_loader) + 1 lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded.
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ['bias', 'norm']) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=clip, apply_decay_param_fun=lambda x: x in decay_params) if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=1024) logger.info('start train : %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))) trained_global_step = global_step = 0 t_loss = defaultdict(lambda: paddle.to_tensor([0.0])) log_loss = defaultdict(lambda: paddle.to_tensor([0.0])) loss_list = defaultdict(list) log_list = [] tic_train = time.time() if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt: optimizer.set_state_dict( paddle.load( os.path.join(args.model_name_or_path, 'model_state.pdopt'))) trained_global_step = global_step = config_dict['global_step'] if trained_global_step < num_training_steps: logger.info( '[ start train from checkpoint ] we have already trained %s steps, seeking next step : %s' % (trained_global_step, trained_global_step + 1)) else: logger.info( '[ start train from checkpoint ] we have already trained %s steps, but total training steps is %s, please check configuration !' % (trained_global_step, num_training_steps)) exit(0) if paddle.distributed.get_rank() == 0: writer = LogWriter(os.path.join(args.output_dir, 'loss_log')) for epoch in range(args.num_epochs): for step, batch in enumerate(train_data_loader): if trained_global_step > 0: trained_global_step -= 1 continue global_step += 1 masked_input_ids, input_ids, gen_labels = batch if args.use_amp: with paddle.amp.auto_cast(): gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model( input_ids=masked_input_ids, raw_input_ids=input_ids, generator_labels=gen_labels) loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion( gen_logits, gen_labels, logits_rtd, logits_mts, logits_csp, disc_labels, masks) scaled = scaler.scale(loss) scaled.backward() t_loss['loss'] += loss.detach() t_loss['gen'] += gen_loss.detach() t_loss['rtd'] += rtd_loss.detach() t_loss['mts'] += mts_loss.detach() t_loss['csp'] += csp_loss.detach() scaler.minimize(optimizer, scaled) else: gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model( input_ids=masked_input_ids, raw_input_ids=input_ids, generator_labels=gen_labels) loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion( gen_logits, gen_labels, logits_rtd, logits_mts, logits_csp, disc_labels, masks) loss.backward() t_loss['loss'] += loss.detach() t_loss['gen'] += gen_loss.detach() t_loss['rtd'] += rtd_loss.detach() t_loss['mts'] += mts_loss.detach() t_loss['csp'] += csp_loss.detach() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: local_loss = dict([ (k, (t_loss[k] - log_loss[k]) / args.logging_steps) for k in ['loss', 'gen', 'rtd', 'mts', 'csp'] ]) if paddle.distributed.get_world_size() > 1: for k in ['loss', 'gen', 'rtd', 'mts', 'csp']: paddle.distributed.all_gather(loss_list[k], local_loss[k]) if paddle.distributed.get_rank() == 0: tmp_loss = dict([ (k, float((paddle.stack(loss_list[k]).sum() / len(loss_list[k])).numpy())) for k in ['loss', 'gen', 'rtd', 'mts', 'csp'] ]) log_str = ( 'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, ' 'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, ' 'seq_contrastive: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it' ).format( global_step, num_training_steps, 
epoch, step, tmp_loss['loss'], tmp_loss['gen'], tmp_loss['rtd'], tmp_loss['mts'], tmp_loss['csp'], optimizer.get_lr(), (time.time() - tic_train) / args.logging_steps) logger.info(log_str) log_list.append(log_str) writer.add_scalar('generator_loss', tmp_loss['gen'], global_step) writer.add_scalar('rtd_loss', tmp_loss['rtd'] * 50, global_step) writer.add_scalar('mts_loss', tmp_loss['mts'] * 20, global_step) writer.add_scalar('csp_loss', tmp_loss['csp'], global_step) writer.add_scalar('total_loss', tmp_loss['loss'], global_step) writer.add_scalar('lr', optimizer.get_lr(), global_step) loss_list = defaultdict(list) else: local_loss = dict([(k, v.numpy()[0]) for k, v in local_loss.items()]) log_str = ( 'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, ' 'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, ' 'seq_contrastive_loss: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it' ).format(global_step, num_training_steps, epoch, step, local_loss['loss'], local_loss['gen'], local_loss['rtd'], local_loss['mts'], local_loss['csp'], optimizer.get_lr(), (time.time() - tic_train) / args.logging_steps) logger.info(log_str) log_list.append(log_str) loss_dict = { 'generator_loss': local_loss['gen'], 'rtd_loss': local_loss['rtd'] * 50, 'mts_loss': local_loss['mts'] * 20, 'csp_loss': local_loss['csp'] } for k, v in loss_dict.items(): writer.add_scalar('loss/%s' % k, v, global_step) writer.add_scalar('total_loss', local_loss['loss'], global_step) writer.add_scalar('lr', optimizer.get_lr(), global_step) log_loss = dict(t_loss) tic_train = time.time() if global_step % args.save_steps == 0: if paddle.distributed.get_rank() == 0: output_dir = os.path.join( args.output_dir, 'model_%d.pdparams' % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model config_to_save = copy.deepcopy( model_to_save.discriminator.electra.config) if 'self' in config_to_save: del config_to_save['self'] run_states = { 'model_name': model_name if args.init_from_ckpt else args.model_name_or_path, 'global_step': global_step, 'epoch': epoch, 'step': step, } with open(os.path.join(output_dir, 'model_config.json'), 'w') as f: json.dump(config_to_save, f) with open(os.path.join(output_dir, 'run_states.json'), 'w') as f: json.dump(run_states, f) paddle.save( model.state_dict(), os.path.join(output_dir, 'model_state.pdparams')) tokenizer.save_pretrained(output_dir) paddle.save(optimizer.state_dict(), os.path.join(output_dir, 'model_state.pdopt')) if len(log_list) > 0: with open(os.path.join(output_dir, 'train.log'), 'w') as f: for log in log_list: if len(log.strip()) > 0: f.write(log.strip() + '\n') if global_step >= num_training_steps: if paddle.distributed.get_rank() == 0: writer.close() return
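# The AMP branches shared by the pretraining loops above follow one fixed
# recipe: autocast the forward pass, scale the loss before backward, and
# let the scaler unscale gradients and drive the optimizer step. A minimal
# sketch (assumes a GPU device; model and sizes are illustrative):
import paddle

model = paddle.nn.Linear(4, 2)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4,
                                   parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
x = paddle.randn([8, 4])
with paddle.amp.auto_cast():
    loss = model(x).mean()
scaled = scaler.scale(loss)  # scale up so fp16 gradients do not underflow
scaled.backward()
scaler.minimize(optimizer, scaled)  # unscale, check inf/nan, then step
optimizer.clear_grad()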
def run(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) set_seed(args) if paddle.distributed.get_rank() == 0: if os.path.exists(args.model_name_or_path): print("init checkpoint from %s" % args.model_name_or_path) model = model_class.from_pretrained(args.model_name_or_path) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. contexts = [examples[i]['context'] for i in range(len(examples))] questions = [examples[i]['question'] for i in range(len(examples))] tokenized_examples = tokenizer( questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length) for i, tokenized_example in enumerate(tokenized_examples): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_example["input_ids"] cls_index = input_ids.index(tokenizer.cls_token_id) # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offsets = tokenized_example['offset_mapping'] # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_example['token_type_ids'] # One example can give several spans, this is the index of the example containing this span of text. sample_index = tokenized_example['overflow_to_sample'] answers = examples[sample_index]['answers'] answer_starts = examples[sample_index]['answer_starts'] # If no answers are given, set the cls_index as answer. if len(answer_starts) == 0: tokenized_examples[i]["start_positions"] = cls_index tokenized_examples[i]["end_positions"] = cls_index tokenized_examples[i]['answerable_label'] = 0 else: # Start/end character index of the answer in the text. start_char = answer_starts[0] end_char = start_char + len(answers[0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != 1: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 2 while sequence_ids[token_end_index] != 1: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples[i]["start_positions"] = cls_index tokenized_examples[i]["end_positions"] = cls_index tokenized_examples[i]['answerable_label'] = 0 else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). 
while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples[i][ "start_positions"] = token_start_index - 1 while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples[i]["end_positions"] = token_end_index + 1 tokenized_examples[i]['answerable_label'] = 1 return tokenized_examples if args.do_train: assert args.train_file != None, "--train_file should be set when training!" train_ds = DuReaderChecklist().read(args.train_file) train_ds.map(prepare_train_features, batched=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), "start_positions": Stack(dtype="int64"), "end_positions": Stack(dtype="int64"), "answerable_label": Stack(dtype="int64") }): fn(samples) train_data_loader = DataLoader( dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=train_batchify_fn, return_list=True) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs if paddle.distributed.get_rank() == 0: dev_count = paddle.fluid.core.get_cuda_device_count() print("Device count: %d" % dev_count) print("Num train examples: %d" % len(train_ds.data)) print("Max train steps: %d" % num_training_steps) lr_scheduler = LinearDecayWithWarmup( args.learning_rate, num_training_steps, args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) criterion = CrossEntropyLossForChecklist() global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, segment_ids, start_positions, end_positions, answerable_label = batch logits = model(input_ids=input_ids, token_type_ids=segment_ids) loss = criterion(logits, (start_positions, end_positions,answerable_label)) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_gradients() if global_step % args.save_steps == 0 or global_step == num_training_steps: if paddle.distributed.get_rank() == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print('Saving checkpoint to:', output_dir) def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. 
contexts = [examples[i]['context'] for i in range(len(examples))] questions = [examples[i]['question'] for i in range(len(examples))] tokenized_examples = tokenizer( questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length) # For validation, there is no need to compute start and end positions for i, tokenized_example in enumerate(tokenized_examples): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_example['token_type_ids'] # One example can give several spans, this is the index of the example containing this span of text. sample_index = tokenized_example['overflow_to_sample'] tokenized_examples[i]["example_id"] = examples[sample_index]['id'] # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples[i]["offset_mapping"] = [ (o if sequence_ids[k] == 1 else None) for k, o in enumerate(tokenized_example["offset_mapping"]) ] return tokenized_examples if args.do_pred: input_files = [] assert args.predict_file != None, "--predict_file should be set when predicting!" for input_pattern in args.predict_file: input_files.extend(glob.glob(input_pattern)) assert len(input_files) > 0, 'Can not find predict_file {}'.format(args.predict_file) for input_file in input_files: print('Run prediction on {}'.format(input_file)) prefix = os.path.basename(input_file) prefix = re.sub('.json', '', prefix) dev_ds = DuReaderChecklist().read(input_file) dev_ds.map(prepare_validation_features, batched=True) dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.batch_size, shuffle=False) dev_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id) }): fn(samples) dev_data_loader = DataLoader( dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=dev_batchify_fn, return_list=True) if paddle.distributed.get_rank() == 0: evaluate(model, dev_data_loader, args, prefix=prefix)
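# prepare_train_features above maps a character-level answer span onto
# token indices via the tokenizer's offset mapping, falling back to the
# CLS index when the answer lies outside the current window. A simplified
# pure-Python sketch with toy offsets (not the script's exact loop):
offsets = [(0, 4), (5, 9), (10, 14)]  # token index -> (start_char, end_char)
start_char, end_char = 5, 9           # gold answer span in the context
cls_index = 0

if not (offsets[0][0] <= start_char and offsets[-1][1] >= end_char):
    start_pos = end_pos = cls_index   # answer not inside this window
else:
    # First token whose span passes the answer start; last token whose
    # span begins before the answer end.
    start_pos = next(i for i, (s, e) in enumerate(offsets) if e > start_char)
    end_pos = max(i for i, (s, e) in enumerate(offsets) if s < end_char)
print(start_pos, end_pos)             # 1 1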
def do_train(args): paddle.enable_static() if not args.eager_run else None paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) args.task_name = args.task_name.lower() dataset_class, metric_class = TASK_CLASSES[args.task_name] args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] train_dataset, dev_dataset = dataset_class.get_datasets(["train", "dev"]) tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, label_list=train_dataset.get_labels(), max_seq_length=args.max_seq_length) train_dataset = train_dataset.apply(trans_func, lazy=True) # train_batch_sampler = SamplerHelper(train_dataset).shuffle().batch( # batch_size=args.batch_size).shard() train_batch_sampler = paddle.io.DistributedBatchSampler( # train_dataset, batch_size=args.batch_size, shuffle=True) train_dataset, batch_size=args.batch_size, shuffle=False) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment Stack(), # length Stack(dtype="int64" if train_dataset.get_labels() else "float32") # label ): [data for i, data in enumerate(fn(samples)) if i != 2] train_data_loader = DataLoader(dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_dataset = dev_dataset.apply(trans_func, lazy=True) # dev_batch_sampler = SamplerHelper(dev_dataset).batch( # batch_size=args.batch_size) dev_batch_sampler = paddle.io.BatchSampler(dev_dataset, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_dataset, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) # model = model_class.from_pretrained( # args.model_name_or_path,) num_classes=len(train_dataset.get_labels())) model = BertForPretraining( BertModel(**model_class.pretrained_init_configuration[ args.model_name_or_path])) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels( ) else paddle.nn.loss.MSELoss() metric = metric_class() ### TODO: use hapi # trainer = paddle.hapi.Model(model) # trainer.prepare(optimizer, loss_fct, paddle.metric.Accuracy()) # trainer.fit(train_data_loader, # dev_data_loader, # log_freq=args.logging_steps, # epochs=args.num_train_epochs, # save_dir=args.output_dir) # Debug path: load pickled NumPy parameters, save them as a Paddle # checkpoint, print sample forward outputs, and exit before training. if args.params_pd_path: model.eval() param_names = list(model.state_dict().keys()) import pickle with open(args.params_pd_path, "rb") as f: np_params = pickle.load(f) model.set_state_dict(dict(zip(param_names, np_params))) paddle.save(model.state_dict(), "%s.pdparams" % args.model_name_or_path) for data in train_data_loader(): print(model(*data[:-1])) exit(0) global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): input_ids, segment_ids, labels = batch logits = model(input_ids, segment_ids) loss = loss_fct(logits, labels) if global_step % args.logging_steps == 0: print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.save_steps == 0: evaluate(model, loss_fct, metric, dev_data_loader) if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: paddle.save( model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) global_step += 1
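# The debug path above zips a pickled list of NumPy arrays with the
# model's state-dict keys, relying on matching order. A toy sketch of the
# same injection (in-memory arrays stand in for the pickle file; the
# Linear model is illustrative):
import numpy as np
import paddle

model = paddle.nn.Linear(3, 2)
param_names = list(model.state_dict().keys())  # e.g. ['weight', 'bias']
np_params = [np.zeros([3, 2], "float32"), np.zeros([2], "float32")]
model.set_state_dict(
    dict(zip(param_names, (paddle.to_tensor(a) for a in np_params))))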
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) global final_res args.task_name = args.task_name.lower() metric_class = METRIC_CLASSES[args.task_name] model_class, tokenizer_class = XLNetForSequenceClassification, XLNetTokenizer train_ds = load_dataset('glue', args.task_name, splits="train") tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, label_list=train_ds.label_list, max_seq_length=args.max_seq_length) train_ds = train_ds.map(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id, pad_right=False ), # token_type Pad(axis=0, pad_val=0, pad_right=False), # attention_mask Stack(dtype="int64" if train_ds.label_list else "float32"), # label ): fn(samples) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) if args.task_name == "mnli": dev_ds_matched, dev_ds_mismatched = load_dataset( 'glue', args.task_name, splits=["dev_matched", "dev_mismatched"]) dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True) dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True) dev_batch_sampler_matched = paddle.io.BatchSampler( dev_ds_matched, batch_size=args.batch_size, shuffle=False) dev_data_loader_matched = DataLoader( dataset=dev_ds_matched, batch_sampler=dev_batch_sampler_matched, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_batch_sampler_mismatched = paddle.io.BatchSampler( dev_ds_mismatched, batch_size=args.batch_size, shuffle=False) dev_data_loader_mismatched = DataLoader( dataset=dev_ds_mismatched, batch_sampler=dev_batch_sampler_mismatched, collate_fn=batchify_fn, num_workers=0, return_list=True) else: dev_ds = load_dataset('glue', args.task_name, splits='dev') dev_ds = dev_ds.map(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) num_classes = 1 if train_ds.label_list is None else len( train_ds.label_list) model = XLNetForSequenceClassification.from_pretrained( args.model_name_or_path, num_classes=num_classes) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) if args.max_steps > 0: num_training_steps = args.max_steps num_train_epochs = ceil(num_training_steps / len(train_data_loader)) else: num_training_steps = len(train_data_loader) * args.num_train_epochs num_train_epochs = args.num_train_epochs warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "layer_norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=0.9, beta2=0.999, epsilon=args.adam_epsilon, parameters=model.parameters(), grad_clip=clip, weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss( ) if train_ds.label_list else paddle.nn.loss.MSELoss() metric = metric_class() global_step = 0 tic_train = time.time() model.train() for epoch in range(num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, token_type_ids, attention_mask, labels = batch logits = model(input_ids, token_type_ids, attention_mask)[0] loss = loss_fct(logits, labels) loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: print( "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s" % (global_step, num_training_steps, epoch, step, paddle.distributed.get_rank(), loss, optimizer.get_lr(), args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % args.save_steps == 0 or global_step == num_training_steps: tic_eval = time.time() if args.task_name == "mnli": print("matched ", end="") evaluate(model, loss_fct, metric, dev_data_loader_matched) final_res1 = "matched " + final_res print("mismatched ", end="") evaluate(model, loss_fct, metric, dev_data_loader_mismatched) final_res2 = "mismatched " + final_res final_res = final_res1 + "\r\n" + final_res2 print("eval done total : %s s" % (time.time() - tic_eval)) else: evaluate(model, loss_fct, metric, dev_data_loader) print("eval done total : %s s" % (time.time() - tic_eval)) if (not paddle.distributed.get_world_size() > 1 ) or paddle.distributed.get_rank() == 0: output_dir = os.path.join( args.output_dir, "%s_ft_model_%d" % (args.task_name, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) if global_step == num_training_steps: print(final_res) exit(0) tic_train += time.time() - tic_eval
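# Sanity sketch of the LinearDecayWithWarmup schedule these scripts share: the
# rate climbs linearly from 0 across the warmup span, then decays linearly to
# 0 at the final step. The numbers here are illustrative only.
from paddlenlp.transformers import LinearDecayWithWarmup

sched = LinearDecayWithWarmup(5e-5, total_steps=100, warmup=10)
for _ in range(20):
    sched.step()
print(sched.get_lr())  # past warmup, now on the linear decay segment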
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds, dev_ds, test_ds = load_dataset(args.dataset,
                                             splits=["train", "dev", "test"])

    model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
        'ernie-1.0', num_classes=len(train_ds.label_list))
    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length,
                         is_pair=args.dataset == "xnli_cn")
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
    global_step = 0
    tic_train = time.time()
    total_train_time = 0
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
                logits = model(input_ids, token_type_ids)
                loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()
            if args.use_amp:
                # scaler.minimize unscales the gradients and performs the
                # parameter update, skipping the step on inf/nan gradients.
                scaler.scale(loss).backward()
                scaler.minimize(optimizer, loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            global_step += 1
            if global_step % args.logging_steps == 0 and rank == 0:
                time_diff = time.time() - tic_train
                total_train_time += time_diff
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accuracy: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       args.logging_steps / time_diff))
                tic_train = time.time()
            if global_step % args.valid_steps == 0 and rank == 0:
                evaluate(model, criterion, metric, dev_data_loader)
                tic_train = time.time()
            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
                tic_train = time.time()

    # Report throughput only on the main process, and only if at least one
    # logging interval was accumulated, to avoid dividing by zero.
    if rank == 0 and total_train_time > 0:
        print("Speed: %.2f steps/s" % (global_step / total_train_time))
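# Minimal sketch of the dynamic loss scaling performed by GradScaler above
# (standalone toy; run on a GPU device, since auto_cast targets float16
# kernels there):
import paddle

net = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.AdamW(learning_rate=1e-3, parameters=net.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
x = paddle.randn([8, 4])
with paddle.amp.auto_cast():
    loss = net(x).mean()
scaler.scale(loss).backward()  # gradients flow from the scaled loss
scaler.minimize(opt, loss)     # unscale, step (skipped on inf/nan), update scale
opt.clear_grad()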
def do_train(args): set_seed(args) tokenizer_class, eval_name, test_name, = DATASET_INFO[args.dataset] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) train_ds, eval_ds, test_ds = load_dataset( args.dataset, splits=["train", eval_name, test_name]) num_classes = len(train_ds.label_list) no_entity_id = num_classes - 1 paddle.set_device(args.device) trainer_num = paddle.distributed.get_world_size() if trainer_num > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() if rank == 0: if os.path.exists(args.model_name_or_path): logger.info("init checkpoint from %s" % args.model_name_or_path) model = ErnieDocForTokenClassification.from_pretrained( args.model_name_or_path, num_classes=num_classes) model_config = model.ernie_doc.config if trainer_num > 1: model = paddle.DataParallel(model) train_ds_iter = SequenceLabelingIterator( train_ds, args.batch_size, tokenizer, trainer_num, trainer_id=rank, memory_len=model_config["memory_len"], max_seq_length=args.max_seq_length, random_seed=args.seed, no_entity_id=no_entity_id) eval_ds_iter = SequenceLabelingIterator( eval_ds, args.batch_size, tokenizer, trainer_num, trainer_id=rank, memory_len=model_config["memory_len"], max_seq_length=args.max_seq_length, mode="eval", no_entity_id=no_entity_id) test_ds_iter = SequenceLabelingIterator( test_ds, args.batch_size, tokenizer, trainer_num, trainer_id=rank, memory_len=model_config["memory_len"], max_seq_length=args.max_seq_length, mode="test", no_entity_id=no_entity_id) train_dataloader = paddle.io.DataLoader.from_generator(capacity=70, return_list=True) train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device()) eval_dataloader = paddle.io.DataLoader.from_generator(capacity=70, return_list=True) eval_dataloader.set_batch_generator(eval_ds_iter, paddle.get_device()) test_dataloader = paddle.io.DataLoader.from_generator(capacity=70, return_list=True) test_dataloader.set_batch_generator(test_ds_iter, paddle.get_device()) num_training_examples = train_ds_iter.get_num_examples() num_training_steps = args.epochs * num_training_examples // args.batch_size // trainer_num logger.info("Device count: %d, trainer_id: %d" % (trainer_num, rank)) logger.info("Num train examples: %d" % num_training_examples) logger.info("Max train steps: %d" % num_training_steps) logger.info("Num warmup steps: %d" % int(num_training_steps * args.warmup_proportion)) lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Map internal parameter names back to structured names, which AdamWDL
    # needs to infer each parameter's layer depth.
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n

    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        n_layers=model_config["num_hidden_layers"],
                        layerwise_decay=args.layerwise_decay,
                        name_dict=name_dict)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = ChunkEvaluator(label_list=train_ds.label_list)

    global_steps = 0
    create_memory = partial(init_memory, args.batch_size, args.memory_length,
                            model_config["hidden_size"],
                            model_config["num_hidden_layers"])
    # Initialize the recurrence memories.
    memories = create_memory()
    tic_train = time.time()
    best_f1 = 0
    for epoch in range(args.epochs):
        train_ds_iter.shuffle_sample()
        train_dataloader.set_batch_generator(train_ds_iter,
                                             paddle.get_device())
        for step, batch in enumerate(train_dataloader, start=1):
            global_steps += 1
            input_ids, position_ids, token_type_ids, attn_mask, labels, \
                lengths, qids, gather_idx, need_cal_loss = batch
            logits, memories = model(input_ids, memories, token_type_ids,
                                     position_ids, attn_mask)
            logits, labels = list(
                map(lambda x: paddle.gather(x, gather_idx), [logits, labels]))
            loss = criterion(logits, labels) * need_cal_loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_steps % args.logging_steps == 0:
                logger.info(
                    "train: global step %d, epoch: %d, loss: %f, lr: %f, speed: %.2f step/s"
                    % (global_steps, epoch, loss, lr_scheduler.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_steps % args.save_steps == 0:
                # Evaluate
                logger.info("Eval:")
                precision, recall, f1_score = evaluate(
                    model, metric, eval_dataloader, create_memory())
                # Save
                if rank == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % (global_steps))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    if f1_score > best_f1:
                        logger.info("Save best model......")
                        best_f1 = f1_score
                        best_model_dir = os.path.join(args.output_dir,
                                                      "best_model")
                        if not os.path.exists(best_model_dir):
                            os.makedirs(best_model_dir)
                        model_to_save.save_pretrained(best_model_dir)
                        tokenizer.save_pretrained(best_model_dir)
            if args.max_steps > 0 and global_steps >= args.max_steps:
                return
    logger.info("Final test result:")
    precision, recall, f1_score = evaluate(model, metric, test_dataloader,
                                           create_memory())
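# Conceptual sketch of the layer-wise learning-rate decay that AdamWDL applies
# above: parameters in lower layers train with geometrically smaller learning
# rates. The exact depth bookkeeping lives inside AdamWDL (via `name_dict`);
# the exponent below illustrates the scaling rule, not its private
# implementation.
base_lr = 1e-4
layerwise_decay = 0.8  # e.g. args.layerwise_decay
n_layers = 12
for depth in range(1, n_layers + 1):
    scale = layerwise_decay**(n_layers - depth)
    print("layer %2d -> lr %.2e" % (depth, base_lr * scale))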
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_text_pair,
                            data_path=args.train_set_file,
                            lazy=False)

    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-1.0')
    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # title_segment
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    model = SemanticIndexBatchNeg(pretrained_model,
                                  margin=args.margin,
                                  scale=args.scale,
                                  output_emb_size=args.output_emb_size)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
        print("warmup from: {}".format(args.init_from_ckpt))

    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch

            loss = model(query_input_ids=query_input_ids,
                         title_input_ids=title_input_ids,
                         query_token_type_ids=query_token_type_ids,
                         title_token_type_ids=title_token_type_ids)

            global_step += 1
            if global_step % 50 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       50 / (time.time() - tic_train)))  # matches the 50-step logging interval
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir,
                                               'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)
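# Rough standalone sketch of the in-batch-negative objective that
# SemanticIndexBatchNeg optimizes above: each query's positive is its own
# title, every other title in the batch acts as a negative, and the scaled
# similarity matrix is trained with cross entropy against the diagonal. The
# real model additionally applies the `margin` to the positive logits; the
# embeddings here are random stand-ins.
import paddle
import paddle.nn.functional as F

query_emb = F.normalize(paddle.randn([4, 8]))
title_emb = F.normalize(paddle.randn([4, 8]))
sims = paddle.matmul(query_emb, title_emb, transpose_y=True) * 20.0  # e.g. args.scale
labels = paddle.arange(sims.shape[0])  # positives sit on the diagonal
loss = F.cross_entropy(sims, labels)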
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('glue', args.task_name, splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    if args.task_name == "mnli":
        dev_ds_matched, dev_ds_mismatched = load_dataset(
            'glue', args.task_name, splits=["dev_matched", "dev_mismatched"])
        dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True)
        dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_ds_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_ds_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_ds_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_ds_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_ds = load_dataset('glue', args.task_name, splits='dev')
        dev_ds = dev_ds.map(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(
        train_ds.label_list)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded. Hoisted out of the
    # optimizer call so the list is built once, not once per parameter.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss,
                       optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(model, loss_fct, metric, dev_data_loader_matched)
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_mismatched)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                else:
                    evaluate(model, loss_fct, metric, dev_data_loader)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    # save_pretrained writes a directory, so the path must not
                    # carry a ".pdparams" suffix.
                    output_dir = os.path.join(
                        args.output_dir,
                        "%s_ft_model_%d" % (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
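# Hedged usage note: a checkpoint directory written by save_pretrained above
# can be reloaded with from_pretrained for later evaluation or inference. The
# directory name below is hypothetical.
import os

ckpt_dir = os.path.join("output", "sst-2_ft_model_500")  # hypothetical path
if os.path.isdir(ckpt_dir):
    model = model_class.from_pretrained(ckpt_dir)
    tokenizer = tokenizer_class.from_pretrained(ckpt_dir)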
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds, dev_ds = load_dataset("lcqmc", splits=["train", "dev"])

    # If you want to use the ernie-1.0 model, please uncomment the following code:
    # tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
    # pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained("ernie-1.0")

    pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained(
        'ernie-gram-zh')
    tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(
        'ernie-gram-zh')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    model = PointwiseMatching(pretrained_model)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids=input_ids,
                           token_type_ids=token_type_ids)
            loss = criterion(logits, labels)
            correct = metric.compute(logits, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step > args.max_step:
                print("Reached max_step (%d); stopping training." %
                      args.max_step)
                return
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.eval_step == 0 and rank == 0:
                evaluate(model, criterion, metric, dev_data_loader)
            if global_step % args.save_step == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir,
                                               'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)

    # Save the final model once training completes, only on the main process
    # so that multiple workers do not write the same files concurrently.
    if rank == 0:
        save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        save_param_path = os.path.join(save_dir, 'model_state.pdparams')
        paddle.save(model.state_dict(), save_param_path)
        tokenizer.save_pretrained(save_dir)
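# Standalone sketch of what the Tuple(Pad, Pad, Stack) collate functions used
# throughout these scripts produce; the samples are invented three-field
# (input_ids, token_type_ids, label) tuples.
from paddlenlp.data import Pad, Stack, Tuple

collate = Tuple(
    Pad(axis=0, pad_val=0),  # input_ids
    Pad(axis=0, pad_val=0),  # token_type_ids
    Stack(dtype="int64"))    # labels
input_ids, token_type_ids, labels = collate([([1, 5, 7], [0, 0, 0], 1),
                                             ([1, 9], [0, 0], 0)])
print(input_ids)  # the second row is padded with 0 to length 3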
def do_train(): paddle.set_device(args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args.seed) # Load train dataset. file_name = 'train.csv' train_ds = load_dataset(read_custom_data, filename=os.path.join(args.data_path, file_name), is_test=False, lazy=False) pretrained_model = ppnlp.transformers.BertModel.from_pretrained( "bert-base-uncased") tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained( 'bert-base-uncased') trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment Stack(dtype='float32') # label ): [data for data in fn(samples)] train_data_loader = create_dataloader(train_ds, mode='train', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) model = MultiLabelClassifier(pretrained_model, num_labels=len(train_ds.data[0]["label"])) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) metric = MultiLabelReport() criterion = paddle.nn.BCEWithLogitsLoss() global_step = 0 tic_train = time.time() for epoch in range(1, args.epochs + 1): for step, batch in enumerate(train_data_loader, start=1): input_ids, token_type_ids, labels = batch logits = model(input_ids, token_type_ids) loss = criterion(logits, labels) probs = F.sigmoid(logits) metric.update(probs, labels) auc, f1_score = metric.accumulate() global_step += 1 if global_step % 10 == 0 and rank == 0: print( "global step %d, epoch: %d, batch: %d, loss: %.5f, auc: %.5f, f1 score: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, auc, f1_score, 10 / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % 100 == 0 and rank == 0: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) save_param_path = os.path.join(save_dir, "model_state.pdparams") paddle.save(model.state_dict(), save_param_path) tokenizer.save_pretrained(save_dir)
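# Toy sketch of the multi-label setup above: BCEWithLogitsLoss treats every
# label column as an independent binary problem, so labels form a float matrix
# rather than a vector of class indices (the numbers are invented).
import paddle
import paddle.nn.functional as F

logits = paddle.to_tensor([[2.0, -1.0, 0.5]])  # one sample, three labels
labels = paddle.to_tensor([[1.0, 0.0, 1.0]])
loss = paddle.nn.BCEWithLogitsLoss()(logits, labels)
probs = F.sigmoid(logits)  # per-label probabilities, as fed to the metric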
def run(args): paddle.set_device(args.device) world_size = dist.get_world_size() if world_size > 1: dist.init_parallel_env() set_seed(args.seed) model = UNIMOLMHeadModel.from_pretrained(args.model_name_or_path) tokenizer = UNIMOTokenizer.from_pretrained(args.model_name_or_path) if world_size > 1: model = paddle.DataParallel(model) train_ds = load_dataset(args.dataset_name, splits='train', data_files=args.train_file) dev_ds = load_dataset(args.dataset_name, splits='dev', data_files=args.predict_file) train_ds, train_data_loader = create_data_loader(train_ds, tokenizer, args, 'train') dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args, 'test') if args.do_train: num_training_steps = args.epochs * len(train_data_loader) lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_propotion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = AdamW(learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, beta1=args.beta1, beta2=args.beta2, epsilon=args.epsilon, apply_decay_param_fun=lambda x: x in decay_params, grad_clip=paddle.nn.ClipGradByGlobalNorm( args.max_grad_norm)) step = 0 total_time = 0.0 for epoch in range(args.epochs): print('\nEpoch %d/%d' % (epoch + 1, args.epochs)) batch_start_time = time.time() for inputs in train_data_loader: step += 1 labels = inputs[-1] logits = model(*inputs[:-1]) labels = paddle.nn.functional.one_hot( labels, num_classes=logits.shape[-1]) labels = paddle.nn.functional.label_smooth(labels) loss = F.cross_entropy(logits, labels, soft_label=True) loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() total_time += (time.time() - batch_start_time) if step % args.logging_steps == 0: ppl = paddle.exp(loss) print( 'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step' % (step, loss, ppl, optimizer.get_lr(), total_time / args.logging_steps)) total_time = 0.0 if step % args.save_steps == 0 or step >= num_training_steps: if dist.get_rank() == 0: save_ckpt(model, tokenizer, args.save_dir, step) print('Saved step {} model.\n'.format(step)) if args.do_predict: model_eval = model._layers if isinstance( model, paddle.DataParallel) else model evaluation(model_eval, dev_data_loader, args, tokenizer) batch_start_time = time.time() print('\nTraining completed.') elif args.do_predict: model_eval = model._layers if isinstance( model, paddle.DataParallel) else model evaluation(model_eval, dev_data_loader, args, tokenizer)
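# Small sketch of the smoothed-target loss in the UNIMO loop above: labels are
# expanded to one-hot vectors, smoothed, and scored with soft-label cross
# entropy; perplexity is the exponential of that loss (toy sizes below).
import paddle
import paddle.nn.functional as F

logits = paddle.randn([2, 5])  # [batch, vocab]
labels = paddle.to_tensor([3, 1])
soft_labels = F.label_smooth(F.one_hot(labels, num_classes=5))
loss = F.cross_entropy(logits, soft_labels, soft_label=True)
ppl = paddle.exp(loss)  # perplexity, as logged in the training loop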
loss = criterion(logits, labels) probs = F.softmax(logits, axis=1) correct = metric.compute(probs, labels) metric.update(correct) acc = metric.accumulate() global_step += 1 if global_step % 10 == 0: print( "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, acc, 10 / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % 100 == 0: dev_acc = evaluate(model, criterion, metric, dev_data_loader) test_acc = evaluate(model, criterion, metric, test_data_loader) if test_acc >= 0.959: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) model.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir)
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) global final_res args.task_name = args.task_name.lower() metric_class = METRIC_CLASSES[args.task_name] model_class, tokenizer_class = XLNetForSequenceClassification, XLNetTokenizer tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) if args.task_name == "mnli": train_data_loader, dev_data_loader_matched, dev_data_loader_mismatched, train_ds, dev_ds_matched, dev_ds_mismatched = create_data_loader( args, tokenizer) else: train_data_loader, dev_data_loader, train_ds, dev_ds = create_data_loader( args, tokenizer) num_classes = 1 if train_ds.label_list is None else len( train_ds.label_list) model = XLNetForSequenceClassification.from_pretrained( args.model_name_or_path, num_classes=num_classes) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) if args.max_steps > 0: num_training_steps = args.max_steps num_train_epochs = ceil(num_training_steps / len(train_data_loader)) else: num_training_steps = len(train_data_loader) * args.num_train_epochs num_train_epochs = args.num_train_epochs warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "layer_norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=0.9, beta2=0.999, epsilon=args.adam_epsilon, parameters=model.parameters(), grad_clip=clip, weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) loss_fct = paddle.nn.loss.CrossEntropyLoss( ) if train_ds.label_list else paddle.nn.loss.MSELoss() metric = metric_class() global_step = 0 model.train() train_reader_cost = 0.0 train_run_cost = 0.0 reader_start = time.time() for epoch in range(num_train_epochs): for step, batch in enumerate(train_data_loader): train_reader_cost += time.time() - reader_start train_start = time.time() global_step += 1 input_ids, token_type_ids, attention_mask, labels = batch logits = model(input_ids, token_type_ids, attention_mask) loss = loss_fct(logits, labels) loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() train_run_cost += time.time() - train_start # Profile for model benchmark profiler.add_profiler_step(args.profiler_options) if global_step % args.logging_steps == 0: speed = args.logging_steps / (train_reader_cost + train_run_cost) avg_reader_cost = train_reader_cost / args.logging_steps print( "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s, avg_reader_cost: %.4f sec, avg_batch_cost: %.4f sec, avg_samples: %d, avg_ips: %.4f sequences/sec" % ( global_step, num_training_steps, epoch, step, paddle.distributed.get_rank(), loss, optimizer.get_lr(), speed, avg_reader_cost, 1.0 / speed, args.batch_size, speed * args.batch_size, )) train_reader_cost = 0.0 train_run_cost = 0.0 if global_step % args.save_steps == 0 or global_step == num_training_steps: tic_eval = time.time() if args.task_name == "mnli": print("matched ", end="") evaluate(model, loss_fct, metric, dev_data_loader_matched) final_res1 = "matched " + final_res print("mismatched ", end="") evaluate(model, loss_fct, metric, 
dev_data_loader_mismatched) final_res2 = "mismatched " + final_res final_res = final_res1 + "\r\n" + final_res2 print("eval done total : %s s" % (time.time() - tic_eval)) else: evaluate(model, loss_fct, metric, dev_data_loader) print("eval done total : %s s" % (time.time() - tic_eval)) if (not paddle.distributed.get_world_size() > 1 ) or paddle.distributed.get_rank() == 0: output_dir = os.path.join( args.output_dir, "%s_ft_model_%d" % (args.task_name, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) if global_step == num_training_steps: print(final_res) exit(0) reader_start = time.time()