def test_forward(self):
    config = copy.deepcopy(self.config)
    del config['batch_size']
    del config['seq_len']
    electra = ElectraModel(**config)
    model = self.TEST_MODEL_CLASS(electra)
    input_ids = paddle.to_tensor(self.input_ids, dtype="int64")
    self.output = model(input_ids)
    self.check_testcase()
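
# Illustrative only: the kind of `self.config` dict that test_forward above
# deep-copies. The keys below are an assumption; in a real test they must
# match ElectraModel.__init__ exactly, plus the extra 'batch_size'/'seq_len'
# entries that the test deletes before instantiating the model.
_EXAMPLE_TEST_CONFIG = {
    'vocab_size': 30522,
    'embedding_size': 128,
    'hidden_size': 256,
    'num_hidden_layers': 4,
    'num_attention_heads': 4,
    'intermediate_size': 1024,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'attention_probs_dropout_prob': 0.1,
    'max_position_embeddings': 512,
    'type_vocab_size': 2,
    'initializer_range': 0.02,
    'pad_token_id': 0,
    # Used by the test harness only; removed before ElectraModel(**config).
    'batch_size': 2,
    'seq_len': 16,
}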
def do_train(args):
    if not args.eager_run:
        paddle.enable_static()
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    model_class, tokenizer_class = MODEL_CLASSES['ernie-health']

    # Loads or initializes a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-generator']))
        discriminator = ErnieHealthDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-discriminator']))
        model = model_class(generator, discriminator)
        args.init_from_ckpt = False
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint.
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            with open(os.path.join(args.model_name_or_path, 'run_states.json'),
                      'r') as f:
                config_dict = json.load(f)
                model_name = config_dict['model_name']
            if model_name in pretrained_models:
                generator = ElectraGenerator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-generator']))
                discriminator = ErnieHealthDiscriminator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-discriminator']))
                model = model_class(generator, discriminator)
                model.set_state_dict(
                    paddle.load(
                        os.path.join(args.model_name_or_path,
                                     'model_state.pdparams')))
            else:
                raise ValueError(
                    'Initializing a model from a checkpoint requires a valid '
                    'model_name in run_states.json. The supported model names '
                    'are as follows: {}'.format(
                        tokenizer_class.pretrained_init_configuration.keys()))
        else:
            raise ValueError(
                'Initializing a model requires a pretrained model identifier '
                'or the directory of a saved model. The supported model '
                'identifiers are as follows: {}. If a directory is used, '
                'make sure init_from_ckpt is set to True.'.format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ErnieHealthPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config['vocab_size'],
        model.gen_weight)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    logger.info('start load data : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))

    train_dataset = MedicalCorpus(data_path=args.input_dir,
                                  tokenizer=tokenizer)
    logger.info('load data done, total : %s s' % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForErnieHealth(
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        mlm_prob=args.mlm_prob)

    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.batch_size,
        mode='train',
        use_gpu='gpu' in args.device,
        data_collator=data_collator)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_epochs)
    args.num_epochs = (num_training_steps - 1) // len(train_data_loader) + 1

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ['bias', 'norm'])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    logger.info('start train : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))

    trained_global_step = global_step = 0
    t_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    log_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    loss_list = defaultdict(list)
    log_list = []
    tic_train = time.time()

    if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
        optimizer.set_state_dict(
            paddle.load(
                os.path.join(args.model_name_or_path, 'model_state.pdopt')))
        trained_global_step = global_step = config_dict['global_step']
        if trained_global_step < num_training_steps:
            logger.info(
                '[ start train from checkpoint ] we have already trained %s steps, seeking next step : %s'
                % (trained_global_step, trained_global_step + 1))
        else:
            logger.info(
                '[ start train from checkpoint ] we have already trained %s steps, but total training steps is %s, please check configuration !'
                % (trained_global_step, num_training_steps))
            exit(0)

    if paddle.distributed.get_rank() == 0:
        writer = LogWriter(os.path.join(args.output_dir, 'loss_log'))

    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader):
            # Skip already-trained steps when resuming from a checkpoint.
            if trained_global_step > 0:
                trained_global_step -= 1
                continue
            global_step += 1
            masked_input_ids, input_ids, gen_labels = batch

            if args.use_amp:
                with paddle.amp.auto_cast():
                    gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                        input_ids=masked_input_ids,
                        raw_input_ids=input_ids,
                        generator_labels=gen_labels)
                    loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                        gen_logits, gen_labels, logits_rtd, logits_mts,
                        logits_csp, disc_labels, masks)
                scaled = scaler.scale(loss)
                scaled.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                scaler.minimize(optimizer, scaled)
            else:
                gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                    input_ids=masked_input_ids,
                    raw_input_ids=input_ids,
                    generator_labels=gen_labels)
                loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                    gen_logits, gen_labels, logits_rtd, logits_mts, logits_csp,
                    disc_labels, masks)
                loss.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                optimizer.step()

            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                local_loss = dict([
                    (k, (t_loss[k] - log_loss[k]) / args.logging_steps)
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                ])
                if paddle.distributed.get_world_size() > 1:
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']:
                        paddle.distributed.all_gather(loss_list[k],
                                                      local_loss[k])
                    if paddle.distributed.get_rank() == 0:
                        tmp_loss = dict([
                            (k, float((paddle.stack(loss_list[k]).sum() /
                                       len(loss_list[k])).numpy()))
                            for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                        ])
                        log_str = (
                            'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                            'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                            'seq_contrastive: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                        ).format(global_step, num_training_steps, epoch, step,
                                 tmp_loss['loss'], tmp_loss['gen'],
                                 tmp_loss['rtd'], tmp_loss['mts'],
                                 tmp_loss['csp'], optimizer.get_lr(),
                                 (time.time() - tic_train) /
                                 args.logging_steps)
                        logger.info(log_str)
                        log_list.append(log_str)
                        writer.add_scalar('generator_loss', tmp_loss['gen'],
                                          global_step)
                        writer.add_scalar('rtd_loss', tmp_loss['rtd'] * 50,
                                          global_step)
                        writer.add_scalar('mts_loss', tmp_loss['mts'] * 20,
                                          global_step)
                        writer.add_scalar('csp_loss', tmp_loss['csp'],
                                          global_step)
                        writer.add_scalar('total_loss', tmp_loss['loss'],
                                          global_step)
                        writer.add_scalar('lr', optimizer.get_lr(),
                                          global_step)
                    loss_list = defaultdict(list)
                else:
                    local_loss = dict([(k, v.numpy()[0])
                                       for k, v in local_loss.items()])
                    log_str = (
                        'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                        'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                        'seq_contrastive_loss: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                    ).format(global_step, num_training_steps, epoch, step,
                             local_loss['loss'], local_loss['gen'],
                             local_loss['rtd'], local_loss['mts'],
                             local_loss['csp'], optimizer.get_lr(),
                             (time.time() - tic_train) / args.logging_steps)
                    logger.info(log_str)
                    log_list.append(log_str)
                    loss_dict = {
                        'generator_loss': local_loss['gen'],
                        'rtd_loss': local_loss['rtd'] * 50,
                        'mts_loss': local_loss['mts'] * 20,
                        'csp_loss': local_loss['csp']
                    }
                    for k, v in loss_dict.items():
                        writer.add_scalar('loss/%s' % k, v, global_step)
                    writer.add_scalar('total_loss', local_loss['loss'],
                                      global_step)
                    writer.add_scalar('lr', optimizer.get_lr(), global_step)
                log_loss = dict(t_loss)
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, 'model_%d.pdparams' % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    config_to_save = copy.deepcopy(
                        model_to_save.discriminator.electra.config)
                    if 'self' in config_to_save:
                        del config_to_save['self']
                    run_states = {
                        'model_name': model_name
                        if args.init_from_ckpt else args.model_name_or_path,
                        'global_step': global_step,
                        'epoch': epoch,
                        'step': step,
                    }
                    with open(os.path.join(output_dir, 'model_config.json'),
                              'w') as f:
                        json.dump(config_to_save, f)
                    with open(os.path.join(output_dir, 'run_states.json'),
                              'w') as f:
                        json.dump(run_states, f)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(output_dir, 'model_state.pdparams'))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir, 'model_state.pdopt'))
                    if len(log_list) > 0:
                        with open(os.path.join(output_dir, 'train.log'),
                                  'w') as f:
                            for log in log_list:
                                if len(log.strip()) > 0:
                                    f.write(log.strip() + '\n')

            if global_step >= num_training_steps:
                if paddle.distributed.get_rank() == 0:
                    writer.close()
                return
def do_train(args):
    if not args.eager_run:
        paddle.enable_static()
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    # worker_init reseeds DataLoader workers (see the sketch after this function).
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    # Loads or initializes a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-generator"]))
        discriminator = ElectraDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-discriminator"]))
        model = model_class(generator, discriminator)
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint.
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            for file_id, file_name in model_class.resource_files_names.items():
                full_file_name = os.path.join(args.model_name_or_path,
                                              file_name)
                # TODO: load the model checkpoint file from full_file_name.
        else:
            raise ValueError(
                "Initializing a model requires a pretrained model identifier "
                "or the path to a checkpoint directory. The supported model "
                "identifiers are as follows: {}".format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ElectraPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config["vocab_size"],
        model.gen_weight, model.disc_weight)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    print("start load data : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    train_dataset = BookCorpus(data_path=args.input_dir,
                               tokenizer=tokenizer,
                               max_seq_length=args.max_seq_length,
                               mode='train')
    print("load data done, total : %s s" % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForElectra(tokenizer=tokenizer,
                                           max_seq_length=args.max_seq_length,
                                           mlm=True,
                                           mlm_probability=args.mask_prob)
    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.train_batch_size,
        mode='train',
        use_gpu=bool(args.n_gpu),
        data_collator=data_collator)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    print("start train : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, raw_input_ids, gen_labels = batch
            gen_logits, disc_logits, disc_labels = model(
                input_ids=input_ids,
                raw_input_ids=raw_input_ids,
                gen_labels=gen_labels)
            loss = criterion(gen_logits, disc_logits, gen_labels, disc_labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            #print("backward done, total %s s" % (time.time() - tic_train))
            #tic_train = time.time()

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "model_%d.pdparams" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need a better way to get the inner model of DataParallel.
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    #model_to_save.save_pretrained(output_dir)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(output_dir, "model_state.pdparams"))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir, "model_state.pdopt"))
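
# A minimal sketch (assumed, not copied from this repository) of the kind of
# helper `WorkerInitObj` is: it reseeds each DataLoader worker so that random
# masking differs across workers while remaining reproducible for a given base
# seed. The class name here is hypothetical to avoid shadowing the real one.
import random

import numpy as np


class WorkerInitObjSketch(object):

    def __init__(self, seed):
        self.seed = seed

    def __call__(self, worker_id):
        # Each worker gets a distinct, deterministic seed.
        np.random.seed(self.seed + worker_id)
        random.seed(self.seed + worker_id)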
def do_train(args):
    if not args.eager_run:
        paddle.enable_static()
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    # Loads or initializes a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-generator"]))
        discriminator = ElectraDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-discriminator"]))
        model = model_class(generator, discriminator)
        args.init_from_ckpt = False
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint.
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            with open(os.path.join(args.model_name_or_path, "run_states.json"),
                      'r') as f:
                config_dict = json.load(f)
                model_name = config_dict["model_name"]
            if model_name in pretrained_models:
                generator = ElectraGenerator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + "-generator"]))
                discriminator = ElectraDiscriminator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + "-discriminator"]))
                model = model_class(generator, discriminator)
                model.set_state_dict(
                    paddle.load(
                        os.path.join(args.model_name_or_path,
                                     "model_state.pdparams")))
            else:
                raise ValueError(
                    "Initializing a model from a checkpoint requires a valid "
                    "model_name in run_states.json. The supported model names "
                    "are as follows: {}".format(
                        tokenizer_class.pretrained_init_configuration.keys()))
        else:
            raise ValueError(
                "Initializing a model requires a pretrained model identifier "
                "or the directory of a saved model. The supported model "
                "identifiers are as follows: {}. If a directory is used, "
                "make sure init_from_ckpt is set to True.".format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ElectraPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config["vocab_size"],
        model.gen_weight, model.disc_weight)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    print("start load data : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    train_dataset = BookCorpus(data_path=args.input_dir,
                               tokenizer=tokenizer,
                               max_seq_length=args.max_seq_length,
                               mode='train')
    print("load data done, total : %s s" % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForElectra(tokenizer=tokenizer,
                                           max_seq_length=args.max_seq_length,
                                           mlm=True,
                                           mlm_probability=args.mask_prob)
    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.train_batch_size,
        mode='train',
        use_gpu='gpu' in args.device,
        data_collator=data_collator)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded
    # (see the standalone sketch after this function).
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    print("start train : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    trained_global_step = global_step = 0
    t_loss = paddle.to_tensor([0.0])
    log_loss = paddle.to_tensor([0.0])
    loss_list = []
    log_list = []
    tic_train = time.time()

    if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
        optimizer.set_state_dict(
            paddle.load(
                os.path.join(args.model_name_or_path, "model_state.pdopt")))
        trained_global_step = global_step = config_dict["global_step"]
        if trained_global_step < num_training_steps:
            print(
                "[ start train from checkpoint ] we have already trained %s steps, seeking next step : %s"
                % (trained_global_step, trained_global_step + 1))
        else:
            print(
                "[ start train from checkpoint ] we have already trained %s steps, but total training steps is %s, please check configuration !"
                % (trained_global_step, num_training_steps))
            exit(0)

    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            # Skip already-trained steps when resuming from a checkpoint.
            if trained_global_step > 0:
                trained_global_step -= 1
                continue
            global_step += 1
            input_ids, raw_input_ids, gen_labels = batch

            if args.use_amp:
                with paddle.amp.auto_cast():
                    gen_logits, disc_logits, disc_labels, attention_mask = model(
                        input_ids=input_ids,
                        raw_input_ids=raw_input_ids,
                        gen_labels=gen_labels)
                    loss = criterion(gen_logits, disc_logits, gen_labels,
                                     disc_labels, attention_mask)
                scaled = scaler.scale(loss)
                scaled.backward()
                t_loss += loss.detach()
                scaler.minimize(optimizer, scaled)
            else:
                gen_logits, disc_logits, disc_labels, attention_mask = model(
                    input_ids=input_ids,
                    raw_input_ids=raw_input_ids,
                    gen_labels=gen_labels)
                loss = criterion(gen_logits, disc_logits, gen_labels,
                                 disc_labels, attention_mask)
                loss.backward()
                t_loss += loss.detach()
                optimizer.step()

            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                local_loss = (t_loss - log_loss) / args.logging_steps
                if paddle.distributed.get_world_size() > 1:
                    paddle.distributed.all_gather(loss_list, local_loss)
                    if paddle.distributed.get_rank() == 0:
                        log_str = (
                            "global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, "
                            "avg_loss: {4:.15f}, lr: {5:.10f}, speed: {6:.2f} s/it"
                        ).format(
                            global_step, num_training_steps, epoch, step,
                            float((paddle.stack(loss_list).sum() /
                                   len(loss_list)).numpy()),
                            optimizer.get_lr(),
                            (time.time() - tic_train) / args.logging_steps)
                        print(log_str)
                        log_list.append(log_str)
                    loss_list = []
                else:
                    log_str = (
                        "global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, "
                        "loss: {4:.15f}, lr: {5:.10f}, speed: {6:.2f} s/it"
                    ).format(global_step, num_training_steps, epoch, step,
                             float(local_loss.numpy()), optimizer.get_lr(),
                             (time.time() - tic_train) / args.logging_steps)
                    print(log_str)
                    log_list.append(log_str)
                log_loss = t_loss
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "model_%d.pdparams" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    config_to_save = copy.deepcopy(
                        model_to_save.discriminator.electra.config)
                    if 'self' in config_to_save:
                        del config_to_save['self']
                    run_states = {
                        "model_name": model_name
                        if args.init_from_ckpt else args.model_name_or_path,
                        "global_step": global_step,
                        "epoch": epoch,
                        "step": step,
                    }
                    with open(os.path.join(output_dir, "model_config.json"),
                              'w') as f:
                        json.dump(config_to_save, f)
                    with open(os.path.join(output_dir, "run_states.json"),
                              'w') as f:
                        json.dump(run_states, f)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(output_dir, "model_state.pdparams"))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir, "model_state.pdopt"))
                    if len(log_list) > 0:
                        with open(os.path.join(output_dir, "train.log"),
                                  'w') as f:
                            for log in log_list:
                                if len(log.strip()) > 0:
                                    f.write(log.strip() + '\n')

            if global_step >= num_training_steps:
                return
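
# Standalone sketch (not part of the training script) of the weight-decay
# exclusion rule used above via `apply_decay_param_fun`: any parameter whose
# attribute path contains "bias" or "norm" is left out of the decayed set, so
# only weight matrices such as Linear weights receive weight decay.
import paddle


class _TinyLayer(paddle.nn.Layer):

    def __init__(self):
        super(_TinyLayer, self).__init__()
        self.linear = paddle.nn.Linear(4, 4)
        self.norm = paddle.nn.LayerNorm(4)


_tiny = _TinyLayer()
_decay_names = [
    p.name for n, p in _tiny.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
# Only the Linear weight survives the filter; its bias and both LayerNorm
# parameters are excluded because their paths contain "bias" or "norm".
print(_decay_names)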
def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    training_args.eval_iters = 10
    training_args.test_iters = training_args.eval_iters * 10

    # Log model and data config.
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    paddle.set_device(training_args.device)

    # Log a small summary on each process.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Detect the last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(
                os.listdir(training_args.output_dir)) > 1:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    model_class, tokenizer_class = MODEL_CLASSES['ernie-health']

    # Loads or initializes a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())

    if model_args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(
            model_args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                model_args.model_name_or_path + "-generator"]))
        discriminator = ErnieHealthDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                model_args.model_name_or_path + "-discriminator"]))
        model = model_class(generator, discriminator)
    else:
        raise ValueError("Only support %s" % (", ".join(pretrained_models)))

    # Loads dataset.
    tic_load_data = time.time()
    logger.info("start load data : %s" %
                (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

    train_dataset = MedicalCorpus(data_path=data_args.input_dir,
                                  tokenizer=tokenizer)
    logger.info("load data done, total : %s s" %
                (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForErnieHealth(
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        mlm_prob=data_args.masked_lm_prob,
        return_dict=True)

    class CriterionWrapper(paddle.nn.Layer):
        """Adapts ErnieHealthPretrainingCriterion to the (output, labels)
        interface expected by the Trainer; a standalone sketch of this
        pattern follows the function."""

        def __init__(self):
            """CriterionWrapper"""
            super(CriterionWrapper, self).__init__()
            self.criterion = ErnieHealthPretrainingCriterion(
                getattr(
                    model.generator,
                    ElectraGenerator.base_model_prefix).config["vocab_size"],
                model.gen_weight)

        def forward(self, output, labels):
            """forward function

            Args:
                output (tuple): generator_logits, logits_rtd, logits_mts,
                    logits_csp, disc_labels, masks
                labels (tuple): generator_labels

            Returns:
                Tensor: final loss.
            """
            generator_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = output
            generator_labels = labels

            loss, gen_loss, rtd_loss, mts_loss, csp_loss = self.criterion(
                generator_logits, generator_labels, logits_rtd, logits_mts,
                logits_csp, disc_labels, masks)

            return loss

    trainer = Trainer(
        model=model,
        criterion=CriterionWrapper(),
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=None,
        tokenizer=tokenizer,
    )

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    if training_args.do_train:
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()