def do_train(args):
    """Pre-train a GPT model in dygraph mode using data parallelism only.

    The function builds the tokenizer, model, criterion, LR scheduler and
    AdamW optimizer from ``args``, optionally restores the optimizer state,
    and then loops over the training files until ``args.max_steps`` global
    steps have been executed.

    Args:
        args: Parsed command-line namespace. The attributes read here
            include ``device``, ``model_type``, ``model_name_or_path``,
            ``output_dir``, ``micro_batch_size``, ``max_seq_len``,
            ``max_steps``, ``use_amp``, ``logging_freq``, ``eval_freq``,
            ``save_steps`` and ``check_accuracy``.

    Returns:
        None. The function returns once ``global_step >= args.max_steps``.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()
    local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))

    set_seed(args)
    # Only data parallelism is supported in dygraph mode for now.
    topo = Topology(
        device_rank=worker_index, world_size=worker_num, dp_degree=worker_num)

    default_global_batch_size = topo.data_info.size * args.micro_batch_size
    default_global_tokens_num = default_global_batch_size * args.max_seq_len

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define the log writer; a stale directory from a previous run is removed.
    log_writer_path = os.path.join(
        args.output_dir, "train_log",
        "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
            args.model_name_or_path,
            args.micro_batch_size * topo.data_info.size, False, False,
            worker_index).lower())
    if os.path.exists(log_writer_path):
        import shutil
        shutil.rmtree(log_writer_path)
    log_writer = LogWriter(log_writer_path)

    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models_list:
        # Copy before overriding the dropout values so the class-level
        # configuration table is not mutated for later users.
        model_config = dict(
            model_class.pretrained_init_configuration[args.model_name_or_path])
        model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
        model_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
        model = GPTForPretraining(GPTModel(**model_config))
    else:
        model = GPTForPretraining.from_pretrained(
            args.model_name_or_path,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob)

    # Create the criterion for the GPT model.
    criterion = GPTPretrainingCriterion()

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    warmup_step = args.warmup_rate * args.decay_steps

    # Any decay style other than "cosine" falls back to a constant max_lr.
    lr_scheduler = None
    if args.lr_decay_style == "cosine":
        lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
            max_lr=args.max_lr,
            min_lr=args.min_lr,
            warmup_step=warmup_step,
            decay_step=args.decay_steps)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.grad_clip)

    # Names of the parameters that receive weight decay.
    # All bias and LayerNorm parameters are excluded. A set keeps the
    # per-parameter membership test O(1).
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ("bias", "norm"))
    }

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler if lr_scheduler is not None else args.max_lr,
        beta1=args.adam_beta1,
        beta2=args.adam_beta2,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " % args.model_name_or_path)
        opt_path = os.path.join(args.model_name_or_path, "model_state.pdopt")
        if os.path.exists(opt_path):
            opt_dict = paddle.load(opt_path)
            optimizer.set_state_dict(opt_dict)
        else:
            logger.warning("No optimizer checkpoint file found in %s." %
                           opt_path)

    global_step = 0
    epoch = 0
    while True:
        files = get_train_data_file(args)
        files.sort()
        for data_file in files:
            train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
                args, [data_file],
                local_rank=local_rank,
                data_world_size=topo.data_info.size,
                data_world_rank=topo.data_info.rank,
                eos_id=tokenizer.eos_token_id)

            # Call the loader factories once here. Otherwise every
            # evaluation would call them again and start a new random
            # dataloader.
            valid_data_loader = valid_data_loader()
            test_data_loader = test_data_loader()

            # Timing accumulators, reset at every logging interval.
            train_reader_cost = 0.0
            train_run_cost = 0.0
            reader_start = time.time()
            for step, batch in enumerate(train_data_loader()):
                train_reader_cost += time.time() - reader_start
                train_start = time.time()

                global_step += 1
                tokens, loss_mask, attention_mask, position_ids, labels = batch

                loss_mask.stop_gradient = True
                attention_mask.stop_gradient = True

                with paddle.amp.auto_cast(
                        args.use_amp,
                        custom_white_list=["layer_norm", "softmax", "gelu"],
                        custom_black_list=[
                            "reduce_sum", "c_softmax_with_cross_entropy",
                            "c_embedding"
                        ]):
                    preds = model(tokens, position_ids, attention_mask)
                    loss = criterion(preds, labels, loss_mask)

                if args.use_amp:
                    scaler.scale(loss).backward()
                    scaler.minimize(optimizer, loss)
                else:
                    loss.backward()
                    optimizer.step()

                if lr_scheduler is not None:
                    lr_scheduler.step()
                optimizer.clear_grad()

                loss_numpy = loss.numpy()
                train_run_cost += time.time() - train_start

                # Profile for model benchmark.
                profiler.add_profiler_step(args.profiler_options)

                if global_step % args.logging_freq == 0:
                    speed = args.logging_freq / (
                        train_reader_cost + train_run_cost)
                    avg_reader_cost = train_reader_cost / args.logging_freq

                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f step/s, ips: %.0f tokens/s, learning rate: %.5e"
                        % (global_step, epoch, step, loss_numpy,
                           avg_reader_cost, 1. / speed, speed,
                           speed * default_global_tokens_num,
                           optimizer.get_lr()))
                    log_writer.add_scalar("loss", loss_numpy, global_step)
                    log_writer.add_scalar("learning_rate",
                                          optimizer.get_lr(), global_step)

                    train_reader_cost = 0.0
                    train_run_cost = 0.0

                if args.check_accuracy:
                    # Accuracy-check mode skips evaluation and checkpointing.
                    if global_step >= args.max_steps:
                        return
                    # Restart the reader timer before skipping ahead, so the
                    # next reader-cost sample excludes this step's compute.
                    reader_start = time.time()
                    continue

                if global_step % args.eval_freq == 0:
                    # The validation data is broadcast to all devices, so
                    # evaluation runs on every device.
                    run_evaluate(valid_data_loader, model, criterion,
                                 args.eval_iters, log_writer, global_step,
                                 epoch, "valid")

                if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        os.makedirs(output_dir, exist_ok=True)
                        # Need better way to get inner model of DataParallel.
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        logger.info("Save model to %s" % output_dir)
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(output_dir, "model_state.pdopt"))

                if global_step >= args.max_steps:
                    run_evaluate(test_data_loader, model, criterion,
                                 args.test_iters, log_writer, global_step,
                                 epoch, "test")
                    logger.info("The training process is complete.")
                    del train_data_loader
                    return

                reader_start = time.time()

            del train_data_loader

        # One full pass over the file list is complete.
        epoch += 1
def do_train():
    """Fine-tune ERNIE-tiny on the ChnSentiCorp sentiment dataset.

    Reads its settings from the module-level ``args`` object. Progress is
    printed by rank 0 every 10 global steps. Every 100 global steps rank 0
    evaluates on the dev split and saves the model and tokenizer under
    ``args.save_dir``.
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"])

    # Other pretrained backbones can be swapped in here, e.g.
    #   ppnlp.transformers.BertForSequenceClassification ('bert-base-chinese')
    #   ppnlp.transformers.RobertaForSequenceClassification ('roberta-wwm-ext')
    #   ppnlp.transformers.ElectraForSequenceClassification ('chinese-electra-small')
    # together with the matching BertTokenizer / RobertaTokenizer /
    # ElectraTokenizer.
    label_count = len(train_ds.label_list)
    model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
        'ernie-tiny', num_classes=label_count)

    # ErnieTinyTokenizer is specific to the ernie-tiny pretrained model.
    tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
        'ernie-tiny')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    )

    def batchify_fn(samples):
        # Collate a list of samples into a list of batched fields.
        return [field for field in collate(samples)]

    loader_kwargs = dict(
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
    train_data_loader = create_dataloader(
        train_ds, mode='train', **loader_kwargs)
    dev_data_loader = create_dataloader(dev_ds, mode='dev', **loader_kwargs)

    ckpt_path = args.init_from_ckpt
    if ckpt_path and os.path.isfile(ckpt_path):
        model.set_dict(paddle.load(ckpt_path))
    model = paddle.DataParallel(model)

    total_steps = len(train_data_loader) * args.epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, total_steps,
                                         args.warmup_proportion)

    # Names of the parameters that receive weight decay.
    # All bias and LayerNorm parameters are excluded.
    excluded_markers = ["bias", "norm"]
    decay_params = []
    for param_key, param in model.named_parameters():
        if any(marker in param_key for marker in excluded_markers):
            continue
        decay_params.append(param.name)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, (input_ids, token_type_ids,
                   labels) in enumerate(train_data_loader, start=1):
            logits = model(input_ids, token_type_ids)
            loss = criterion(logits, labels)

            # Update the running training accuracy.
            probs = F.softmax(logits, axis=1)
            metric.update(metric.compute(probs, labels))
            acc = metric.accumulate()

            global_step += 1
            is_chief = rank == 0
            if is_chief and global_step % 10 == 0:
                elapsed = time.time() - tic_train
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc, 10 / elapsed))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if is_chief and global_step % 100 == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
def do_train(args):
    """Fine-tune a GPT token-classification model on the MSRA NER dataset.

    Training stops after ``args.max_steps`` optimizer steps when that value
    is positive and smaller than the epoch-based total. Otherwise it runs for
    ``args.num_train_epochs`` full epochs. Rank 0 evaluates on the test split
    and saves the parameters every ``args.save_steps`` steps and at the last
    step.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``device``, ``model_name_or_path``, ``max_seq_length``,
            ``batch_size``, ``max_steps``, ``num_train_epochs``,
            ``learning_rate``, ``warmup_steps``, ``adam_epsilon``,
            ``weight_decay``, ``logging_steps``, ``save_steps``,
            ``output_dir``.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    train_ds, test_ds = load_dataset(
        'msra_ner', splits=('train', 'test'), lazy=False)

    tokenizer = GPTChineseTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    # The last label id is passed to the transform as the "no entity" id.
    no_entity_id = label_num - 1

    trans_func = partial(
        tokenize_and_align_labels,
        tokenizer=tokenizer,
        no_entity_id=no_entity_id,
        max_seq_len=args.max_seq_length)

    train_ds = train_ds.map(trans_func)

    # Padded label positions use this id, which the loss ignores.
    ignore_label = -100

    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=0, dtype='int64'),  # input
        'seq_len': Stack(dtype='int64'),  # seq_len
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(
        dataset=train_ds,
        collate_fn=batchify_fn,
        num_workers=0,
        batch_sampler=train_batch_sampler,
        return_list=True)

    test_ds = test_ds.map(trans_func)

    test_data_loader = DataLoader(
        dataset=test_ds,
        collate_fn=batchify_fn,
        num_workers=0,
        batch_size=args.batch_size,
        return_list=True)

    # Define the model network and its loss.
    model = GPTForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    steps_in_all_epochs = len(train_data_loader) * args.num_train_epochs
    num_training_steps = (args.max_steps
                          if args.max_steps > 0 else steps_in_all_epochs)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    # Names of the parameters that receive weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ("bias", "norm"))
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    # The LR schedule is built for num_training_steps, so training must not
    # continue past it. It also cannot exceed what the epochs provide.
    last_step = min(num_training_steps, steps_in_all_epochs)
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, _, labels = batch
            logits = model(input_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            reached_end = global_step >= last_step
            if global_step % args.save_steps == 0 or reached_end:
                if paddle.distributed.get_rank() == 0:
                    evaluate(model, loss_fct, metric, test_data_loader)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            if reached_end:
                return
def do_train(args):
    """Fine-tune a sequence-classification model on one GLUE task.

    For ``mnli`` two dev loaders (matched and mismatched) are built and both
    are evaluated. Every other task uses a single dev loader. Evaluation runs
    every ``args.save_steps`` steps and at the final step, and rank 0 then
    saves the model and tokenizer. Training ends when ``num_training_steps``
    is reached.

    Args:
        args: Parsed command-line namespace. ``task_name`` and
            ``model_type`` are lower-cased in place. Other attributes read
            here include ``device``, ``model_name_or_path``,
            ``max_seq_length``, ``batch_size``, ``max_steps``,
            ``num_train_epochs``, ``warmup_steps``, ``warmup_proportion``,
            ``learning_rate``, ``adam_epsilon``, ``weight_decay``,
            ``logging_steps``, ``save_steps`` and ``output_dir``.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('glue', args.task_name, splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=train_ds.label_list,
        max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # A falsy label_list selects float labels here and MSELoss below.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    def _make_dev_loader(dataset):
        # Build an unshuffled evaluation loader over a transformed dataset.
        dataset = dataset.map(trans_func, lazy=True)
        sampler = paddle.io.BatchSampler(
            dataset, batch_size=args.batch_size, shuffle=False)
        return DataLoader(
            dataset=dataset,
            batch_sampler=sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)

    if args.task_name == "mnli":
        dev_ds_matched, dev_ds_mismatched = load_dataset(
            'glue', args.task_name, splits=["dev_matched", "dev_mismatched"])
        dev_data_loader_matched = _make_dev_loader(dev_ds_matched)
        dev_data_loader_mismatched = _make_dev_loader(dev_ds_mismatched)
    else:
        dev_ds = load_dataset('glue', args.task_name, splits='dev')
        dev_data_loader = _make_dev_loader(dev_ds)

    num_classes = 1 if train_ds.label_list is None else len(
        train_ds.label_list)
    model = model_class.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    # A positive warmup_steps takes precedence over warmup_proportion.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Names of the parameters that receive weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ("bias", "norm"))
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1

            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0 or global_step == num_training_steps:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss,
                       optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(model, loss_fct, metric, dev_data_loader_matched)
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_mismatched)
                else:
                    evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))

                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "%s_ft_model_%d.pdparams" %
                        (args.task_name, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    # Need better way to get inner model of DataParallel.
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

            if global_step >= num_training_steps:
                return
def do_train(args):
    """Train the Transformer translation model, with optional AMP.

    Every ``args.save_step`` steps the model is validated on the eval loader
    and, on rank 0 when ``args.save_model`` is set, the model and optimizer
    states are saved. A final ``step_final`` checkpoint is written after the
    last epoch.

    Args:
        args: Configuration namespace. Attributes read here include
            ``device``, ``random_seed``, the model hyper-parameters,
            ``use_amp``, ``scale_loss``, ``epoch``, ``max_iter``,
            ``print_step``, ``save_step``, ``save_model``, ``benchmark``,
            ``init_from_checkpoint`` and ``init_from_pretrain_model``.

    Returns:
        None. Returns early, without the final checkpoint, when
        ``args.max_iter`` is set and that many steps have run.
    """
    import ast

    if args.device == "gpu":
        rank = dist.get_rank()
        trainer_count = dist.get_world_size()
    else:
        rank = 0
        trainer_count = 1
        paddle.set_device("cpu")

    if trainer_count > 1:
        dist.init_parallel_env()

    # Set seed for CE.
    # NOTE(review): this used to be eval(str(args.random_seed)), which runs
    # arbitrary code from the configuration. literal_eval accepts only Python
    # literals such as "None" or "128". Confirm no config relies on an
    # expression here.
    random_seed = ast.literal_eval(str(args.random_seed))
    if random_seed is not None:
        paddle.seed(random_seed)

    # Define data loader
    (train_loader), (eval_loader) = reader.create_data_loader(args)

    # Define model
    transformer = TransformerModel(
        src_vocab_size=args.src_vocab_size,
        trg_vocab_size=args.trg_vocab_size,
        max_length=args.max_length + 1,
        n_layer=args.n_layer,
        n_head=args.n_head,
        d_model=args.d_model,
        d_inner_hid=args.d_inner_hid,
        dropout=args.dropout,
        weight_sharing=args.weight_sharing,
        bos_id=args.bos_idx,
        eos_id=args.eos_idx)

    # Define loss
    criterion = CrossEntropyCriterion(args.label_smooth_eps, args.bos_idx)

    scheduler = paddle.optimizer.lr.NoamDecay(
        args.d_model, args.warmup_steps, args.learning_rate, last_epoch=0)

    # Define optimizer
    optimizer = paddle.optimizer.Adam(
        learning_rate=scheduler,
        beta1=args.beta1,
        beta2=args.beta2,
        epsilon=float(args.eps),
        parameters=transformer.parameters())

    # Init from some checkpoint, to resume the previous training
    if args.init_from_checkpoint:
        model_dict = paddle.load(
            os.path.join(args.init_from_checkpoint, "transformer.pdparams"))
        opt_dict = paddle.load(
            os.path.join(args.init_from_checkpoint, "transformer.pdopt"))
        transformer.set_state_dict(model_dict)
        optimizer.set_state_dict(opt_dict)
        print("loaded from checkpoint.")
    # Init from some pretrain models, to better solve the current task
    if args.init_from_pretrain_model:
        model_dict = paddle.load(
            os.path.join(args.init_from_pretrain_model,
                         "transformer.pdparams"))
        transformer.set_state_dict(model_dict)
        print("loaded from pre-trained model.")

    if trainer_count > 1:
        transformer = paddle.DataParallel(transformer)

    # The best cross-entropy value with label smoothing
    loss_normalizer = -(
        (1. - args.label_smooth_eps) * np.log(
            (1. - args.label_smooth_eps)) + args.label_smooth_eps *
        np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))

    # Create the loss scaler once. Re-creating it on every iteration would
    # reset the dynamic loss-scaling state each step.
    scaler = None
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

    step_idx = 0

    # For benchmark
    reader_cost_avg = AverageStatistical()
    batch_cost_avg = AverageStatistical()
    batch_ips_avg = AverageStatistical()

    # Train loop
    for pass_id in range(args.epoch):
        epoch_start = time.time()

        batch_id = 0
        batch_start = time.time()
        for input_data in train_loader:
            # NOTE: Used for benchmark and use None as default.
            if args.max_iter and step_idx == args.max_iter:
                return
            train_reader_cost = time.time() - batch_start
            (src_word, trg_word, lbl_word) = input_data

            if args.use_amp:
                with paddle.amp.auto_cast():
                    logits = transformer(src_word=src_word, trg_word=trg_word)
                    sum_cost, avg_cost, token_num = criterion(logits,
                                                              lbl_word)

                scaled = scaler.scale(avg_cost)  # scale the loss
                scaled.backward()  # do backward

                scaler.minimize(optimizer, scaled)  # update parameters
                optimizer.clear_grad()
            else:
                logits = transformer(src_word=src_word, trg_word=trg_word)
                sum_cost, avg_cost, token_num = criterion(logits, lbl_word)

                avg_cost.backward()

                optimizer.step()
                optimizer.clear_grad()

            tokens_per_cards = token_num.numpy()

            train_batch_cost = time.time() - batch_start
            reader_cost_avg.record(train_reader_cost)
            batch_cost_avg.record(train_batch_cost)
            batch_ips_avg.record(train_batch_cost, tokens_per_cards)

            # NOTE: For benchmark, loss information on all cards will be
            # printed.
            if step_idx % args.print_step == 0 and (args.benchmark or
                                                    rank == 0):
                total_avg_cost = avg_cost.numpy()

                if step_idx == 0:
                    logger.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f " %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)])))
                else:
                    train_avg_batch_cost = args.print_step / batch_cost_avg.get_total_time(
                    )
                    logger.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f, avg_speed: %.2f step/sec, "
                        "batch_cost: %.5f sec, reader_cost: %.5f sec, tokens: %d, "
                        "ips: %.5f words/sec" %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)]),
                         train_avg_batch_cost, batch_cost_avg.get_average(),
                         reader_cost_avg.get_average(),
                         batch_ips_avg.get_total_cnt(),
                         batch_ips_avg.get_average_per_sec()))
                reader_cost_avg.reset()
                batch_cost_avg.reset()
                batch_ips_avg.reset()

            if step_idx % args.save_step == 0 and step_idx != 0:
                # Validation. Separate names keep the eval pass from
                # overwriting the training batch variables.
                transformer.eval()
                total_sum_cost = 0
                total_token_num = 0
                with paddle.no_grad():
                    for eval_data in eval_loader:
                        (eval_src, eval_trg, eval_lbl) = eval_data
                        eval_logits = transformer(
                            src_word=eval_src, trg_word=eval_trg)
                        eval_sum_cost, _, eval_token_num = criterion(
                            eval_logits, eval_lbl)
                        total_sum_cost += eval_sum_cost.numpy()
                        total_token_num += eval_token_num.numpy()
                    total_avg_cost = total_sum_cost / total_token_num
                    logger.info("validation, step_idx: %d, avg loss: %f, "
                                "normalized loss: %f, ppl: %f" %
                                (step_idx, total_avg_cost,
                                 total_avg_cost - loss_normalizer,
                                 np.exp([min(total_avg_cost, 100)])))
                transformer.train()

                if args.save_model and rank == 0:
                    model_dir = os.path.join(args.save_model,
                                             "step_" + str(step_idx))
                    os.makedirs(model_dir, exist_ok=True)
                    paddle.save(
                        transformer.state_dict(),
                        os.path.join(model_dir, "transformer.pdparams"))
                    paddle.save(
                        optimizer.state_dict(),
                        os.path.join(model_dir, "transformer.pdopt"))

            batch_id += 1
            step_idx += 1
            scheduler.step()
            batch_start = time.time()

        train_epoch_cost = time.time() - epoch_start
        logger.info("train epoch: %d, epoch_cost: %.5f s" %
                    (pass_id, train_epoch_cost))

    if args.save_model and rank == 0:
        model_dir = os.path.join(args.save_model, "step_final")
        os.makedirs(model_dir, exist_ok=True)
        paddle.save(transformer.state_dict(),
                    os.path.join(model_dir, "transformer.pdparams"))
        paddle.save(optimizer.state_dict(),
                    os.path.join(model_dir, "transformer.pdopt"))
def do_train(args):
    """Pre-train a GPT-2 model over every data file found in ``args.input_dir``.

    Worker 0 logs every ``args.logging_steps`` steps. It also saves the
    model, tokenizer and optimizer state every ``args.save_steps`` steps and
    once ``args.max_steps`` is reached, at which point training returns.
    """
    paddle.set_device(args.device)
    is_distributed = paddle.distributed.get_world_size() > 1
    if is_distributed:
        paddle.distributed.init_parallel_env()

    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()
    set_seed(args)
    worker_init = WorkerInitObj(args.seed + worker_index)

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    eod_id = tokenizer.command_name_map["eod"].Id

    # A known configuration name builds a fresh model. Anything else is
    # loaded through from_pretrained, and its optimizer state is restored
    # further below.
    known_configs = model_class.pretrained_init_configuration
    pretrained_models_list = list(known_configs.keys())
    starts_from_config = args.model_name_or_path in pretrained_models_list
    if starts_from_config:
        backbone = GPT2Model(**known_configs[args.model_name_or_path])
        model = GPT2ForPretraining(backbone)
    else:
        model = GPT2ForPretraining.from_pretrained(args.model_name_or_path)

    # Create the criterion for the gpt model.
    criterion = GPT2PretrainingCriterion()

    if is_distributed:
        model = paddle.DataParallel(model)

    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
        max_lr=args.max_lr,
        min_lr=args.min_lr,
        warmup_step=args.warmup_rate * args.decay_steps,
        decay_step=args.decay_steps)

    clip = (paddle.nn.ClipGradByNorm(clip_norm=args.grad_clip)
            if args.grad_clip > 0 else None)

    # Names of the parameters that receive weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = []
    for param_key, param in model.named_parameters():
        if "bias" in param_key or "norm" in param_key:
            continue
        decay_params.append(param.name)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)

    if not starts_from_config:
        opt_state_path = os.path.join(args.model_name_or_path,
                                      "model_state.pdopt")
        optimizer.set_state_dict(paddle.load(opt_state_path))

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        # Regular files in input_dir, skipping names that contain "npz_".
        files = sorted(
            os.path.join(args.input_dir, entry)
            for entry in os.listdir(args.input_dir)
            if os.path.isfile(os.path.join(args.input_dir, entry)) and
            "npz_" not in str(entry))

        for data_file in files:
            train_data_loader = create_pretrained_dataset(
                args,
                data_file,
                worker_init,
                worker_index,
                worker_num,
                eod_id=eod_id)
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                tokens, loss_mask, attention_mask, position_ids, labels = batch

                loss_mask.stop_gradient = True
                attention_mask.stop_gradient = True

                preds = model(tokens, position_ids, attention_mask)
                loss = criterion(preds, labels, loss_mask)

                if global_step % args.logging_steps == 0:
                    if worker_index == 0:
                        steps_per_sec = args.logging_steps / (
                            time.time() - tic_train)
                        logger.info(
                            "global step %d, epoch: %d, lr: %.10f, batch: %d, loss: %f, speed: %.2f step/s"
                            % (global_step, epoch, optimizer.get_lr(), step,
                               loss, steps_per_sec))
                    tic_train = time.time()

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                finished = global_step >= args.max_steps
                due_for_save = global_step % args.save_steps == 0 or finished
                if due_for_save and worker_index == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel.
                    if isinstance(model, paddle.DataParallel):
                        model_to_save = model._layers
                    else:
                        model_to_save = model
                    logger.info("Save model to %s" % output_dir)
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(
                        optimizer.state_dict(),
                        os.path.join(output_dir, "model_state.pdopt"))

                if finished:
                    logger.info("The training process is complete.")
                    del train_data_loader
                    return

            del train_data_loader
def main(args):
    """Run quantization-aware training (PACT activation preprocessing).

    Builds the model described by the config, quantizes it, then alternates
    training and (optionally) validation per epoch. The best-scoring model is
    saved as "best_model", and periodic checkpoints are written every
    ``config.save_interval`` epochs.
    """
    paddle.seed(12345)

    config = get_config(args.config, overrides=args.override, show=True)

    # Assign the place.
    device_name = 'gpu' if config.get("use_gpu", True) else 'cpu'
    place = paddle.set_device(device_name)

    config["use_data_parallel"] = paddle.distributed.get_world_size() != 1
    if config["use_data_parallel"]:
        paddle.distributed.init_parallel_env()

    net = program.create_model(config.ARCHITECTURE, config.classes_num)

    # Prepare to quantize.
    quant_config = get_default_quant_config()
    quant_config["activation_preprocess_type"] = "PACT"
    QAT(config=quant_config).quantize(net)

    optimizer, lr_scheduler = program.create_optimizer(
        config, parameter_list=net.parameters())

    init_model(config, net, optimizer)

    if config["use_data_parallel"]:
        net = paddle.DataParallel(net)

    train_dataloader = Reader(config, 'train', places=place)()
    if config.validate:
        valid_dataloader = Reader(config, 'valid', places=place)()

    def checkpoint_root():
        # Directory under which this architecture's checkpoints are stored.
        return os.path.join(config.model_save_dir,
                            config.ARCHITECTURE["name"])

    last_epoch_id = config.get("last_epoch", -1)
    best_top1_acc = 0.0  # best top1 acc record
    best_top1_epoch = last_epoch_id
    first_epoch = last_epoch_id + 1
    for epoch_id in range(first_epoch, config.epochs):
        # 1. Train with the train dataset.
        net.train()
        program.run(train_dataloader, config, net, optimizer, lr_scheduler,
                    epoch_id, 'train')

        # 2. Validate with the validation dataset.
        if config.validate and epoch_id % config.valid_interval == 0:
            net.eval()
            with paddle.no_grad():
                top1_acc = program.run(valid_dataloader, config, net, None,
                                       None, epoch_id, 'valid')
            if top1_acc > best_top1_acc:
                best_top1_acc, best_top1_epoch = top1_acc, epoch_id
                save_model(net, optimizer, checkpoint_root(), "best_model")
            logger.info("The best top1 acc {:.5f}, in epoch: {:d}".format(
                best_top1_acc, best_top1_epoch))

        # 3. Save the persistable model.
        if epoch_id % config.save_interval == 0:
            save_model(net, optimizer, checkpoint_root(), epoch_id)
def do_train(args):
    """Fine-tune the WordTag model on a custom tagged corpus.

    Reads ``train.txt``, ``dev.txt`` and ``tags.txt`` from ``args.data_dir``,
    trains for ``args.num_train_epochs`` epochs, logs the mean loss every
    ``args.logging_steps`` steps and saves checkpoints every
    ``args.save_steps`` steps (rank 0 only). The dev set is evaluated after
    each epoch.

    Args:
        args: Parsed command-line arguments (device, seed, data/output
            directories, batch size, learning-rate and logging settings).
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_dir, "train.txt"),
                            is_test=False,
                            lazy=False)
    dev_ds = load_dataset(read_custom_data,
                          filename=os.path.join(args.data_dir, "dev.txt"),
                          is_test=False,
                          lazy=False)
    tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))

    tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
    model = ErnieCtmWordtagModel.from_pretrained("wordtag",
                                                 num_tag=len(tags_to_idx))
    # Replace the CRF loss so its size matches the custom tag set.
    model.crf_loss = LinearChainCrfLoss(
        LinearChainCrf(len(tags_to_idx), 0.1, with_start_stop_tag=False))

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=tags_to_idx["O"], dtype='int64'),  # tags
    ): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode="train",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode="dev",
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.num_train_epochs
    # A positive step count wins; otherwise the proportion is passed through.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Names of the parameters that receive weight decay: everything except
    # bias and LayerNorm parameters. A set keeps the per-parameter membership
    # test O(1).
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    metric = SequenceAccuracy()

    total_loss = 0.0
    global_step = 0

    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginnig")
        start_time = time.time()

        for batch in train_data_loader:
            global_step += 1
            input_ids, token_type_ids, seq_len, tags = batch

            loss, _ = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags)
            loss = loss.mean()
            # Accumulate a plain Python float: summing the tensors themselves
            # keeps every step's loss alive, and on ranks other than 0 the
            # running sum is never reset.
            total_loss += float(loss)
            loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, total_loss / args.logging_steps,
                       speed))
                start_time = time.time()
                total_loss = 0.0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % (global_step))
                os.makedirs(output_dir, exist_ok=True)
                # Save the wrapped network, not the DataParallel shell.
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

        # NOTE(review): `tags` is the tag tensor of the last training batch;
        # it is forwarded to evaluate() unchanged, as in the original code.
        evaluate(model, metric, dev_data_loader, tags, tags_to_idx)
def distributed_data_parallel(self):
    """Wrap every network in ``self.model.nets`` with ``paddle.DataParallel``.

    The parallel strategy comes from ``paddle.distributed.prepare_context()``
    and is shared by all wrapped networks. Entries are replaced in place
    under their existing names.
    """
    parallel_strategy = paddle.distributed.prepare_context()
    # Snapshot the names first; only values are swapped, keys never change.
    for name in list(self.model.nets):
        wrapped = paddle.DataParallel(self.model.nets[name], parallel_strategy)
        self.model.nets[name] = wrapped
def do_train(args):
    """Pre-train a BigBird model on the files found in ``args.input_dir``.

    Every epoch walks the (sorted) files of the input directory, building one
    data loader per file. Training stops as soon as ``args.max_steps`` global
    steps have run. Rank 0 logs every ``args.logging_steps`` steps and writes
    model, tokenizer and optimizer state every ``args.save_steps`` steps.

    Side effects:
        Sets the module-level ``config`` to the base model's configuration.

    Args:
        args: Parsed command-line arguments.

    Raises:
        AssertionError: If ``args.device`` is not ``cpu``, ``gpu`` or ``xpu``.
    """
    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."

    # Initialization for the parallel environment
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()

    # Set the random seed for the training process
    set_seed(args)
    worker_init = WorkerInitObj(args.seed + worker_index)

    # Get the model class and tokenizer class
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define the pretrain model and metric
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models_list:
        # Known model name: build from its stored init configuration.
        model = BigBirdForPretraining(
            BigBirdModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path]))
    else:
        model = BigBirdForPretraining.from_pretrained(args.model_name_or_path)

    # Get bigbird config for generating the random attention mask
    global config
    config = getattr(model, BigBirdForPretraining.base_model_prefix).config
    criterion = BigBirdPretrainingCriterion(config["vocab_size"], args.use_nsp)
    if worker_num > 1:
        model = paddle.DataParallel(model)

    # Define learning-rate scheduler and optimizer
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, args.max_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded. A set keeps the
    # per-parameter membership test O(1).
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.epochs):
        # The directory is re-listed every epoch, in sorted order.
        files = sorted(
            os.path.join(args.input_dir, f)
            for f in os.listdir(args.input_dir))
        for data_file in files:
            train_data_loader = create_dataloader(
                data_file, tokenizer, worker_init, args.batch_size,
                args.max_encoder_length, args.max_pred_length)
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                # The first seven entries are fixed fields; everything after
                # them is passed to the model as `rand_mask_idx_list`.
                (input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
                 masked_lm_weights, next_sentence_labels,
                 masked_lm_scale) = batch[:7]
                rand_mask_idx_list = batch[7:]

                prediction_scores, seq_relationship_score = model(
                    input_ids=input_ids,
                    token_type_ids=segment_ids,
                    rand_mask_idx_list=rand_mask_idx_list,
                    masked_positions=masked_lm_positions)
                loss = criterion(prediction_scores, seq_relationship_score,
                                 masked_lm_ids, next_sentence_labels,
                                 masked_lm_scale, masked_lm_weights)

                if global_step % args.logging_steps == 0 and worker_index == 0:
                    logger.info(
                        "global step %d, epoch: %d, lr: %.10f, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, optimizer.get_lr(), loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 and worker_index == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % global_step)
                    # exist_ok avoids the check-then-create race.
                    os.makedirs(output_dir, exist_ok=True)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(
                        optimizer.state_dict(),
                        os.path.join(output_dir, "model_state.pdopt"))

                if global_step >= args.max_steps:
                    del train_data_loader
                    return

            # Drop the loader before the next file's loader is created.
            del train_data_loader
def do_train():
    """Train the ANCE semantic-index model, following the newest ANN data.

    Reads its settings from the module-level ``args``. Until
    ``args.max_training_steps`` global steps have run, each outer iteration
    picks up the latest generated ANN training data (falling back to
    ``args.train_set_file`` when none exists yet), builds a fresh data loader,
    scheduler and optimizer, and trains for ``args.epochs`` epochs. Rank 0
    logs every 10 steps and saves a checkpoint every ``args.save_steps``
    steps, followed by an empty ``succeed_flag_file`` marking the checkpoint
    as complete.
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    # If you wanna use bert/roberta pretrained model,
    # pretrained_model = ppnlp.transformers.BertModel.from_pretrained('bert-base-chinese')
    # pretrained_model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext')
    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-1.0')

    latest_checkpoint, latest_global_step = get_latest_checkpoint(args)
    logger.info("get latest_checkpoint:{}".format(latest_checkpoint))

    model = SemanticIndexANCE(pretrained_model,
                              margin=args.margin,
                              output_emb_size=args.output_emb_size)

    if latest_checkpoint:
        state_dict = paddle.load(latest_checkpoint)
        model.set_dict(state_dict)
        print("warmup from:{}".format(latest_checkpoint))

    model = paddle.DataParallel(model)

    # If you wanna use bert/roberta pretrained model,
    # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese')
    # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext')
    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # pos_sample_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # pos_sample_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # neg_sample_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # neg_sample_segment
    ): list(fn(samples))

    # NOTE(review): the step counter always starts at 0; `latest_global_step`
    # from the restored checkpoint is not used here.
    global_step = 0
    while global_step < args.max_training_steps:
        latest_ann_data, latest_ann_data_step = get_latest_ann_data(
            args.ann_data_dir)

        if latest_ann_data_step == -1:
            # No ann_data generated yet
            latest_ann_data = args.train_set_file
            logger.info(
                "No ann_data generated yet, Use training_set:{}".format(
                    args.train_set_file))
        else:
            # Using ann_data to train the model
            logger.info("Latest ann_data is ready for training: [{}]".format(
                latest_ann_data))

        train_ds = load_dataset(read_text_triplet,
                                data_path=latest_ann_data,
                                lazy=False)
        train_data_loader = create_dataloader(train_ds,
                                              mode='train',
                                              batch_size=args.batch_size,
                                              batchify_fn=batchify_fn,
                                              trans_fn=trans_func)

        num_training_steps = len(train_data_loader) * args.epochs
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded. A set keeps the
        # per-parameter membership test O(1).
        decay_params = {
            p.name
            for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        }

        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=clip)

        tic_train = time.time()
        for epoch in range(1, args.epochs + 1):
            for step, batch in enumerate(train_data_loader, start=1):
                (text_input_ids, text_token_type_ids, pos_sample_input_ids,
                 pos_sample_token_type_ids, neg_sample_input_ids,
                 neg_sample_token_type_ids) = batch

                loss = model(
                    text_input_ids=text_input_ids,
                    pos_sample_input_ids=pos_sample_input_ids,
                    neg_sample_input_ids=neg_sample_input_ids,
                    text_token_type_ids=text_token_type_ids,
                    pos_sample_token_type_ids=pos_sample_token_type_ids,
                    neg_sample_token_type_ids=neg_sample_token_type_ids)

                global_step += 1
                if global_step % 10 == 0 and rank == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s, trainning_file: %s"
                        % (global_step, epoch, step, loss, 10 /
                           (time.time() - tic_train), latest_ann_data))
                    tic_train = time.time()

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 and rank == 0:
                    save_dir = os.path.join(args.save_dir, str(global_step))
                    # exist_ok avoids the check-then-create race.
                    os.makedirs(save_dir, exist_ok=True)
                    save_param_path = os.path.join(save_dir,
                                                   'model_state.pdparams')
                    paddle.save(model.state_dict(), save_param_path)
                    tokenizer.save_pretrained(save_dir)

                    # Flag file written last to indicate a successful save
                    succeed_flag_file = os.path.join(save_dir,
                                                     "succeed_flag_file")
                    with open(succeed_flag_file, 'a'):
                        pass
def train(args):
    """Train an ELMo language model on the One Billion Word dataset.

    Builds the model (data-parallel when several processes are running),
    optionally restores model and optimizer state from
    ``args.init_from_ckpt``, and trains for ``args.epochs`` epochs' worth of
    steps. Every process prints progress each ``args.log_freq`` steps; rank 0
    saves every ``args.save_freq`` steps and once more, tagged ``'final'``,
    when training completes.

    Args:
        args: Parsed command-line arguments.
    """
    paddle.set_device(args.device)

    n_procs = dist.get_world_size()
    rank = dist.get_rank()
    if n_procs > 1:
        dist.init_parallel_env()

    vocab = load_vocab(args.vocab_file, args.max_characters_per_token)

    elmo = ELMo(args.batch_size,
                args.char_embed_dim,
                args.projection_dim,
                vocab.size,
                dropout=args.dropout,
                num_layers=args.num_layers,
                num_highways=args.num_highways,
                char_vocab_size=vocab.char_size)
    if n_procs > 1:
        elmo = paddle.DataParallel(elmo)
    elmo.train()

    global_norm_clip = nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = paddle.optimizer.Adagrad(learning_rate=args.lr,
                                         parameters=elmo.parameters(),
                                         initial_accumulator_value=1.0,
                                         grad_clip=global_norm_clip)
    elmo_loss = ELMoLoss()

    # Loads pre-trained parameters.
    if args.init_from_ckpt:
        weight_state_dict = paddle.load(args.init_from_ckpt + '.pdparams')
        opt_state_dict = paddle.load(args.init_from_ckpt + '.pdopt')
        elmo.set_state_dict(weight_state_dict)
        optimizer.set_state_dict(opt_state_dict)
        print("Loaded checkpoint from %s" % args.init_from_ckpt)

    train_dataset = OneBillionWordDataset(args.train_data_path,
                                          vocab,
                                          args.batch_size,
                                          args.unroll_steps,
                                          n_procs=n_procs,
                                          rank=rank,
                                          mode='train',
                                          shuffle=True,
                                          seed=args.seed)
    # batch_size=None: the dataset already yields complete batches.
    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)

    n_tokens_per_batch = args.batch_size * args.unroll_steps * n_procs
    n_steps_per_epoch = int(train_dataset.number_of_tokens /
                            n_tokens_per_batch)
    n_steps_total = args.epochs * n_steps_per_epoch

    print("Training for %s epochs and %s steps" % (args.epochs, n_steps_total))

    total_time = 0.0
    batch_start_time = time.time()
    for step, inputs in enumerate(train_dataloader, start=1):
        ids, next_ids, ids_reverse, next_ids_reverse = inputs
        outputs = elmo([ids, ids_reverse])
        loss = elmo_loss(outputs, [next_ids, next_ids_reverse])
        # Perplexity is taken from the loss before it is rescaled below.
        ppl = paddle.exp(loss)
        loss *= args.unroll_steps
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        total_time += (time.time() - batch_start_time)
        if step % args.log_freq == 0:
            # float() handles both 0-D and single-element 1-D tensors,
            # whereas `.numpy()[0]` raises on a 0-D result.
            print("step %d/%d - loss: %.4f - Perplexity: %.4f - %.3fs/step" %
                  (step, n_steps_total, float(loss), float(ppl),
                   total_time / args.log_freq))
            total_time = 0.0
        if rank == 0 and step % args.save_freq == 0:
            save_params(elmo, optimizer, args.save_dir, step)
        # `>=` rather than `==`: if the computed total is below 1, an equality
        # test would never fire and the loop would have no exit of its own.
        if step >= n_steps_total:
            # training done
            if rank == 0:
                save_params(elmo, optimizer, args.save_dir, 'final')
            break
        batch_start_time = time.time()
def do_train(args):
    """Fine-tune a pretrained model on a GLUE-style task.

    Builds train/dev data loaders for ``args.task_name`` (MNLI gets separate
    matched and mismatched dev sets), trains with a linear warmup followed by
    linear decay, and every ``args.save_steps`` steps evaluates on the dev
    data and lets rank 0 save the model and tokenizer.

    Args:
        args: Parsed command-line arguments. ``task_name`` and ``model_type``
            are lower-cased in place.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_dataset = dataset_class.get_datasets(["train"])
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_dataset.get_labels(),
                         max_seq_length=args.max_seq_length)
    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    # The length field (index 2) is collated but then dropped from the batch.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(),  # length
        Stack(dtype="int64" if train_dataset.get_labels() else "float32")  # label
    ): [data for i, data in enumerate(fn(samples)) if i != 2]
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    if args.task_name == "mnli":
        dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets(
            ["dev_matched", "dev_mismatched"])
        dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True)
        dev_dataset_mismatched = dev_dataset_mismatched.apply(trans_func,
                                                              lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(
            dev_dataset_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(
            dataset=dev_dataset_matched,
            batch_sampler=dev_batch_sampler_matched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(
            dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(
            dataset=dev_dataset_mismatched,
            batch_sampler=dev_batch_sampler_mismatched,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
    else:
        dev_dataset = dataset_class.get_datasets(["dev"])
        dev_dataset = dev_dataset.apply(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_dataset,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     return_list=True)

    # A dataset without a label list gets a single output unit.
    num_classes = 1 if train_dataset.get_labels() is None else len(
        train_dataset.get_labels())
    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)
    warmup_steps = args.warmup_steps if args.warmup_steps > 0 else (int(
        math.floor(num_training_steps * args.warmup_proportion)))

    def _lr_factor(current_step,
                   num_warmup_steps=warmup_steps,
                   num_training_steps=num_training_steps):
        # Linear ramp 0 -> 1 during warmup, then linear decay to 0.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0,
            float(num_training_steps - current_step) /
            float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(args.learning_rate,
                                                   _lr_factor)

    # Names of the parameters that receive weight decay (everything except
    # bias and LayerNorm). Built once: the original rebuilt this list inside
    # the callback, i.e. on every call of `apply_decay_param_fun`.
    decay_params = {
        p.name
        for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    }
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Cross entropy when the dataset has a label list, MSE otherwise.
    loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels(
    ) else paddle.nn.loss.MSELoss()

    metric = metric_class()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()

            if global_step % args.logging_steps == 0:
                logger.info(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(model, loss_fct, metric, dev_data_loader_matched)
                    evaluate(model, loss_fct, metric,
                             dev_data_loader_mismatched)
                else:
                    evaluate(model, loss_fct, metric, dev_data_loader)
                logger.info("eval done total : %s s" %
                            (time.time() - tic_eval))

                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "%s_ft_model_%d.pdparams" %
                        (args.task_name, global_step))
                    # exist_ok avoids the check-then-create race.
                    os.makedirs(output_dir, exist_ok=True)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
def run(args):
    """Fine-tune and/or evaluate a question-answering model on SQuAD data.

    When ``args.do_train`` is set, the model is trained on either
    ``args.train_file`` or the built-in SQuAD v1/v2 training split, saving
    checkpoints every ``args.save_steps`` steps and at the final step.
    When ``args.do_predict`` is set, the model is evaluated on
    ``args.predict_file`` or the built-in dev split (rank 0 only).

    Args:
        args: Parsed command-line arguments. ``model_type`` is lower-cased
            in place.
    """
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    set_seed(args)

    if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        """Tokenize training examples and attach answer start/end positions."""
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
        # that HuggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead.
        contexts = [example['context'] for example in examples]
        questions = [example['question'] for example in examples]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # Let's label those examples!
        for i, tokenized_example in enumerate(tokenized_examples):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_example["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # The offset mappings will give us a map from token to character position in the original context. This will
            # help us compute the start_positions and end_positions.
            offsets = tokenized_example['offset_mapping']

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            answers = examples[sample_index]['answers']
            answer_starts = examples[sample_index]['answer_starts']

            # If no answers are given, set the cls_index as answer.
            if len(answer_starts) == 0:
                tokenized_examples[i]["start_positions"] = cls_index
                tokenized_examples[i]["end_positions"] = cls_index
            else:
                # Start/end character index of the answer in the text.
                start_char = answer_starts[0]
                end_char = start_char + len(answers[0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1
                # Minus one more to reach actual text
                token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples[i]["start_positions"] = cls_index
                    tokenized_examples[i]["end_positions"] = cls_index
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples[i][
                        "start_positions"] = token_start_index - 1
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples[i][
                        "end_positions"] = token_end_index + 1

        return tokenized_examples

    if args.do_train:
        if args.train_file:
            # Dataset name was misspelled 'sqaud' here.
            train_ds = load_dataset('squad', data_files=args.train_file)
        elif args.version_2_with_negative:
            train_ds = load_dataset('squad', splits='train_v2')
        else:
            train_ds = load_dataset('squad', splits='train_v1')
        train_ds.map(prepare_train_features, batched=True)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_batchify_fn = lambda samples, fn=Dict(
            {
                "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
                "token_type_ids": Pad(axis=0,
                                      pad_val=tokenizer.pad_token_type_id),
                "start_positions": Stack(dtype="int64"),
                "end_positions": Stack(dtype="int64")
            }): fn(samples)

        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        # Enough epochs to cover the requested number of steps.
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded. A set keeps the
        # per-parameter membership test O(1).
        decay_params = {
            p.name
            for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        }
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                input_ids, token_type_ids, start_positions, end_positions = batch

                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids)
                loss = criterion(logits, (start_positions, end_positions))

                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch + 1, step + 1, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if (not args.n_gpu > 1
                        ) or paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        # exist_ok avoids the check-then-create race.
                        os.makedirs(output_dir, exist_ok=True)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)
                    if global_step == num_training_steps:
                        break

    def prepare_validation_features(examples):
        """Tokenize validation examples, keeping ids and context-only offsets."""
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        # NOTE: Almost the same functionality as HuggingFace's prepare_validation_features function. The main
        # difference is that HuggingFace uses ArrowTable as basic data structure, while we use list of dictionary
        # instead.
        contexts = [example['context'] for example in examples]
        questions = [example['question'] for example in examples]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # For validation, there is no need to compute start and end positions
        for i, tokenized_example in enumerate(tokenized_examples):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            tokenized_examples[i]["example_id"] = examples[sample_index]['id']

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples[i]["offset_mapping"] = [
                (o if sequence_ids[k] == 1 else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]

        return tokenized_examples

    if args.do_predict:
        if args.predict_file:
            # Dataset name was misspelled 'sqaud' here.
            dev_ds = load_dataset('squad', data_files=args.predict_file)
        elif args.version_2_with_negative:
            dev_ds = load_dataset('squad', splits='dev_v2')
        else:
            dev_ds = load_dataset('squad', splits='dev_v1')

        dev_ds.map(prepare_validation_features, batched=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)

        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)

        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)

        if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
            evaluate(model, dev_data_loader, args)
def __init__(self, config, mode="train"):
    """Build everything a run in the given mode needs.

    Depending on ``mode`` this sets up seeding, logging, the device, AMP
    settings, data loaders, loss and metric functions, the model (with
    optional pruning/quantization and pretrained weights), the optimizer,
    distributed wrapping, and inference pre/post-processing.

    Args:
        config: Run configuration; the ``Global``, ``Arch``, ``DataLoader``,
            ``Loss``, ``Metric``, ``Optimizer``, ``AMP`` and ``Infer``
            sections are read here.
        mode: One of ``"train"``, ``"eval"``, ``"infer"`` or ``"export"``.

    Raises:
        AssertionError: If ``mode``, the eval mode, the device or the seed
            type is invalid.
    """
    assert mode in ["train", "eval", "infer", "export"]
    self.mode = mode
    self.config = config
    self.eval_mode = self.config["Global"].get("eval_mode",
                                               "classification")
    # Set when the architecture config declares a "Head" section.
    self.is_rec = "Head" in self.config["Arch"]

    # set seed
    seed = self.config["Global"].get("seed", False)
    # `False == 0` is True in Python, so the "not configured" default has to
    # be excluded explicitly; otherwise every unseeded run would be seeded
    # with 0. An explicit seed of 0 is still honoured.
    if seed is not False and (seed or seed == 0):
        assert isinstance(seed, int), "The 'seed' must be a integer!"
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    # init logger
    self.output_dir = self.config['Global']['output_dir']
    log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                            f"{mode}.log")
    init_logger(name='root', log_file=log_file)
    print_config(config)

    # init train_func and eval_func
    # The message expression is only evaluated when the assertion fails, so
    # the error is logged exactly in that case.
    assert self.eval_mode in ["classification", "retrieval"], logger.error(
        "Invalid eval mode: {}".format(self.eval_mode))
    self.train_epoch_func = train_epoch
    self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

    self.use_dali = self.config['Global'].get("use_dali", False)

    # for visualdl
    self.vdl_writer = None
    if self.config['Global']['use_visualdl'] and mode == "train":
        vdl_writer_path = os.path.join(self.output_dir, "vdl")
        # exist_ok avoids the check-then-create race between processes.
        os.makedirs(vdl_writer_path, exist_ok=True)
        self.vdl_writer = LogWriter(logdir=vdl_writer_path)

    # set device
    assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu"]
    self.device = paddle.set_device(self.config["Global"]["device"])
    logger.info('train with paddle {} and device {}'.format(
        paddle.__version__, self.device))

    # AMP training: enabled by the mere presence of an "AMP" section.
    self.amp = "AMP" in self.config
    if self.amp and self.config["AMP"] is not None:
        self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
        self.use_dynamic_loss_scaling = self.config["AMP"].get(
            "use_dynamic_loss_scaling", False)
    else:
        self.scale_loss = 1.0
        self.use_dynamic_loss_scaling = False
    if self.amp:
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

    # TODO(gaotingquan): support rec
    class_num = config["Arch"].get("class_num", None)
    self.config["DataLoader"].update({"class_num": class_num})

    # Evaluation components are needed in eval mode, and in train mode when
    # evaluation during training is switched on.
    needs_eval = self.mode == "eval" or (
        self.mode == "train"
        and self.config["Global"]["eval_during_train"])

    # build dataloader
    if self.mode == 'train':
        self.train_dataloader = build_dataloader(
            self.config["DataLoader"], "Train", self.device, self.use_dali)
    if needs_eval:
        if self.eval_mode == "classification":
            self.eval_dataloader = build_dataloader(
                self.config["DataLoader"], "Eval", self.device,
                self.use_dali)
        elif self.eval_mode == "retrieval":
            self.gallery_query_dataloader = None
            eval_loader_config = self.config["DataLoader"]["Eval"]
            if len(eval_loader_config.keys()) == 1:
                # A single entry serves as both gallery and query set.
                key = list(eval_loader_config.keys())[0]
                self.gallery_query_dataloader = build_dataloader(
                    eval_loader_config, key, self.device, self.use_dali)
            else:
                self.gallery_dataloader = build_dataloader(
                    eval_loader_config, "Gallery", self.device,
                    self.use_dali)
                self.query_dataloader = build_dataloader(
                    eval_loader_config, "Query", self.device,
                    self.use_dali)

    # build loss
    if self.mode == "train":
        loss_info = self.config["Loss"]["Train"]
        self.train_loss_func = build_loss(loss_info)
    self.eval_loss_func = None
    if needs_eval:
        loss_config = self.config.get("Loss", None)
        if loss_config is not None:
            loss_config = loss_config.get("Eval")
            if loss_config is not None:
                self.eval_loss_func = build_loss(loss_config)

    # build metric
    self.train_metric_func = None
    if self.mode == 'train':
        metric_config = self.config.get("Metric")
        if metric_config is not None:
            metric_config = metric_config.get("Train")
            if metric_config is not None:
                self.train_metric_func = build_metrics(metric_config)

    # Always define the attribute: previously it stayed unset for
    # classification eval when no "Metric"/"Eval" section was configured.
    self.eval_metric_func = None
    if needs_eval:
        metric_config = self.config.get("Metric")
        if self.eval_mode == "classification":
            if metric_config is not None:
                metric_config = metric_config.get("Eval")
                if metric_config is not None:
                    self.eval_metric_func = build_metrics(metric_config)
        elif self.eval_mode == "retrieval":
            if metric_config is None:
                # Default retrieval metric: recall at 1 and 5.
                metric_config = [{"name": "Recallk", "topk": (1, 5)}]
            else:
                metric_config = metric_config["Eval"]
            self.eval_metric_func = build_metrics(metric_config)

    # build model
    self.model = build_model(self.config["Arch"])
    # set @to_static for benchmark, skip this by default.
    apply_to_static(self.config, self.model)

    # for slim
    self.pruner = get_pruner(self.config, self.model)
    self.quanter = get_quaner(self.config, self.model)

    # load_pretrain
    pretrained_model = self.config["Global"]["pretrained_model"]
    if pretrained_model is not None:
        if pretrained_model.startswith("http"):
            load_dygraph_pretrain_from_url(self.model, pretrained_model)
        else:
            load_dygraph_pretrain(self.model, pretrained_model)

    # build optimizer
    if self.mode == 'train':
        self.optimizer, self.lr_sch = build_optimizer(
            self.config["Optimizer"], self.config["Global"]["epochs"],
            len(self.train_dataloader), [self.model])

    # for distributed
    self.config["Global"][
        "distributed"] = paddle.distributed.get_world_size() != 1
    if self.config["Global"]["distributed"]:
        dist.init_parallel_env()
        self.model = paddle.DataParallel(self.model)

    # build postprocess for infer
    if self.mode == 'infer':
        self.preprocess_func = create_operators(
            self.config["Infer"]["transforms"])
        self.postprocess_func = build_postprocess(
            self.config["Infer"]["PostProcess"])
def do_train():
    """Fine-tune ERNIE for token classification and keep the best checkpoint.

    All settings are read from the module-level ``args`` object. The function
    builds the train/dev/test datasets, trains for ``args.num_epoch`` epochs
    and, on rank 0 only, periodically logs the loss, evaluates on the dev set
    and saves ``best.pdparams`` whenever the dev F1 improves. After training,
    rank 0 saves ``final.pdparams``. Both files go under ``args.checkpoints``.
    """
    set_seed(args)
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    no_entity_label = "O"
    ignore_label = -1

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))
    # Only wrap the model when several workers are actually running, matching
    # the other training entry points in this file.
    if world_size > 1:
        model = paddle.DataParallel(model)

    print("============start train==========")
    train_ds = DuEventExtraction(args.train_data, args.vocab_path,
                                 args.tag_path)
    dev_ds = DuEventExtraction(args.dev_data, args.vocab_path, args.tag_path)
    test_ds = DuEventExtraction(args.test_data, args.vocab_path, args.tag_path)

    trans_func = partial(convert_example_to_feature,
                         tokenizer=tokenizer,
                         label_vocab=train_ds.label_vocab,
                         max_seq_len=args.max_seq_len,
                         no_entity_label=no_entity_label,
                         ignore_label=ignore_label,
                         is_test=False)
    pad_id = tokenizer.vocab[tokenizer.pad_token]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_id),  # input ids
        Pad(axis=0, pad_val=pad_id),  # token type ids
        Stack(),  # sequence lens
        Pad(axis=0, pad_val=ignore_label)  # labels
    ): fn(list(map(trans_func, samples)))

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    train_loader = paddle.io.DataLoader(dataset=train_ds,
                                        batch_sampler=batch_sampler,
                                        collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(dataset=dev_ds,
                                      batch_size=args.batch_size,
                                      collate_fn=batchify_fn)
    # NOTE(review): the test loader is built but never consumed in this
    # function; it is kept so the test file is still loaded as before.
    test_loader = paddle.io.DataLoader(dataset=test_ds,
                                       batch_size=args.batch_size,
                                       collate_fn=batchify_fn)

    num_training_steps = len(train_loader) * args.num_epoch
    optimizer = paddle.optimizer.AdamW(learning_rate=args.learning_rate,
                                       parameters=model.parameters())

    metric = ChunkEvaluator(label_list=train_ds.label_vocab.keys(),
                            suffix=True)
    criterion = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    step, best_f1 = 0, 0.0
    model.train()
    for epoch in range(args.num_epoch):
        for input_ids, token_type_ids, _seq_lens, labels in train_loader:
            logits = model(input_ids,
                           token_type_ids).reshape([-1, train_ds.label_num])
            loss = paddle.mean(criterion(logits, labels.reshape([-1])))
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()

            # Logging, evaluation and checkpointing happen on rank 0 only.
            is_main = paddle.distributed.get_rank() == 0
            if step > 0 and step % args.skip_step == 0 and is_main:
                print(
                    f'train epoch: {epoch} - step: {step} (total: {num_training_steps}) - loss: {loss_item:.6f}'
                )
            if step > 0 and step % args.valid_step == 0 and is_main:
                p, r, f1, avg_loss = evaluate(model, criterion, metric,
                                              len(label_map), dev_loader)
                print(f'dev step: {step} - loss: {avg_loss:.5f}, precision: {p:.5f}, recall: {r:.5f}, ' \
                      f'f1: {f1:.5f} current best {best_f1:.5f}')
                if f1 > best_f1:
                    best_f1 = f1
                    # ".5f" (precision), not "5f" (width): print the score
                    # with the same five decimals as the lines above.
                    print(f'==============================================save best model ' \
                          f'best performerence {best_f1:.5f}')
                    paddle.save(model.state_dict(),
                                '{}/best.pdparams'.format(args.checkpoints))
            step += 1

    # save the final model
    if paddle.distributed.get_rank() == 0:
        paddle.save(model.state_dict(),
                    '{}/final.pdparams'.format(args.checkpoints))
def run(args):
    """Train and/or predict with a multiple-choice model on the CLUE CHID data.

    When ``args.do_train`` is set, the model is fine-tuned with gradient
    accumulation, evaluated on the validation split after every epoch, and
    the best-scoring model and tokenizer are saved to ``args.output_dir`` by
    rank 0. When ``args.do_predict`` is set, predictions for the test split
    are written to ``chid11_predict.json`` in ``args.output_dir``.

    Args:
        args: parsed command-line namespace. ``args.batch_size`` is divided
            in place by ``args.gradient_accumulation_steps`` when training.
    """
    paddle.set_device(args.device)
    set_seed(args)

    max_seq_length = args.max_seq_length
    max_num_choices = 10

    def preprocess_function(examples, do_predict=False):
        """Turn a batch of raw CHID examples into per-blank model inputs.

        Every ``#idiom...#`` blank in every sentence yields one sample whose
        ``input_ids``/``token_type_ids`` hold one sequence per candidate.
        ``labels`` is filled only when ``do_predict`` is False.
        """
        SPIECE_UNDERLINE = '▁'
        punctuation = frozenset('。,!?;、:()-~「《》」"“”$『』—‘’')

        def _is_chinese_char(cp):
            """Return True if code point ``cp`` lies in one of the CJK ranges."""
            return ((0x4E00 <= cp <= 0x9FFF) or
                    (0x3400 <= cp <= 0x4DBF) or
                    (0x20000 <= cp <= 0x2A6DF) or
                    (0x2A700 <= cp <= 0x2B73F) or
                    (0x2B740 <= cp <= 0x2B81F) or
                    (0x2B820 <= cp <= 0x2CEAF) or
                    (0xF900 <= cp <= 0xFAFF) or
                    (0x2F800 <= cp <= 0x2FA1F))

        def is_fuhao(c):
            """Return True if ``c`` is one of the handled punctuation marks."""
            return c in punctuation

        def _tokenize_chinese_chars(text):
            """Adds whitespace around any CJK character."""
            output = []
            is_blank = False
            for index, char in enumerate(text):
                cp = ord(char)
                if is_blank:
                    output.append(char)
                    # The blank tag ends once "#idiom" started 12 characters
                    # back. Look at ``text`` itself (not the caller's loop
                    # variable) and never let the start index go negative,
                    # which would wrap around on short inputs.
                    if index >= 12 and text.startswith("#idiom", index - 12):
                        is_blank = False
                        output.append(SPIECE_UNDERLINE)
                else:
                    if text[index:index + 6] == "#idiom":
                        is_blank = True
                        if output and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                    elif _is_chinese_char(cp) or is_fuhao(char):
                        if output and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                        output.append(SPIECE_UNDERLINE)
                    else:
                        output.append(char)
            return "".join(output)

        def is_whitespace(c):
            """Return True for blanks, including U+202F and the '▁' marker."""
            return c in (" ", "\t", "\r", "\n", "\u202f", SPIECE_UNDERLINE)

        def add_tokens_for_around(tokens, pos, num_tokens):
            """Pick up to ``num_tokens`` context tokens around ``tokens[pos]``.

            Returns the left and right context lists; when one side is short
            the remaining budget is given to the other side.
            """
            num_l = num_tokens // 2
            num_r = num_tokens - num_l

            if pos >= num_l and (len(tokens) - 1 - pos) >= num_r:
                tokens_l = tokens[pos - num_l:pos]
                tokens_r = tokens[pos + 1:pos + 1 + num_r]
            elif pos <= num_l:
                tokens_l = tokens[:pos]
                right_len = num_tokens - len(tokens_l)
                tokens_r = tokens[pos + 1:pos + 1 + right_len]
            elif (len(tokens) - 1 - pos) <= num_r:
                tokens_r = tokens[pos + 1:]
                left_len = num_tokens - len(tokens_r)
                tokens_l = tokens[pos - left_len:pos]
            else:
                raise ValueError('impossible')

            return tokens_l, tokens_r

        max_tokens_for_doc = max_seq_length - 3
        num_tokens = max_tokens_for_doc - 5
        num_examples = len(examples.data["candidates"])
        if do_predict:
            result = {"input_ids": [], "token_type_ids": []}
        else:
            result = {"input_ids": [], "token_type_ids": [], "labels": []}
        for idx in range(num_examples):
            candidate = 0
            options = examples.data['candidates'][idx]

            # Each content may have several sentences.
            for context in examples.data['content'][idx]:
                context = context.replace("“", "\"").replace("”", "\"").replace("——", "--"). \
                    replace("—", "-").replace("―", "-").replace("…", "...").replace("‘", "\'").replace("’", "\'")
                context = _tokenize_chinese_chars(context)
                paragraph_text = context.strip()

                # Split on whitespace while keeping each blank tag intact.
                doc_tokens = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False

                all_doc_tokens = []
                for token in doc_tokens:
                    if '#idiom' in token:
                        # Blank tags are kept as a single token.
                        all_doc_tokens.append(str(token))
                    else:
                        all_doc_tokens.extend(tokenizer.tokenize(token))
                tags = [blank for blank in doc_tokens if '#idiom' in blank]

                # Each sentence may have several tags
                for tag in tags:
                    pos = all_doc_tokens.index(tag)

                    tmp_l, tmp_r = add_tokens_for_around(all_doc_tokens, pos,
                                                         num_tokens)
                    num_l = len(tmp_l)
                    num_r = len(tmp_r)

                    tokens_l = []
                    for token in tmp_l:
                        if '#idiom' in token and token != tag:
                            # Mask tag which is not considered in this new sample.
                            # Each idiom has four words, so 4 mask tokens are used.
                            tokens_l.extend(['[MASK]'] * 4)
                        else:
                            tokens_l.append(token)
                    tokens_l = tokens_l[-num_l:]
                    del tmp_l

                    tokens_r = []
                    for token in tmp_r:
                        if '#idiom' in token and token != tag:
                            tokens_r.extend(['[MASK]'] * 4)
                        else:
                            tokens_r.append(token)
                    tokens_r = tokens_r[:num_r]
                    del tmp_r

                    # Each tag has ten choices, and the shape of each new
                    # example is [num_choices, seq_len]
                    tokens_list = [
                        tokenizer.tokenize(elem) + ['[SEP]'] + tokens_l +
                        ['[unused1]'] + tokens_r for elem in options
                    ]
                    new_data = tokenizer(tokens_list, is_split_into_words=True)
                    # Final shape of input_ids: [batch_size, num_choices, seq_len]
                    result["input_ids"].append(new_data["input_ids"])
                    result["token_type_ids"].append(
                        new_data["token_type_ids"])
                    if not do_predict:
                        label = examples.data["answers"][idx]["candidate_id"][
                            candidate]
                        result["labels"].append(label)
                    candidate += 1
            if (idx + 1) % 10000 == 0:
                print(idx + 1, "samples have been processed.")
        return result

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, num_choices=max_num_choices)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds, test_ds = load_dataset(
        "clue", "chid", split=["train", "validation", "test"])

    if args.do_train:
        # The configured batch size is the effective one; each forward pass
        # sees only a 1/gradient_accumulation_steps share of it.
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)
        column_names = train_ds.column_names
        train_ds = train_ds.map(partial(preprocess_function),
                                batched=True,
                                batch_size=len(train_ds),
                                num_proc=1,
                                remove_columns=column_names)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'labels': Stack(dtype="int64")  # label
        }): fn(samples)

        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)

        dev_ds = dev_ds.map(partial(preprocess_function),
                            batched=True,
                            batch_size=len(dev_ds),
                            remove_columns=column_names,
                            num_proc=1)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)
        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_ds,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        num_training_steps = int(
            len(train_data_loader) * args.num_train_epochs /
            args.gradient_accumulation_steps)
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, 0)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]

        grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=grad_clip)

        loss_fct = nn.CrossEntropyLoss()
        metric = Accuracy()

        model.train()
        global_step = 0
        best_acc = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, segment_ids, labels = batch
                logits = model(input_ids=input_ids,
                               token_type_ids=segment_ids)
                loss = loss_fct(logits, labels)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                # Apply the accumulated gradients once per accumulation window.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, loss, args.logging_steps /
                               (time.time() - tic_train)))
                        tic_train = time.time()

            # Evaluate after each epoch and keep only the best model.
            tic_eval = time.time()
            acc = evaluate(model, loss_fct, metric, dev_data_loader)
            print("eval acc: %.5f, eval done total : %s s" %
                  (acc, time.time() - tic_eval))
            if paddle.distributed.get_rank() == 0 and acc > best_acc:
                best_acc = acc
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                if not os.path.exists(args.output_dir):
                    os.makedirs(args.output_dir)
                model_to_save.save_pretrained(args.output_dir)
                tokenizer.save_pretrained(args.output_dir)

        print("best_acc: ", best_acc)

    if args.do_predict:
        column_names = test_ds.column_names
        test_ds = test_ds.map(partial(preprocess_function, do_predict=True),
                              batched=True,
                              batch_size=len(test_ds),
                              remove_columns=column_names,
                              num_proc=1)
        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
        }): fn(samples)
        test_data_loader = paddle.io.DataLoader(
            dataset=test_ds,
            batch_sampler=test_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        result = {}
        # Result keys are "#idiom<N>", numbered upward from this value.
        # NOTE(review): presumably the first blank id of the test split - confirm.
        idx = 623377
        for step, batch in enumerate(test_data_loader):
            input_ids, segment_ids = batch
            with paddle.no_grad():
                logits = model(input_ids, segment_ids)
            preds = paddle.argmax(logits, axis=1).numpy().tolist()
            for pred in preds:
                result["#idiom" + str(idx)] = pred
                idx += 1

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(os.path.join(args.output_dir, 'chid11_predict.json'),
                  "w",
                  encoding='utf-8') as writer:
            writer.write(
                json.dumps(result, ensure_ascii=False, indent=4) + "\n")
def do_train():
    """Fine-tune a RoBERTa sequence classifier on a sentence-pair dataset.

    Settings come from the module-level ``args``. ``args.language == 'ch'``
    selects the LCQMC data and the ``roberta-wwm-ext`` checkpoints; any other
    value selects GLUE QQP and the English ``roberta-*`` checkpoints. On rank 0
    the loop logs every 100 steps, evaluates every ``args.eval_step`` steps and
    saves parameters and tokenizer every ``args.save_step`` steps under
    ``args.save_dir``.

    Raises:
        ValueError: if ``args.base_model`` is neither ``'roberta_base'`` nor
            ``'roberta_large'``.
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    is_chinese = args.language == 'ch'
    if is_chinese:
        tokenizer_class = RobertaTokenizer
        checkpoints = {
            'roberta_base': 'roberta-wwm-ext',
            'roberta_large': 'roberta-wwm-ext-large',
        }
    else:
        tokenizer_class = RobertaBPETokenizer
        checkpoints = {
            'roberta_base': 'roberta-base',
            'roberta_large': 'roberta-large',
        }
    # Fail early with a clear message instead of hitting an unbound
    # ``tokenizer``/``pretrained_model`` further down.
    if args.base_model not in checkpoints:
        raise ValueError(
            "Unsupported base_model %r; expected one of %s" %
            (args.base_model, sorted(checkpoints)))
    checkpoint = checkpoints[args.base_model]

    if is_chinese:
        train_ds, dev_ds = load_dataset("lcqmc", splits=["train", "dev"])
    else:
        train_ds, dev_ds = load_dataset("glue", "qqp", splits=["train", "dev"])
    tokenizer = tokenizer_class.from_pretrained(checkpoint)
    pretrained_model = RobertaForSequenceClassification.from_pretrained(
        checkpoint, num_classes=2)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length,
                         language=args.language)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_pair_segment
        Stack(dtype="int64")  # label
    ): list(fn(samples))

    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    model = pretrained_model

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    # Only wrap the model when several workers are actually running, matching
    # the other training entry points in this file.
    if world_size > 1:
        model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            probs = model(input_ids=input_ids, token_type_ids=token_type_ids)
            loss = criterion(probs, labels)
            # Running accuracy over everything seen since the metric was
            # last reset.
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 100 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc, 100 /
                       (time.time() - tic_train)),
                    flush=True)
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.eval_step == 0 and rank == 0:
                evaluate(model, criterion, metric, dev_data_loader)

            if global_step % args.save_step == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                os.makedirs(save_dir, exist_ok=True)
                save_param_path = os.path.join(save_dir,
                                               'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)
def do_train(args):
    """Pre-train a RoBERTa masked-language model from scratch.

    The model is built from the module-level ``roberta_arch`` configuration
    and trained on the ``"train"`` split of the dataset stored on disk at
    ``args.input_file``. Training stops after ``args.max_steps`` optimizer
    steps when that value is positive, otherwise after
    ``args.num_train_epochs`` full epochs. Rank 0 saves the model and
    tokenizer every ``args.save_steps`` steps and at the final step, under
    ``args.output_dir``.

    Args:
        args: parsed command-line namespace with the options read below.
    """
    paddle.set_device(args.device)
    set_seed(args.seed)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Load model and train from scratch
    model = RobertaForMaskedLM(RobertaModel(**roberta_arch))
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)
    ignore_label = IGNORE
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    # Load wikipedia dataset via Hugging face datasets
    # TO DO: paddle datasets
    import datasets
    tokenized_datasets = datasets.load_from_disk(args.input_file)
    train_ds = tokenized_datasets["train"]

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')

    # Prepare data for training
    collator_func = DataCollatorMLM(tokenizer=tokenizer)  # data collator
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)
    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=collator_func,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    if args.amp:  # mixed precision (fp16)
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

    # Start training
    global_step = 0
    tic_train = time.time()
    for _epoch in range(args.num_train_epochs):
        for batch in train_data_loader:
            input_ids, _, labels = batch
            with paddle.amp.auto_cast(args.amp):
                logits = model(input_ids=input_ids)
                loss = loss_fct(logits, labels)

            if args.amp:
                # Back-propagate and step on the *scaled* loss so the scaler
                # can unscale the gradients consistently.
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                scaler.minimize(optimizer, scaled_loss)
            else:
                loss.backward()
                optimizer.step()

            lr_scheduler.step()
            optimizer.clear_grad()

            global_step += 1
            is_last_step = global_step >= num_training_steps
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, loss,
                       optimizer.get_lr(), args.logging_steps /
                       (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or is_last_step:
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "paddle_%d" % global_step)
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            # The LR schedule was sized for ``num_training_steps``; stop
            # there instead of running on with a fully decayed rate.
            if is_last_step:
                break
        if global_step >= num_training_steps:
            break
# Build the training and validation loaders; both share the same batching
# and example-conversion functions and differ only in dataset and mode.
shared_loader_options = dict(batch_size=args.batch_size,
                             batchify_fn=batchify_fn,
                             trans_fn=trans_func)
train_data_loader = create_dataloader(train_ds,
                                      mode='train',
                                      **shared_loader_options)
dev_data_loader = create_dataloader(dev_ds,
                                    mode='dev',
                                    **shared_loader_options)

# Optionally warm-start from a checkpoint file before wrapping the model.
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
    model.set_dict(paddle.load(args.init_from_ckpt))

model = paddle.DataParallel(model)

num_training_steps = len(train_data_loader) * args.epochs

# Weight decay is applied only to parameters whose structured name contains
# neither "bias" nor "norm".
no_decay_markers = ["bias", "norm"]
decay_params = []
for structured_name, param in model.named_parameters():
    if any(marker in structured_name for marker in no_decay_markers):
        continue
    decay_params.append(param.name)

optimizer = paddle.optimizer.AdamW(
    learning_rate=args.learning_rate,
    parameters=model.parameters(),
    weight_decay=args.weight_decay,
    apply_decay_param_fun=lambda x: x in decay_params)

criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()
def run(args):
    """Fine-tune and/or evaluate a question-answering model on SQuAD.

    ``args.version_2_with_negative`` selects ``squad_v2`` instead of
    ``squad``. With ``args.do_train`` the model is trained until
    ``num_training_steps`` is reached, and rank 0 writes checkpoints to
    ``args.output_dir``. With ``args.do_predict`` rank 0 runs ``evaluate`` on
    the validation split.

    Args:
        args: parsed command-line namespace; ``args.model_type`` is
            lower-cased in place.
    """
    paddle.set_device(args.device)
    distributed = paddle.distributed.get_world_size() > 1
    if distributed:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    dataset_name = 'squad_v2' if args.version_2_with_negative else 'squad'
    train_examples = load_dataset(dataset_name, split='train')
    dev_examples = load_dataset(dataset_name, split='validation')

    column_names = train_examples.column_names

    set_seed(args)
    if rank == 0 and os.path.exists(args.model_name_or_path):
        print("Loads checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.do_train:
        to_train_features = partial(prepare_train_features,
                                    tokenizer=tokenizer,
                                    args=args)
        train_ds = train_examples.map(to_train_features,
                                      batched=True,
                                      remove_columns=column_names,
                                      num_proc=4)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)

        # Collator built once; pads the id sequences and stacks the answer
        # positions into int64 arrays.
        train_collator = Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64")
        })

        def train_batchify_fn(samples, fn=train_collator):
            return fn(samples)

        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        steps_per_epoch = len(train_data_loader)
        if args.max_steps > 0:
            num_training_steps = args.max_steps
        else:
            num_training_steps = steps_per_epoch * args.num_train_epochs
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Weight decay skips every parameter whose structured name contains
        # "bias" or "norm".
        decay_params = []
        for structured_name, param in model.named_parameters():
            if not any(nd in structured_name for nd in ["bias", "norm"]):
                decay_params.append(param.name)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                # The token type ids in the batch are not fed to the model.
                input_ids, _, start_positions, end_positions = batch
                logits = model(input_ids)
                loss = criterion(logits, (start_positions, end_positions))

                if global_step % args.logging_steps == 0:
                    elapsed = time.time() - tic_train
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch + 1, step + 1, loss,
                           args.logging_steps / elapsed))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                finished = global_step == num_training_steps
                if global_step % args.save_steps == 0 or finished:
                    if rank == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        if isinstance(model, paddle.DataParallel):
                            model_to_save = model._layers
                        else:
                            model_to_save = model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)
                if finished:
                    break

    if args.do_predict and rank == 0:
        to_dev_features = partial(prepare_validation_features,
                                  tokenizer=tokenizer,
                                  args=args)
        dev_ds = dev_examples.map(to_dev_features,
                                  batched=True,
                                  remove_columns=column_names,
                                  num_proc=4)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)

        dev_collator = Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        })

        def dev_batchify_fn(samples, fn=dev_collator):
            return fn(samples)

        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)

        evaluate(model, dev_data_loader, args)
def train(env):
    """Train the parser, keeping the checkpoint with the best dev metric.

    Loads the train/valid/test corpora named in ``env.args``, builds the
    model, learning-rate schedule and optimizer for the configured encoder,
    then trains for up to ``args.epochs`` epochs. On local rank 0 the dev and
    test sets are evaluated after each epoch, the model is saved when the dev
    metric improves, and training stops once ``args.patience`` epochs pass
    without improvement. Finally rank 0 reloads the saved model and reports
    its test score.
    """
    args = env.args

    logging.info("loading ddparser_data.")
    train_set = Corpus.load(args.train_data_path, env.fields)
    dev_set = Corpus.load(args.valid_data_path, env.fields)
    test_set = Corpus.load(args.test_data_path, env.fields)

    logging.info("init dataset.")
    train_set = TextDataset(train_set, env.fields, args.buckets)
    dev_set = TextDataset(dev_set, env.fields, args.buckets)
    test_set = TextDataset(test_set, env.fields, args.buckets)

    logging.info("set the ddparser_data loaders.")
    train_set.loader = batchify(train_set, args.batch_size,
                                args.use_data_parallel, True)
    dev_set.loader = batchify(dev_set, args.batch_size)
    test_set.loader = batchify(test_set, args.batch_size)

    # One summary line per split.
    for label, dataset in (('train:', train_set), ('dev:', dev_set),
                           ('test:', test_set)):
        logging.info("{:6} {:5} sentences, ".format(label, len(dataset)) +
                     "{:3} batches, ".format(len(dataset.loader)) +
                     "{} buckets".format(len(dataset.buckets)))

    logging.info("Create the model")
    model = Model(args)

    # init parallel strategy
    if args.use_data_parallel:
        dist.init_parallel_env()
        model = paddle.DataParallel(model)

    # "ernie*" encoders other than "ernie-lstm" get the ERNIE-specific
    # schedule and optimizer below.
    pure_ernie = (args.encoding_model.startswith("ernie")
                  and args.encoding_model != "ernie-lstm")

    if pure_ernie or args.encoding_model == 'transformer':
        args['lr'] = args.ernie_lr
    else:
        args['lr'] = args.lstm_lr

    if pure_ernie:
        max_steps = 100 * len(train_set.loader)
        warmup_steps = int(args.warmup_proportion * max_steps)
        decay = LinearDecay(args.lr, warmup_steps, max_steps)
    else:
        decay = dygraph.ExponentialDecay(learning_rate=args.lr,
                                         decay_steps=args.decay_steps,
                                         decay_rate=args.decay)

    grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.clip)

    if pure_ernie:
        optimizer = AdamW(
            learning_rate=decay,
            parameter_list=model.parameters(),
            weight_decay=args.weight_decay,
            grad_clip=grad_clip,
        )
    else:
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=decay,
            beta1=args.mu,
            beta2=args.nu,
            epsilon=args.epsilon,
            parameter_list=model.parameters(),
            grad_clip=grad_clip,
        )

    total_time = datetime.timedelta()
    best_e, best_metric = 1, Metric()

    puncts = dygraph.to_variable(env.puncts, zero_copy=False)
    logging.info("start training.")

    for epoch in range(1, args.epochs + 1):
        start = datetime.datetime.now()
        # train one epoch and update the parameter
        logging.info("Epoch {} / {}:".format(epoch, args.epochs))
        epoch_train(args, model, optimizer, train_set.loader, epoch)

        # Evaluation, checkpointing and early stopping run on rank 0 only.
        if args.local_rank != 0:
            continue

        loss, dev_metric = epoch_evaluate(args, model, dev_set.loader, puncts)
        logging.info("{:6} Loss: {:.4f} {}".format('dev:', loss, dev_metric))
        loss, test_metric = epoch_evaluate(args, model, test_set.loader,
                                           puncts)
        logging.info("{:6} Loss: {:.4f} {}".format('test:', loss,
                                                   test_metric))

        t = datetime.datetime.now() - start
        # save the model if it is the best so far
        improved = dev_metric > best_metric and epoch > args.patience // 10
        if improved:
            best_e, best_metric = epoch, dev_metric
            save(args.model_path, args, model, optimizer)
            logging.info("{}s elapsed (saved)\n".format(t))
        else:
            logging.info("{}s elapsed\n".format(t))
        total_time += t
        if epoch - best_e >= args.patience:
            break

    if args.local_rank != 0:
        return

    # Report the test score of the saved model.
    model = load(args.model_path, model)
    loss, metric = epoch_evaluate(args, model, test_set.loader, puncts)
    logging.info("max score of dev is {:.2%} at epoch {}".format(
        best_metric.score, best_e))
    logging.info("the score of test at epoch {} is {:.2%}".format(
        best_e, metric.score))
    logging.info("average time of each epoch is {}s".format(total_time /
                                                            epoch))
    logging.info("{}s elapsed".format(total_time))
def train(self, validate=False):
    """Run the training loop, optionally validating at snapshot epochs.

    Iterates ``self.loader`` from ``self.start_epoch`` up to
    ``self.cfg.epoch``, updating ``self.status`` and firing the composed
    callbacks at the start and end of training, of each epoch and of each
    step. When ``self.use_ema`` is set, the EMA weights are swapped in after
    each epoch and the original weights are restored once that epoch's
    validation is done.

    Args:
        validate: when true, evaluate on the ``EvalDataset`` every
            ``self.cfg.snapshot_epoch`` epochs and at the last epoch
            (single-process runs or local rank 0 only).
    """
    assert self.mode == 'train', "Model not in 'train' mode"
    metrics_reinitialized = False

    # if validation in training is enabled, metrics should be re-init
    if validate:
        self._init_metrics(validate=validate)
        self._reset_metrics()

    model = self.model
    if self.cfg.get('fleet', False):
        model = fleet.distributed_model(model)
        self.optimizer = fleet.distributed_optimizer(self.optimizer)
    elif self._nranks > 1:
        find_unused_parameters = self.cfg.get('find_unused_parameters',
                                              False)
        model = paddle.DataParallel(
            self.model, find_unused_parameters=find_unused_parameters)

    # initial fp16
    if self.cfg.get('fp16', False):
        scaler = amp.GradScaler(enable=self.cfg.use_gpu,
                                init_loss_scaling=1024)

    self.status.update({
        'epoch_id': self.start_epoch,
        'step_id': 0,
        'steps_per_epoch': len(self.loader)
    })

    for timer_key in ('batch_time', 'data_time'):
        self.status[timer_key] = stats.SmoothedValue(self.cfg.log_iter,
                                                     fmt='{avg:.4f}')
    self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

    if self.cfg.get('print_flops', False):
        self._flops(self.loader)

    self._compose_callback.on_train_begin(self.status)

    for epoch_id in range(self.start_epoch, self.cfg.epoch):
        self.status['mode'] = 'train'
        self.status['epoch_id'] = epoch_id
        self._compose_callback.on_epoch_begin(self.status)
        self.loader.dataset.set_epoch(epoch_id)
        model.train()
        iter_tic = time.time()
        for step_id, data in enumerate(self.loader):
            self.status['data_time'].update(time.time() - iter_tic)
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)

            if self.cfg.get('fp16', False):
                with amp.auto_cast(enable=self.cfg.use_gpu):
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']

                # model backward
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                # in dygraph mode, optimizer.minimize is equal to optimizer.step
                scaler.minimize(self.optimizer, scaled_loss)
            else:
                # model forward
                outputs = model(data)
                loss = outputs['loss']
                # model backward
                loss.backward()
                self.optimizer.step()

            # Record the rate used for this step before the scheduler moves on.
            self.status['learning_rate'] = self.optimizer.get_lr()
            self.lr.step()
            self.optimizer.clear_grad()

            is_main_process = self._nranks < 2 or self._local_rank == 0
            if is_main_process:
                self.status['training_staus'].update(outputs)

            self.status['batch_time'].update(time.time() - iter_tic)
            self._compose_callback.on_step_end(self.status)
            if self.use_ema:
                self.ema.update(self.model)
            iter_tic = time.time()

        # apply ema weight on model
        if self.use_ema:
            weight = copy.deepcopy(self.model.state_dict())
            self.model.set_dict(self.ema.apply())

        self._compose_callback.on_epoch_end(self.status)

        at_snapshot = ((epoch_id + 1) % self.cfg.snapshot_epoch == 0
                       or epoch_id == self.end_epoch - 1)
        if validate and (self._nranks < 2
                         or self._local_rank == 0) and at_snapshot:
            if not hasattr(self, '_eval_loader'):
                # build evaluation dataset and loader
                self._eval_dataset = self.cfg.EvalDataset
                self._eval_batch_sampler = paddle.io.BatchSampler(
                    self._eval_dataset,
                    batch_size=self.cfg.EvalReader['batch_size'])
                self._eval_loader = create('EvalReader')(
                    self._eval_dataset,
                    self.cfg.worker_num,
                    batch_sampler=self._eval_batch_sampler)
            # if validation in training is enabled, metrics should be re-init
            # the flag makes sure this code will only execute once
            if validate and not metrics_reinitialized:
                metrics_reinitialized = True
                self._init_metrics(validate=validate)
                self._reset_metrics()
            with paddle.no_grad():
                self.status['save_best_model'] = True
                self._eval_with_loader(self._eval_loader)

        # restore origin weight on model
        if self.use_ema:
            self.model.set_dict(weight)

    self._compose_callback.on_train_end(self.status)
def train(args):
    """Fine-tune LayoutXLM for relation extraction on an XFUN-style dataset.

    Builds tokenizer, model, datasets and loaders from ``args``, trains for
    ``args.num_train_epochs`` epochs and, on rank 0, periodically evaluates
    and writes ``checkpoint-best`` / ``checkpoint-latest`` under
    ``args.output_dir``.

    Args:
        args: parsed command-line namespace. ``args.train_batch_size`` is set
            here to the global (all-ranks) batch size.
    """
    os.makedirs(args.output_dir, exist_ok=True)
    set_seed(args)

    label2id_map, _ = get_label_maps()
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    world_size = paddle.distributed.get_world_size()
    is_distributed = world_size > 1

    # dist mode
    if is_distributed:
        paddle.distributed.init_parallel_env()

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
    base_model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
    model = LayoutXLMForRelationExtraction(base_model, dropout=None)

    # dist mode
    if is_distributed:
        model = paddle.DataParallel(model)

    train_dataset = XFUN(tokenizer,
                         data_dir=args.train_data_dir,
                         label_path=args.train_label_path,
                         label2id_map=label2id_map,
                         img_size=(224, 224),
                         max_seq_len=args.max_seq_length,
                         pad_token_label_id=pad_token_label_id,
                         contains_re=True,
                         add_special_ids=False,
                         return_attention_mask=True,
                         load_mode='all')

    eval_dataset = XFUN(tokenizer,
                        data_dir=args.eval_data_dir,
                        label_path=args.eval_label_path,
                        label2id_map=label2id_map,
                        img_size=(224, 224),
                        max_seq_len=args.max_seq_length,
                        pad_token_label_id=pad_token_label_id,
                        contains_re=True,
                        add_special_ids=False,
                        return_attention_mask=True,
                        load_mode='all')

    train_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
    # Global batch size across all ranks.
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, world_size)
    train_dataloader = paddle.io.DataLoader(train_dataset,
                                            batch_sampler=train_sampler,
                                            num_workers=8,
                                            use_shared_memory=True,
                                            collate_fn=DataCollator())

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=8,
        shuffle=False,
        collate_fn=DataCollator())

    t_total = len(train_dataloader) * args.num_train_epochs

    # build linear decay with warmup lr sch
    # NOTE(review): this scheduler is constructed but never handed to the
    # optimizer nor stepped (the step call below is commented out), so the
    # run uses the constant args.learning_rate. Left as-is; confirm intent.
    lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=t_total,
        end_lr=0.0,
        power=1.0)
    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler,
            args.warmup_steps,
            start_lr=0,
            end_lr=args.learning_rate, )
    grad_clip = paddle.nn.ClipGradByNorm(clip_norm=10)
    optimizer = paddle.optimizer.Adam(learning_rate=args.learning_rate,
                                      parameters=model.parameters(),
                                      epsilon=args.adam_epsilon,
                                      grad_clip=grad_clip,
                                      weight_decay=args.weight_decay)

    def save_checkpoint(output_dir):
        """Write model, tokenizer and training args into ``output_dir``."""
        os.makedirs(output_dir, exist_ok=True)
        # save_pretrained lives on the wrapped network, not on the
        # DataParallel wrapper, so unwrap before saving.
        model_to_save = (model._layers
                         if isinstance(model, paddle.DataParallel) else model)
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        paddle.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.info(f"Saving model checkpoint to {output_dir}")

    # Train!
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(
        f" Instantaneous batch size per GPU = {args.per_gpu_train_batch_size}"
    )
    # args.train_batch_size already includes the world size; multiplying by
    # it again would over-report the effective batch size.
    logger.info(
        f" Total train batch size (w. parallel, distributed & accumulation) = {args.train_batch_size}"
    )
    logger.info(f" Total optimization steps = {t_total}")

    global_step = 0
    train_dataloader_len = len(train_dataloader)
    best_metirc = {'f1': 0}
    is_main_rank = paddle.distributed.get_rank() == 0
    model.train()

    for epoch in range(int(args.num_train_epochs)):
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            # model outputs are always tuple in ppnlp (see doc)
            loss = outputs['loss']
            loss = loss.mean()

            logger.info(
                f"epoch: [{epoch}/{args.num_train_epochs}], iter: [{step}/{train_dataloader_len}], global_step:{global_step}, train loss: {np.mean(loss.numpy())}, lr: {optimizer.get_lr()}"
            )

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # lr_scheduler.step()  # Update learning rate schedule

            global_step += 1

            # Periodic evaluation (rank 0 only); keep the best-F1 checkpoint.
            if (is_main_rank and args.eval_steps > 0 and
                    global_step % args.eval_steps == 0 and
                    args.evaluate_during_training):
                results = evaluate(model, eval_dataloader, logger)
                if results['f1'] > best_metirc['f1']:
                    best_metirc = results
                    save_checkpoint(
                        os.path.join(args.output_dir, "checkpoint-best"))
                logger.info(f"eval results: {results}")
                logger.info(f"best_metirc: {best_metirc}")

            # Periodic "latest" checkpoint (rank 0 only).
            if (is_main_rank and args.save_steps > 0 and
                    global_step % args.save_steps == 0):
                save_checkpoint(
                    os.path.join(args.output_dir, "checkpoint-latest"))

    logger.info(f"best_metirc: {best_metirc}")
def main(args):
    """Pre-train a GeoGNN compound encoder and save one snapshot per epoch.

    Loads both JSON configs, builds the encoder and prediction model, shards
    the SMILES dataset across ranks, splits it into train/test parts and then
    alternates training and evaluation for ``args.max_epoch`` epochs.
    """
    compound_encoder_config = load_json_config(args.compound_encoder_config)
    model_config = load_json_config(args.model_config)
    if args.dropout_rate is not None:
        for config in (compound_encoder_config, model_config):
            config['dropout_rate'] = args.dropout_rate

    compound_encoder = GeoGNNModel(compound_encoder_config)
    model = GeoPredModel(model_config, compound_encoder)
    if args.distributed:
        model = paddle.DataParallel(model)
    opt = paddle.optimizer.Adam(
        learning_rate=args.lr, parameters=model.parameters())

    print('Total param num: %s' % (len(model.parameters())))
    for index, (qualified_name, tensor) in enumerate(
            model.named_parameters()):
        print(index, qualified_name, tensor.name)

    # Optionally warm-start the encoder from a saved state dict.
    if args.init_model is not None and args.init_model != "":
        compound_encoder.set_state_dict(paddle.load(args.init_model))
        print('Load state_dict from %s' % args.init_model)

    # Load the dataset and keep only this rank's shard.
    dataset = load_smiles_to_dataset(args.data_path)
    if args.DEBUG:
        dataset = dataset[100:180]
    dataset = dataset[dist.get_rank()::dist.get_world_size()]

    smiles_lens = list(map(len, dataset))
    print('Total size:%s' % (len(dataset)))
    print('Dataset smiles min/max/avg length: %s/%s/%s' %
          (np.min(smiles_lens), np.max(smiles_lens), np.mean(smiles_lens)))

    transform_fn = GeoPredTransformFn(model_config['pretrain_tasks'],
                                      model_config['mask_ratio'])
    # this step will be time consuming due to rdkit 3d calculation
    dataset.transform(transform_fn, num_workers=args.num_workers)

    # The leading (1 - test_ratio) fraction trains; the remainder tests.
    split_at = int(len(dataset) * (1 - args.test_ratio))
    train_dataset, test_dataset = dataset[:split_at], dataset[split_at:]
    print("Train/Test num: %s/%s" % (len(train_dataset), len(test_dataset)))

    collate_fn = GeoPredCollateFn(
        atom_names=compound_encoder_config['atom_names'],
        bond_names=compound_encoder_config['bond_names'],
        bond_float_names=compound_encoder_config['bond_float_names'],
        bond_angle_float_names=compound_encoder_config[
            'bond_angle_float_names'],
        pretrain_tasks=model_config['pretrain_tasks'],
        mask_ratio=model_config['mask_ratio'],
        Cm_vocab=model_config['Cm_vocab'])
    train_data_gen = train_dataset.get_data_loader(
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=True,
        collate_fn=collate_fn)

    list_test_loss = []
    for epoch_id in range(args.max_epoch):
        epoch_start = time.time()
        train_loss = train(args, model, opt, train_data_gen)
        test_loss = evaluate(args, model, test_dataset, collate_fn)

        # Only the single process, or rank 0 when distributed, saves and logs.
        if args.distributed and dist.get_rank() != 0:
            continue
        snapshot_path = '%s/epoch%d.pdparams' % (args.model_dir, epoch_id)
        paddle.save(compound_encoder.state_dict(), snapshot_path)
        list_test_loss.append(test_loss['loss'])
        print("epoch:%d train/loss:%s" % (epoch_id, train_loss))
        print("epoch:%d test/loss:%s" % (epoch_id, test_loss))
        print("Time used:%ss" % (time.time() - epoch_start))

    if not args.distributed or dist.get_rank() == 0:
        print('Best epoch id:%s' % np.argmin(list_test_loss))
def do_train():
    """Train a SimCSE model on top of ERNIE using the module-level ``args``.

    Every 10 global steps rank 0 prints the loss and speed; every
    ``args.eval_steps`` it evaluates on the dev set; every ``args.save_steps``
    it writes model parameters and tokenizer files under ``args.save_dir``.
    Returns early once ``args.max_steps`` (when positive) is reached.
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(
        read_simcse_text, data_path=args.train_set_file, lazy=False)
    dev_ds = load_dataset(
        read_text_pair, data_path=args.test_set_file, lazy=False)

    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-1.0',
        hidden_dropout_prob=args.dropout,
        attention_probs_dropout_prob=args.dropout)
    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    # Collators: pad the four id / segment fields; dev batches carry labels.
    train_collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # title_segment
    )
    dev_collate = Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # title_segment
        Stack(dtype="int64"),  # labels
    )

    def batchify_fn(samples):
        return list(train_collate(samples))

    def dev_batchify_fn(samples):
        return list(dev_collate(samples))

    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
    dev_data_loader = create_dataloader(
        dev_ds,
        mode='eval',
        batch_size=args.batch_size,
        batchify_fn=dev_batchify_fn,
        trans_fn=trans_func)

    model = SimCSE(
        pretrained_model,
        margin=args.margin,
        scale=args.scale,
        output_emb_size=args.output_emb_size)

    # Optional warm start from an existing parameter file.
    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        model.set_dict(paddle.load(args.init_from_ckpt))
        print("warmup from:{}".format(args.init_from_ckpt))

    model = paddle.DataParallel(model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
    else:
        num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    no_decay_markers = ["bias", "norm"]
    decay_params = [
        param.name for name, param in model.named_parameters()
        if not any(marker in name for marker in no_decay_markers)
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            (query_input_ids, query_token_type_ids, title_input_ids,
             title_token_type_ids) = batch

            loss = model(
                query_input_ids=query_input_ids,
                title_input_ids=title_input_ids,
                query_token_type_ids=query_token_type_ids,
                title_token_type_ids=title_token_type_ids)

            global_step += 1
            on_main_rank = rank == 0

            if on_main_rank and global_step % 10 == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()

            if on_main_rank and global_step % args.eval_steps == 0:
                # Evaluate the network wrapped by DataParallel.
                spearman_corr, total_num = do_evaluate(
                    model._layers, tokenizer, dev_data_loader,
                    args.infer_with_fc_pooler)
                print("global step: {}, spearman_corr: {:.4f}, total_num: {}".format(global_step, spearman_corr, total_num))

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if on_main_rank and global_step % args.save_steps == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(save_dir, 'model_state.pdparams'))
                tokenizer.save_pretrained(save_dir)

            if args.max_steps > 0 and global_step >= args.max_steps:
                return
def do_train():
    """Train ERNIE token classification for DuIE using the module-level ``args``.

    Reads ``predicate2id.json`` from ``args.data_path`` to size the label
    space, trains for ``args.num_train_epochs`` epochs, and on rank 0
    evaluates and saves a checkpoint every 10000 global steps and once more
    at the end.
    """
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Reads label_map.
    label_map_path = os.path.join(args.data_path, "predicate2id.json")
    if not os.path.isfile(label_map_path):
        sys.exit("{} dose not exists or is not a file.".format(label_map_path))
    with open(label_map_path, 'r', encoding='utf8') as fp:
        label_map = json.load(fp)
    num_classes = (len(label_map) - 2) * 2 + 2

    # Loads pretrained model ERNIE
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=num_classes)
    model = paddle.DataParallel(model)
    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    criterion = BCELossForDuIE()

    # Loads dataset.
    collator = DataCollator()

    train_dataset = DuIEDataset.from_file(
        os.path.join(args.data_path, 'train_data.json'), tokenizer,
        args.max_seq_length, True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=collator,
        return_list=True)

    eval_file_path = os.path.join(args.data_path, 'dev_data.json')
    test_dataset = DuIEDataset.from_file(eval_file_path, tokenizer,
                                         args.max_seq_length, True)
    test_batch_sampler = paddle.io.BatchSampler(
        test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=True)
    test_data_loader = DataLoader(
        dataset=test_dataset,
        batch_sampler=test_batch_sampler,
        collate_fn=collator,
        return_list=True)

    # Defines learning rate strategy.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_ratio)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    no_decay_markers = ["bias", "norm"]
    decay_params = [
        param.name for name, param in model.named_parameters()
        if not any(marker in name for marker in no_decay_markers)
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 10000
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        print("\n=====start training of %d epochs=====" % epoch)
        tic_epoch = time.time()
        model.train()
        for step, batch in enumerate(train_data_loader):
            # Only input ids and labels are used; the rest is unpacked to
            # keep the five-field batch contract.
            (input_ids, _seq_lens, _tok_start_index, _tok_end_index,
             labels) = batch
            logits = model(input_ids=input_ids)

            # Exclude positions whose token id is 0, 1 or 2 from the loss.
            not_id0 = (input_ids != 0)
            not_id1 = (input_ids != 1)
            not_id2 = (input_ids != 2)
            mask = not_id0.logical_and(not_id1).logical_and(not_id2)

            loss = criterion(logits, labels, mask)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            global_step += 1

            if rank == 0 and global_step % logging_steps == 0:
                print(
                    "epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s"
                    % (epoch, args.num_train_epochs, step, steps_by_epoch,
                       loss_item, logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if rank == 0 and global_step % save_steps == 0:
                print("\n=====start evaluating ckpt of %d steps=====" %
                      global_step)
                precision, recall, f1 = evaluate(
                    model, criterion, test_data_loader, eval_file_path,
                    "eval")
                print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
                      (100 * precision, 100 * recall, 100 * f1))
                print("saving checkpoing model_%d.pdparams to %s " %
                      (global_step, args.output_dir))
                checkpoint_path = os.path.join(
                    args.output_dir, "model_%d.pdparams" % global_step)
                paddle.save(model.state_dict(), checkpoint_path)
                model.train()  # back to train mode

        # Reuse the variable to hold the elapsed seconds of this epoch.
        tic_epoch = time.time() - tic_epoch
        print("epoch time footprint: %d hour %d min %d sec" %
              (tic_epoch // 3600, (tic_epoch % 3600) // 60, tic_epoch % 60))

    # Does final evaluation.
    if rank == 0:
        print("\n=====start evaluating last ckpt of %d steps=====" %
              global_step)
        precision, recall, f1 = evaluate(model, criterion, test_data_loader,
                                         eval_file_path, "eval")
        print("precision: %.2f\t recall: %.2f\t f1: %.2f\t" %
              (100 * precision, 100 * recall, 100 * f1))
        final_path = os.path.join(args.output_dir,
                                  "model_%d.pdparams" % global_step)
        paddle.save(model.state_dict(), final_path)
        print("\n=====training complete=====")
def main(args):
    """Train and/or test a BERT model on one of the dialogue tasks.

    The task name selects the dataset/metric classes, the batch collator and
    the model head (sequence classification, or token classification for
    ``atis_slot``). Training runs when ``args.do_train`` is set; testing runs
    on rank 0 when ``args.do_test`` is set.

    Raises:
        ValueError: when testing without training and ``args.init_from_ckpt``
            is empty.
    """
    paddle.set_device(args.device)
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    # The parallel environment is only needed for multi-card training.
    if world_size > 1 and args.do_train:
        dist.init_parallel_env()

    set_seed(args.seed)

    dataset_class, metric_class = TASK_CLASSES[args.task_name]
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    # Train and test examples are truncated to different maximum lengths.
    trans_func = partial(
        dataset_class.convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_len)
    test_trans_func = partial(
        dataset_class.convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.test_max_seq_len)
    metric = metric_class()

    # NOTE(review): a task name outside both branches below leaves
    # batchify_fn and model undefined; presumably TASK_CLASSES only holds
    # these names - confirm.
    if args.task_name in ('udc', 'dstc2', 'atis_intent', 'mrda', 'swda'):
        # One label per example: stack labels into an int64 array.
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Stack(dtype='int64')  # label
        ): fn(samples)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, num_classes=dataset_class.num_classes())
    elif args.task_name == 'atis_slot':
        # One label per token: label sequences are padded like the inputs.
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Pad(axis=0, pad_val=0, dtype='int64')  # label
        ): fn(samples)
        model = BertForTokenClassification.from_pretrained(
            args.model_name_or_path,
            num_classes=dataset_class.num_classes(),
            dropout=0.0)
    if world_size > 1 and args.do_train:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_data_loader = create_data_loader(args, dataset_class, trans_func,
                                               batchify_fn, 'train')
        # The dev loader is only built when evaluation is requested.
        if args.do_eval:
            dev_data_loader = create_data_loader(
                args, dataset_class, test_trans_func, batchify_fn, 'dev')
        else:
            dev_data_loader = None
        train(args, model, train_data_loader, dev_data_loader, metric,
              world_size, rank)

    if args.do_test:
        if rank == 0:
            test_data_loader = create_data_loader(
                args, dataset_class, test_trans_func, batchify_fn, 'test')
            # NOTE(review): the if/else nesting here was reconstructed from
            # whitespace-collapsed text; the else is taken to pair with
            # `if args.do_train` - confirm against the original layout.
            if args.do_train:
                # If do_eval=True, use best model to evaluate the test data.
                # Otherwise, use final model to evaluate the test data.
                if args.do_eval:
                    args.init_from_ckpt = os.path.join(args.output_dir, 'best')
                    load_ckpt(args, model)
            else:
                # Test-only run: a checkpoint must be given explicitly.
                if not args.init_from_ckpt:
                    raise ValueError('"init_from_ckpt" should be set.')
                load_ckpt(args, model)
            print('\nTest begin...')
            evaluation(args, model, test_data_loader, metric)
# Optimizer / AMP setup followed by the start of the pre-training loop.
# NOTE(review): this fragment relies on `model`, `args`, `train_ds`, `env`,
# `get_warmup_and_linear_decay` and `create_if_not_exists` being defined
# earlier, and the visible loop body stops right after the loss is scaled.

# Names matching this pattern must NOT receive weight decay.
param_name_to_exclue_from_weight_decay = re.compile(
    r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')

lr_scheduler = P.optimizer.lr.LambdaDecay(
    args.lr,
    get_warmup_and_linear_decay(args.max_steps, args.warmup_steps))
g_clip = P.nn.ClipGradByGlobalNorm(1.0)  #experimental

# apply_decay_param_fun selects the parameters that DO get weight decay, so
# the exclusion pattern has to be negated; returning the raw match would
# decay only the parameters meant to be excluded.
opt = P.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    apply_decay_param_fun=lambda n: not param_name_to_exclue_from_weight_decay.match(n),
    weight_decay=args.wd,
    grad_clip=g_clip)

model = P.DataParallel(model)

scaler = P.amp.GradScaler(enable=args.use_amp)
create_if_not_exists(args.save_dir)
with P.amp.auto_cast(args.use_amp):
    # NOTE(review): batch_size=0 presumably means train_ds already yields
    # whole batches - confirm against the dataset definition.
    for step, samples in enumerate(
            P.io.DataLoader(
                train_ds, places=P.CUDAPlace(env.dev_id), batch_size=0)):
        (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples
        loss, mlmloss, nsploss = model(
            src_ids,
            sent_ids,
            labels=mlm_label,
            mlm_pos=mask_pos,
            nsp_labels=nsp_label)
        loss = scaler.scale(loss)
def _split_pair_features(batch, len_smiles, len_target):
    """Slice one packed feature batch into (smiles, protein, smiles_mask, protein_mask)."""
    protein_end = len_smiles + len_target
    smiles_mask_end = protein_end + len_smiles
    smiles = batch[:, 0:len_smiles].astype('int64')
    protein = batch[:, len_smiles:protein_end].astype('int64')
    smiles_mask = batch[:, protein_end:smiles_mask_end].astype('int64')
    protein_mask = batch[:, smiles_mask_end:].astype('int64')
    return smiles, protein, smiles_mask, protein_mask


def _save_best_results(model, prefix, epoch, mean_loss, val_cis, test_cis):
    """Save the model weights and a text summary of the best epoch so far."""
    val_average_CI, val_weighted_CI, val_overall_CI = val_cis
    test_average_CI, test_weighted_CI, test_overall_CI = test_cis
    paddle.save(model.state_dict(), prefix + 'train_model_best')
    with open(prefix + "best_results.txt", "w") as text_file:
        text_file.write(
            'epoch {}: loss: {} '.format(epoch, mean_loss) + '\n')
        text_file.write(
            "val Average CI is {}".format(val_average_CI) + '\n')
        text_file.write(
            "val weighted CI is {}".format(val_weighted_CI) + '\n')
        text_file.write(
            "val overall CI is {}".format(val_overall_CI) + '\n')
        text_file.write(
            "test Average CI is {}".format(test_average_CI) + '\n')
        text_file.write(
            "test weighted CI is {}".format(test_weighted_CI) + '\n')
        text_file.write(
            "test overall CI is {}".format(test_overall_CI) + '\n')
        text_file.write(
            '##############################################' + '\n')


def run(args):
    """Train and evaluate a MolTrans ranking model over five CV folds.

    For each fold the train/val/test CSV files are loaded (optionally with
    training data mixed in from the other dataset), pairs are resampled every
    epoch, and the model with the highest validation average CI is saved
    under ``args.save_direct`` together with a results text file.

    Args:
        args: parsed command-line namespace.

    Raises:
        ValueError: if ``args.is_mixed`` is set and ``args.dataset`` is
            neither ``'DAVIS'`` nor ``'KIBA'``.
    """
    # initialize distributed env
    if args.is_parallel == 1:
        dist.init_parallel_env()

    data_path = args.data_path + args.dataset + '/'

    # The loss holds no per-batch state, so build it once.
    loss_ = nn.BCEWithLogitsLoss()

    CVs = ['CV1', 'CV2', 'CV3', 'CV4', 'CV5']
    for CV in CVs:
        print('><<><><><><><><><><><><><><><><><><><><><><><><><<><><><><><>')
        print('start {}'.format(CV))

        ##################### load the data ############################
        train_file = CV + '_' + args.dataset + '_' + args.split + '_' + 'train' + '.csv'
        val_file = CV + '_' + args.dataset + '_' + args.split + '_' + 'val' + '.csv'
        test = 'test_' + args.dataset + '_' + args.split + '.csv'

        print('Load data...')
        r_train = pd.read_csv(data_path + CV + '/' + train_file)
        r_train = r_train.reset_index(drop=True)
        r_val = pd.read_csv(data_path + CV + '/' + val_file)
        r_val = r_val.reset_index(drop=True)
        r_test = pd.read_csv(data_path + test)
        r_test = r_test.reset_index(drop=True)

        if args.is_mixed:
            # The mixed data comes from the "other" dataset.
            other_dataset = {'DAVIS': 'KIBA', 'KIBA': 'DAVIS'}
            if args.dataset not in other_dataset:
                raise ValueError(
                    "is_mixed requires dataset 'DAVIS' or 'KIBA', got "
                    "{!r}".format(args.dataset))
            mixed_dataset = other_dataset[args.dataset]

            # load the mixed data
            mixed_data_file = mixed_dataset + '_mixed_train_unseenP_seenD.csv'
            mixed_data = pd.read_csv(data_path + mixed_data_file)
            mixed_data = mixed_data.reset_index(drop=True)
            # remove protein sequences that also occur in the validation set
            val_t = r_val['Target Sequence'].unique()
            mixed_t = mixed_data['Target Sequence'].unique()
            filter1 = list((set(val_t).intersection(set(mixed_t))))
            mixed_data = mixed_data[~mixed_data['Target Sequence'].isin(
                filter1)]
            mixed_data = mixed_data.reset_index(drop=True)

        r_train = load_customised_Davis(r_train)
        r_val = load_customised_Davis(r_val)
        r_test = load_customised_Davis(r_test)
        if args.is_mixed:
            r_mixed = load_customised_Davis(mixed_data)

        # Number of pure-train rows; used to offset mixed pair indices.
        LEN_train = len(r_train)

        print('number of train samples are {}'.format(len(r_train)))
        print('number of validation samples are {}'.format(len(r_val)))
        print('number of test samples are {}'.format(len(r_test)))
        if args.is_mixed:
            # NOTE(review): r_mixed is recomputed from the same input as
            # above; kept because load_customised_Davis is not visible here
            # and may not be side-effect free - confirm, then drop one call.
            r_mixed = load_customised_Davis(mixed_data)
            print('number of mixed samples are {}'.format(len(r_mixed)))
        print('Load done.\n')

        if args.is_mixed:
            # concatenate the data
            r_train = np.concatenate((r_train, r_mixed))

        ###### get the protein group and index for train/val/test
        qid_doc_map_train = group_by(r_train, 0)
        query_idx_train = qid_doc_map_train.keys()
        train_keys = np.array(list(query_idx_train))

        if args.is_mixed:
            id_doc_map_mixed = group_by(r_mixed, 0)
            query_idx_mixed = id_doc_map_mixed.keys()
            mixed_keys = np.array(list(query_idx_mixed))

        qid_doc_map_val = group_by(r_val, 0)
        query_idx_val = qid_doc_map_val.keys()
        val_keys = np.array(list(query_idx_val))

        qid_doc_map_test = group_by(r_test, 0)
        query_idx_test = qid_doc_map_test.keys()
        test_keys = np.array(list(query_idx_test))

        # get the true scores of train, one array per protein group
        true_scores = [
            r_train[qid_doc_map_train[qid], 1] for qid in query_idx_train
        ]
        if args.is_mixed:
            true_scores_mixed = [
                r_mixed[id_doc_map_mixed[qid], 1] for qid in query_idx_mixed
            ]

        ###### get val/test dataloader
        val_index = [qid_doc_map_val[qid] for qid in val_keys]
        val_dataset = Data_test(val_index, r_val)
        val_dataloader = paddle.io.DataLoader(
            val_dataset, batch_size=args.test_batch_size, shuffle=True)

        test_index = [qid_doc_map_test[qid] for qid in test_keys]
        test_dataset = Data_test(test_index, r_test)
        test_dataloader = paddle.io.DataLoader(
            test_dataset, batch_size=args.test_batch_size, shuffle=True)

        # Load model (close the config file deterministically).
        with open(args.model_config_path, 'r') as config_file:
            model_config = json.load(config_file)
        model = MolTransModel(model_config)
        len_SMILES = model_config['drug_max_seq']
        len_target = model_config['target_max_seq']

        if args.is_parallel == 1:
            model_parallel = paddle.DataParallel(model)
        else:
            model_parallel = model

        # define the optimizer
        optimizer = paddle.optimizer.AdamW(
            parameters=model_parallel.parameters(),
            weight_decay=0.01,
            learning_rate=args.learning_rate)

        result_prefix = args.save_direct + CV + '_'

        print('start to train the model...')
        for epoch in range(args.N_epoch):
            ##################### resampling the pairs for each epoch #####################
            train_x1_index, train_x2_index, train_scores, Y_train = sample_pairs(
                true_scores,
                K=args.sampling_N_train,
                eps=args.filter_threshold,
                seed=epoch)
            if args.is_mixed:
                mixed_x1_index, mixed_x2_index, mixed_scores, Y_mixed = sample_pairs(
                    true_scores_mixed,
                    K=args.sampling_N_mixed,
                    eps=args.filter_threshold,
                    seed=epoch)
                # Mixed rows sit after the train rows in r_train, so shift
                # their indices by the train length before merging.
                mixed_x1_index = [i + LEN_train for i in mixed_x1_index]
                mixed_x2_index = [i + LEN_train for i in mixed_x2_index]

                train_x1_index = train_x1_index + mixed_x1_index
                train_x2_index = train_x2_index + mixed_x2_index
                Y_train = np.concatenate((Y_train, Y_mixed))

            train_dataset = Data_Encoder_flow(train_x1_index, train_x2_index,
                                              Y_train, r_train)
            if args.is_parallel:
                train_batch_sampler = paddle.io.DistributedBatchSampler(
                    train_dataset,
                    batch_size=args.train_batch_size,
                    shuffle=True)
                train_dataloader = paddle.io.DataLoader(
                    train_dataset, batch_sampler=train_batch_sampler)
            else:
                train_dataloader = paddle.io.DataLoader(
                    train_dataset,
                    batch_size=args.train_batch_size,
                    shuffle=True,
                    num_workers=23)
            ##################### resampling the pairs for each epoch #####################

            print('***************train')
            LOSS = []
            model.train()
            model_parallel.train()
            start_time = time.time()
            for batch_id, data in enumerate(train_dataloader()):
                batch_x1 = data[0]
                batch_x2 = data[1]
                batch_y = data[2]

                # split to smiles and protein (ids and masks) for both items
                (batch_x1_smiles, batch_x1_protein, batch_x1_smiles_mask,
                 batch_x1_protein_mask) = _split_pair_features(
                     batch_x1, len_SMILES, len_target)
                (batch_x2_smiles, batch_x2_protein, batch_x2_smiles_mask,
                 batch_x2_protein_mask) = _split_pair_features(
                     batch_x2, len_SMILES, len_target)

                optimizer.clear_grad()
                res = model_parallel(
                    batch_x1_smiles, batch_x1_protein, batch_x2_smiles,
                    batch_x2_protein, batch_x1_smiles_mask,
                    batch_x1_protein_mask, batch_x2_smiles_mask,
                    batch_x2_protein_mask)

                loss = loss_(res.squeeze(1), batch_y)
                loss.backward()
                optimizer.step()
                # scheduler.step()

                if batch_id % 100 == 0:
                    print('batch {} loss {}'.format(batch_id, loss.numpy()))

                LOSS.append(loss.numpy())

            end_time = time.time()
            mean_loss = np.mean(LOSS)
            print('take time {}'.format(end_time - start_time))
            print('epoch {}: loss: {} '.format(epoch, mean_loss))

            # validation
            print('***************validation')
            val_average_CI, val_weighted_CI, val_overall_CI = model_eval(
                model, val_dataloader, len_SMILES, len_target)

            # test
            print('***************test')
            test_average_CI, test_weighted_CI, test_overall_CI = model_eval(
                model, test_dataloader, len_SMILES, len_target)

            # The first epoch always sets the baseline; later epochs replace
            # it when the validation average CI is at least as good.
            if epoch == 0 or val_average_CI >= best_average_CI:
                best_average_CI = val_average_CI
                _save_best_results(
                    model, result_prefix, epoch, mean_loss,
                    (val_average_CI, val_weighted_CI, val_overall_CI),
                    (test_average_CI, test_weighted_CI, test_overall_CI))

        # NOTE(review): the nesting level of this banner was reconstructed
        # from whitespace-collapsed text; printed once per fold here.
        print(
            '###############################################################')