def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    dev_ds = load_dataset(read_test, src_path=args.test_file, lazy=False)
    print(dev_ds[0])

    pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained(
        'ernie-gram-zh')
    tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(
        'ernie-gram-zh')

    trans_func_eval = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        phase="eval")

    batchify_fn_eval = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn_eval,
        trans_fn=trans_func_eval)

    model = PairwiseMatching(pretrained_model, margin=args.margin)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    metric = paddle.metric.Auc()
    evaluate(model, metric, dev_data_loader, "dev")
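# The evaluate() call above relies on a helper defined elsewhere in this
# project. The function below is a minimal, hypothetical sketch (note the
# different name) of how AUC can be computed for a pairwise model with
# paddle.metric.Auc; model.predict() returning positive-class probabilities
# of shape [batch_size, 1] is an assumption about PairwiseMatching, not
# something shown in this excerpt.
import numpy as np


@paddle.no_grad()
def evaluate_auc_sketch(model, metric, data_loader, phase="dev"):
    model.eval()
    metric.reset()
    for input_ids, token_type_ids, labels in data_loader:
        # Assumed: predict() returns the positive-class probability per pair.
        pos_probs = model.predict(
            input_ids=input_ids, token_type_ids=token_type_ids).numpy()
        neg_probs = 1.0 - pos_probs
        # paddle.metric.Auc expects class probabilities with shape [batch, 2].
        preds = np.concatenate((neg_probs, pos_probs), axis=1)
        metric.update(preds=preds, labels=labels.numpy())
    print("eval_%s auc: %.3f" % (phase, metric.accumulate()))
    metric.reset()
    model.train()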
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment_ids
): [data for data in fn(samples)]

valid_ds = load_dataset(read_text_pair, data_path=args.input_file, lazy=False)

valid_data_loader = create_dataloader(
    valid_ds,
    mode='predict',
    batch_size=args.batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)

model = PairwiseMatching(pretrained_model)

if args.params_path and os.path.isfile(args.params_path):
    state_dict = paddle.load(args.params_path)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % args.params_path)
else:
    raise ValueError(
        "Please set --params_path with correct pretrained model file")

y_probs = predict(model, valid_data_loader)

valid_ds = load_dataset(read_text_pair, data_path=args.input_file, lazy=False)
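# Illustrative continuation (not part of the original excerpt): pair each
# predicted probability with the raw text pair that was just reloaded and
# print it. This assumes predict() returns an array of shape [N, 1] and that
# read_text_pair yields dict examples; adapt the output format as needed.
for idx, prob in enumerate(y_probs):
    text_pair = valid_ds[idx]
    text_pair["pred_prob"] = prob[0]
    print(text_pair)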
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    # train_ds, dev_ds = load_dataset("lcqmc", splits=["train", "dev"])
    train_ds = load_dataset(read, src_path=args.train_file, lazy=False)
    dev_ds = load_dataset(read_test, src_path=args.test_file, lazy=False)
    print(train_ds[0])
    # train_ds = gen_pair(train_ds)

    # If you want to use the ernie-1.0 model, please uncomment the following code:
    # pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained('ernie-1.0')
    # tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained(
        'ernie-gram-zh')
    tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(
        'ernie-gram-zh')

    trans_func_train = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)
    trans_func_eval = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        phase="eval")

    batchify_fn_train = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # pos_pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # pos_pair_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # neg_pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64")  # neg_pair_segment
    ): [data for data in fn(samples)]

    batchify_fn_eval = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # pair_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # pair_segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn_train,
        trans_fn=trans_func_train)
    dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn_eval,
        trans_fn=trans_func_eval)

    model = PairwiseMatching(pretrained_model, margin=args.margin)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    metric = paddle.metric.Auc()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            pos_input_ids, pos_token_type_ids, neg_input_ids, neg_token_type_ids = batch

            loss = model(
                pos_input_ids=pos_input_ids,
                neg_input_ids=neg_input_ids,
                pos_token_type_ids=pos_token_type_ids,
                neg_token_type_ids=neg_token_type_ids)

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.eval_step == 0 and rank == 0:
                evaluate(model, metric, dev_data_loader, "dev")

            if global_step % args.save_step == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)
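# Illustrative only: one way to launch this training entry point on multiple
# GPUs with paddle.distributed.launch. The script name (train_pairwise.py) and
# the concrete values are assumptions, not part of this file; the flags simply
# mirror the args.* fields used above, assuming the usual argparse setup where
# each field maps to a --flag of the same name.
#
#   python -m paddle.distributed.launch --gpus "0,1" train_pairwise.py \
#       --device gpu \
#       --train_file train.tsv \
#       --test_file dev.tsv \
#       --save_dir ./checkpoints \
#       --batch_size 32 \
#       --learning_rate 2E-5 \
#       --epochs 3 \
#       --margin 0.1 \
#       --eval_step 100 \
#       --save_step 1000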