tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained(
    model_name_or_path)
trans_func = partial(
    convert_example_test,
    tokenizer=tokenizer,
    max_seq_length=args.max_seq_length)
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
): [data for data in fn(samples)]

pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
    model_name_or_path)
model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size)
model = paddle.DataParallel(model)

# Load the pretrained semantic model.
if args.params_path and os.path.isfile(args.params_path):
    state_dict = paddle.load(args.params_path)
    model.set_dict(state_dict)
    logger.info("Loaded parameters from %s" % args.params_path)
else:
    raise ValueError(
        "Please set --params_path with correct pretrained model file")

id2corpus = gen_id2corpus(args.corpus_file)
# The convert_example_test function expects a dict as input.
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
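# gen_id2corpus and convert_example_test are helpers defined elsewhere in the
# example scripts. The two functions below are a minimal sketch of what they
# might look like; the names and signatures are assumptions for illustration,
# not the example's actual implementation.
def gen_id2corpus_sketch(corpus_file):
    """Read one passage per line and index it by line number."""
    id2corpus = {}
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            id2corpus[idx] = line.rstrip('\n')
    return id2corpus

def convert_example_test_sketch(example, tokenizer, max_seq_length=512):
    """Tokenize each text in a {id: text} dict into input_ids / token_type_ids."""
    result = []
    for text in example.values():
        encoded = tokenizer(text=text, max_seq_len=max_seq_length)
        result += [encoded['input_ids'], encoded['token_type_ids']]
    return result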
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(
        read_simcse_text, data_path=args.train_set_file, lazy=False)
    dev_ds = load_dataset(
        read_text_pair, data_path=args.test_set_file, lazy=False)

    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        'ernie-1.0',
        hidden_dropout_prob=args.dropout,
        attention_probs_dropout_prob=args.dropout)
    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # title_segment
    ): [data for data in fn(samples)]

    dev_batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # title_segment
        Stack(dtype="int64"),  # labels
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
    dev_data_loader = create_dataloader(
        dev_ds,
        mode='eval',
        batch_size=args.batch_size,
        batchify_fn=dev_batchify_fn,
        trans_fn=trans_func)

    model = SimCSE(
        pretrained_model,
        margin=args.margin,
        scale=args.scale,
        output_emb_size=args.output_emb_size)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
        print("warmup from:{}".format(args.init_from_ckpt))

    model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch

            # Optionally apply word repetition as data augmentation.
            if args.dup_rate > 0:
                query_input_ids, query_token_type_ids = word_repetition(
                    query_input_ids, query_token_type_ids, args.dup_rate)
                title_input_ids, title_token_type_ids = word_repetition(
                    title_input_ids, title_token_type_ids, args.dup_rate)

            loss = model(
                query_input_ids=query_input_ids,
                title_input_ids=title_input_ids,
                query_token_type_ids=query_token_type_ids,
                title_token_type_ids=title_token_type_ids)

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.eval_steps == 0 and rank == 0:
                # The model is wrapped in DataParallel; evaluate the underlying layers.
                spearman_corr, total_num = do_evaluate(
                    model._layers, tokenizer, dev_data_loader,
                    args.infer_with_fc_pooler)
                print("global step: {}, spearman_corr: {:.4f}, total_num: {}".
                      format(global_step, spearman_corr, total_num))

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)

            if args.max_steps > 0 and global_step >= args.max_steps:
                return
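# word_repetition above is the ESimCSE-style augmentation: it randomly
# duplicates a fraction (dup_rate) of the tokens in each sequence so the two
# views of a sentence differ in length. The function below is a simplified
# sketch operating on Paddle tensors, not necessarily the example's own code;
# it assumes the pad token id is 0 (as for ERNIE).
import random
import paddle

def word_repetition_sketch(input_ids, token_type_ids, dup_rate=0.32):
    """Randomly repeat a fraction of the non-special tokens in each sequence."""
    input_ids = input_ids.numpy().tolist()
    token_type_ids = token_type_ids.numpy().tolist()
    repeated_ids, repeated_types = [], []
    for seq_ids, seq_types in zip(input_ids, token_type_ids):
        actual_len = sum(1 for t in seq_ids if t != 0)  # pad id assumed to be 0
        dup_len = random.randint(0, max(1, int(dup_rate * actual_len)))
        # Only repeat positions strictly between [CLS] and the final [SEP].
        candidates = list(range(1, max(1, actual_len - 1)))
        dup_pos = set(random.sample(candidates, min(dup_len, len(candidates))))
        new_ids, new_types = [], []
        for pos in range(actual_len):
            new_ids.append(seq_ids[pos])
            new_types.append(seq_types[pos])
            if pos in dup_pos:  # repeat this token once
                new_ids.append(seq_ids[pos])
                new_types.append(seq_types[pos])
        repeated_ids.append(new_ids)
        repeated_types.append(new_types)
    # Re-pad every sequence in the batch to the new maximum length.
    max_len = max(len(ids) for ids in repeated_ids)
    for ids, types in zip(repeated_ids, repeated_types):
        ids.extend([0] * (max_len - len(ids)))
        types.extend([0] * (max_len - len(types)))
    return paddle.to_tensor(repeated_ids), paddle.to_tensor(repeated_types)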
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True,
                    default='./checkpoint/model_50/model_state.pdparams',
                    help="The path to model parameters to be loaded.")
parser.add_argument("--output_path", type=str, default='./output',
                    help="The path of model parameter in static graph to be saved.")
args = parser.parse_args()
# yapf: enable

if __name__ == "__main__":
    # If you want to use the ernie-1.0 model, please uncomment the following code.
    output_emb_size = 256

    pretrained_model = AutoModel.from_pretrained("ernie-1.0")
    tokenizer = AutoTokenizer.from_pretrained('ernie-1.0')
    model = SimCSE(pretrained_model, output_emb_size=output_emb_size)

    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)

    model.eval()
    # Convert to static graph with specific input description.
    model = paddle.jit.to_static(
        model,
        input_spec=[
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64"),  # input_ids
            paddle.static.InputSpec(
                shape=[None, None], dtype="int64")  # segment_ids
        ])
    # Save the converted static graph model to args.output_path.
    paddle.jit.save(model, args.output_path)
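# Once exported, the static graph (saved by paddle.jit.save as
# <output_path>.pdmodel / <output_path>.pdiparams) can be served with the
# Paddle Inference API. The file names and the dummy inputs below are
# illustrative assumptions that depend on the actual --output_path used above.
import numpy as np
from paddle import inference

config = inference.Config("./output.pdmodel", "./output.pdiparams")
predictor = inference.create_predictor(config)

input_handles = [predictor.get_input_handle(name)
                 for name in predictor.get_input_names()]
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])

# Feed already-tokenized input_ids / token_type_ids (dummy values here).
input_ids = np.array([[1, 647, 533, 2]], dtype="int64")
token_type_ids = np.zeros_like(input_ids)
input_handles[0].copy_from_cpu(input_ids)
input_handles[1].copy_from_cpu(token_type_ids)
predictor.run()
embedding = output_handle.copy_to_cpu()  # sentence embedding from the model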
paddle.set_device(device)

tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length)
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
): [data for data in fn(samples)]

pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained("ernie-1.0")
# pretrained_model = ErnieModel.from_pretrained("ernie-1.0")

model = SimCSE(pretrained_model, output_emb_size=output_emb_size)

# Load the pretrained semantic model.
if params_path and os.path.isfile(params_path):
    state_dict = paddle.load(params_path)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % params_path)
else:
    raise ValueError(
        "Please set --params_path with correct pretrained model file")

# The convert_example function expects a dict as input.
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

corpus_data_loader = create_dataloader(
    corpus_ds,
    mode='predict',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)
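# The corpus_data_loader built above is typically iterated once to turn every
# passage into a dense vector for ANN indexing. The loop below is only a
# sketch: it assumes the SimCSE wrapper exposes a
# get_pooled_embedding(input_ids, token_type_ids) method, which may be named
# differently in the actual class.
import numpy as np
import paddle

all_embeddings = []
model.eval()
with paddle.no_grad():
    for batch in corpus_data_loader:
        input_ids, token_type_ids = batch
        embeddings = model.get_pooled_embedding(
            input_ids, token_type_ids=token_type_ids)
        all_embeddings.append(embeddings.numpy())
corpus_embeddings = np.concatenate(all_embeddings, axis=0)  # [num_docs, emb_size]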
valid_ds = load_dataset(
    read_text_pair, data_path=args.text_pair_file, lazy=False, is_test=True)

valid_data_loader = create_dataloader(
    valid_ds,
    mode='predict',
    batch_size=args.batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)

pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained("ernie-1.0")

model = SimCSE(
    pretrained_model,
    margin=args.margin,
    scale=args.scale,
    output_emb_size=args.output_emb_size)

if args.params_path and os.path.isfile(args.params_path):
    state_dict = paddle.load(args.params_path)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % args.params_path)
else:
    raise ValueError(
        "Please set --params_path with correct pretrained model file")

cosine_sims = predict(model, valid_data_loader)
for idx, cosine in enumerate(cosine_sims):
    print('{}'.format(cosine))
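# predict and read_text_pair are helpers from the example scripts: roughly,
# read_text_pair yields text pairs and predict runs both sides through the
# encoder and reports their cosine similarity. The function below is a
# simplified sketch, again assuming a get_pooled_embedding method on the
# model; names are assumptions, not the exact implementation.
import paddle
import paddle.nn.functional as F

def predict_sketch(model, data_loader):
    """Return one cosine similarity per query/title pair in the loader."""
    cosine_sims = []
    model.eval()
    with paddle.no_grad():
        for batch in data_loader:
            query_input_ids, query_token_type_ids, \
                title_input_ids, title_token_type_ids = batch
            query_emb = model.get_pooled_embedding(
                query_input_ids, token_type_ids=query_token_type_ids)
            title_emb = model.get_pooled_embedding(
                title_input_ids, token_type_ids=title_token_type_ids)
            sim = F.cosine_similarity(query_emb, title_emb, axis=-1)
            cosine_sims.extend(sim.numpy().tolist())
    return cosine_sims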
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    # Write training scalars (e.g. the loss curve) to VisualDL.
    writer = LogWriter(logdir="./log/scalar_test/train")

    train_ds = load_dataset(
        read_simcse_text, data_path=args.train_set_file, lazy=False)

    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        args.model_name_or_path,
        hidden_dropout_prob=args.dropout,
        attention_probs_dropout_prob=args.dropout)
    print("loading model from {}".format(args.model_name_or_path))

    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # title_segment
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    model = SimCSE(
        pretrained_model,
        margin=args.margin,
        scale=args.scale,
        output_emb_size=args.output_emb_size)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
        print("warmup from:{}".format(args.init_from_ckpt))

    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    time_start = time.time()
    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch

            loss = model(
                query_input_ids=query_input_ids,
                title_input_ids=title_input_ids,
                query_token_type_ids=query_token_type_ids,
                title_token_type_ids=title_token_type_ids)

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                writer.add_scalar(tag="loss", step=global_step, value=loss)
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)

    time_end = time.time()
    print('totally cost', time_end - time_start)
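# read_simcse_text and convert_example are the data helpers this training
# script relies on. For unsupervised SimCSE the same sentence is fed through
# the encoder twice and dropout alone produces the two views, so the example
# effectively duplicates one encoding into the "query" and "title" slots. The
# sketch below makes that explicit; exact field names and signatures are
# assumptions for illustration.
def read_simcse_text_sketch(data_path):
    """Yield one raw sentence per line as a dict."""
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            yield {'text': line.rstrip('\n')}

def convert_example_sketch(example, tokenizer, max_seq_length=512):
    """Encode the sentence once and duplicate it for the two dropout views."""
    encoded = tokenizer(text=example['text'], max_seq_len=max_seq_length)
    input_ids = encoded['input_ids']
    token_type_ids = encoded['token_type_ids']
    return input_ids, token_type_ids, input_ids, token_type_ids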