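# NOTE: These snippets are excerpted from a larger script and assume that
# module-level names such as ``args``, ``logger``, ``Poetry``, and the helpers
# ``convert_example`` and ``after_padding`` are defined elsewhere. The import
# header below is a best-guess reconstruction, not part of the original file.
import os
import time

import paddle
import paddle.nn as nn
from tqdm import tqdm
from paddle.io import DataLoader
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.metrics import Rouge1, Rouge2
from paddlenlp.transformers import (BertTokenizer, ElectraTokenizer,
                                    ErnieForGeneration, ErnieTinyTokenizer,
                                    ErnieTokenizer, LinearDecayWithWarmup,
                                    RobertaTokenizer)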
def test_from_pretrained_pad_left(self):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-wwm-ext")
    tokenizer.padding_side = "left"
    text1 = "这是一个简单文本"
    text2 = "小孩子都看得懂"
    # test batch_encode
    expected_input_ids = [
        0, 0, 101, 6821, 3221, 671, 702, 5042, 1296, 3152, 3315, 102, 2207,
        2111, 2094, 6963, 4692, 2533, 2743, 102
    ]
    expected_token_type_ids = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
    ]
    expected_attention_mask = [
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    ]
    expected_special_tokens_mask = [
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1
    ]
    results = tokenizer([text1], [text2],
                        20,
                        stride=1,
                        pad_to_max_seq_len=True,
                        return_attention_mask=True,
                        return_special_tokens_mask=True)
    self.check_output_equal(results[0]['input_ids'], expected_input_ids)
    self.check_output_equal(results[0]['token_type_ids'],
                            expected_token_type_ids)
    self.check_output_equal(results[0]['attention_mask'],
                            expected_attention_mask)
    self.check_output_equal(results[0]['special_tokens_mask'],
                            expected_special_tokens_mask)
    # test encode
    results = tokenizer(text1, text2, 20, stride=1, pad_to_max_seq_len=True)
    self.check_output_equal(results['input_ids'], expected_input_ids)
    self.check_output_equal(results['token_type_ids'],
                            expected_token_type_ids)
def train(): paddle.set_device("gpu" if args.n_gpu else "cpu") if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() model = ErnieForGeneration.from_pretrained(args.model_name_or_path) if "ernie-tiny" in args.model_name_or_path: tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path) elif "ernie" in args.model_name_or_path: tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path) elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path: tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path) elif "electra" in args.model_name_or_path: tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path) else: tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path) if args.init_checkpoint: model_state = paddle.load(args.init_checkpoint) model.set_state_dict(model_state) train_dataset, dev_dataset = Poetry.get_datasets(['train', 'dev']) attn_id = tokenizer.vocab[ '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]'] tgt_type_id = model.sent_emb.weight.shape[0] - 1 trans_func = convert_example(tokenizer=tokenizer, attn_id=attn_id, tgt_type_id=tgt_type_id, max_encode_len=args.max_encode_len, max_decode_len=args.max_decode_len, noise_prob=args.noise_prob, use_random_noice=args.use_random_noice) train_dataset = train_dataset.apply(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_pids Pad(axis=0, pad_val=tokenizer.pad_token_id), # src_sids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_pids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_sids Pad(axis=0, pad_val=tokenizer.pad_token_id), # attn_ids Pad(axis=0, pad_val=tokenizer.pad_token_id), # tgt_labels ): after_padding(fn(samples)) train_data_loader = DataLoader(dataset=train_dataset, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) dev_dataset = dev_dataset.apply(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler(dev_dataset, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader(dataset=dev_dataset, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) label_num = model.word_emb.weight.shape[0] if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) max_steps = len(train_data_loader) * args.num_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps, args.warmup_proportion) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=nn.ClipGradByGlobalNorm(1.0), apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) rouge1 = Rouge1() rouge2 = Rouge2() global_step = 1 tic_train = time.time() for epoch in range(args.num_epochs): for step, batch in enumerate(train_data_loader, start=1): (src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels, _) = batch # import pdb; pdb.set_trace() _, __, info = model(src_ids, sent_ids=src_sids, pos_ids=src_pids, attn_bias=mask_src_2_src, encode_only=True) cached_k, cached_v = info['caches'] _, 
__, info = model(tgt_ids, sent_ids=tgt_sids, pos_ids=tgt_pids, attn_bias=mask_tgt_2_srctgt, past_cache=(cached_k, cached_v), encode_only=True) cached_k2, cached_v2 = info['caches'] past_cache_k = [ paddle.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2) ] past_cache_v = [ paddle.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2) ] if args.label_smooth > 0.: tgt_labels = nn.functional.label_smooth( nn.functional.one_hot(tgt_labels, label_num), epsilon=args.label_smooth) loss, _, __ = model(attn_ids, sent_ids=tgt_sids, pos_ids=tgt_pids, attn_bias=mask_attn_2_srctgtattn, past_cache=(past_cache_k, past_cache_v), tgt_labels=tgt_labels, tgt_pos=paddle.nonzero(attn_ids == attn_id)) if global_step % args.logging_steps == 0: if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: logger.info( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e" % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train), lr_scheduler.get_lr())) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_gradients() if global_step % args.save_steps == 0 and ( (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0): evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2, attn_id, tgt_type_id, args) output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) global_step += 1
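# The label-smoothing step above turns hard token ids into soft targets:
# ``one_hot`` expands ids over the vocabulary, then ``label_smooth`` moves
# probability mass ``epsilon`` to the uniform distribution. A toy
# illustration with a hypothetical 5-token vocabulary:
import paddle
import paddle.nn.functional as F

token_ids = paddle.to_tensor([1, 3])
hard = F.one_hot(token_ids, num_classes=5)  # rows are one-hot, sum to 1
soft = F.label_smooth(hard, epsilon=0.1)    # peak 1.0 -> 0.9 + 0.1 / 5 = 0.92
print(soft.numpy())                         # off-peak entries become 0.02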
def train():
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = load_dataset(
        'poetry', splits=('train', 'dev'), lazy=False)
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len,
        noise_prob=args.noise_prob,
        use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.map(trans_func)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # src_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # tgt_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    dev_dataset = dev_dataset.map(trans_func)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    label_num = model.word_emb.weight.shape[0]
    train_model = StackModel(model)
    if paddle.distributed.get_world_size() > 1:
        # All forward outputs derived from the module's parameters must
        # participate in the loss and the subsequent gradient computation
        # under DataParallel, so StackModel wraps the three forward passes
        # and returns only the loss (a hedged sketch of such a wrapper
        # follows after this function).
        train_model = paddle.DataParallel(train_model)

    max_steps = len(train_data_loader) * args.num_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps,
                                         args.warmup_proportion)

    # Generate the parameter names that should receive weight decay;
    # all bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in decay_params)

    rouge1 = Rouge1()
    rouge2 = Rouge2()
    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids,
             attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
             mask_attn_2_srctgtattn, tgt_labels, _) = batch
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            tgt_pos = paddle.nonzero(attn_ids == attn_id)
            loss = train_model(src_ids, src_tids, src_pids, tgt_ids, tgt_tids,
                               tgt_pids, attn_ids, mask_src_2_src,
                               mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
                               tgt_labels, tgt_pos)

            if global_step % args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train),
                           lr_scheduler.get_lr()))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 and \
                    paddle.distributed.get_rank() == 0:
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
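# ``StackModel`` is referenced above but not defined in this excerpt. Below is
# a minimal sketch of such a wrapper, reconstructed from the three-pass
# forward in the earlier version of ``train()``; the class name and argument
# order follow the call site, but the real implementation may differ.
class StackModel(nn.Layer):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, src_ids, src_tids, src_pids, tgt_ids, tgt_tids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels, tgt_pos):
        # Pass 1: encode the source and cache its key/value states.
        _, __, info = self.model(src_ids,
                                 sent_ids=src_tids,
                                 pos_ids=src_pids,
                                 attn_bias=mask_src_2_src,
                                 encode_only=True)
        cached_k, cached_v = info['caches']
        # Pass 2: encode the target attending to the cached source states.
        _, __, info = self.model(tgt_ids,
                                 sent_ids=tgt_tids,
                                 pos_ids=tgt_pids,
                                 attn_bias=mask_tgt_2_srctgt,
                                 past_cache=(cached_k, cached_v),
                                 encode_only=True)
        cached_k2, cached_v2 = info['caches']
        past_cache_k = [
            paddle.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
        ]
        past_cache_v = [
            paddle.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
        ]
        # Pass 3: score the [ATTN] queries and return only the loss, so that
        # every parameter used above contributes to what DataParallel sees.
        loss, _, __ = self.model(attn_ids,
                                 sent_ids=tgt_tids,
                                 pos_ids=tgt_pids,
                                 attn_bias=mask_attn_2_srctgtattn,
                                 past_cache=(past_cache_k, past_cache_v),
                                 tgt_labels=tgt_labels,
                                 tgt_pos=tgt_pos)
        return loss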
def predict():
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    dev_dataset = Poetry.get_datasets(['dev'])
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    test_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    data_loader = DataLoader(dataset=dev_dataset,
                             batch_sampler=test_batch_sampler,
                             collate_fn=batchify_fn,
                             num_workers=0,
                             return_list=True)

    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    model.eval()
    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)
    logger.info("Predicting...")
    for data in data_loader:
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data  # never use the target during inference
        # Use greedy_search_infilling or beam_search_infilling to get predictions
        output_ids = beam_search_infilling(model,
                                           src_ids,
                                           src_sids,
                                           eos_id=eos_id,
                                           sos_id=sos_id,
                                           attn_id=attn_id,
                                           pad_id=pad_id,
                                           unk_id=unk_id,
                                           vocab_size=vocab_size,
                                           max_decode_len=args.max_decode_len,
                                           max_encode_len=args.max_encode_len,
                                           beam_width=args.beam_width,
                                           length_penalty=args.length_penalty,
                                           tgt_type_id=tgt_type_id)
        for source_ids, target_ids, predict_ids in zip(
                src_ids.numpy().tolist(), raw_tgt_labels.numpy().tolist(),
                output_ids.tolist()):
            if eos_id in predict_ids:
                predict_ids = predict_ids[:predict_ids.index(eos_id)]
            source_sentence = ''.join(
                map(post_process,
                    vocab.to_tokens(source_ids[1:source_ids.index(eos_id)])))
            tgt_sentence = ''.join(
                map(post_process,
                    vocab.to_tokens(target_ids[1:target_ids.index(eos_id)])))
            predict_sentence = ''.join(
                map(post_process, vocab.to_tokens(predict_ids)))
            print("source :%s\ntarget :%s\npredict:%s\n" %
                  (source_sentence, tgt_sentence, predict_sentence))
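# ``post_process`` is used above to turn wordpieces back into plain text but
# is not defined in this excerpt. A plausible minimal version, assuming its
# only job is to strip the WordPiece continuation prefix before the pieces
# are concatenated with ``''.join``:
def post_process(token):
    # Hypothetical helper: drop the '##' continuation marker; for Chinese
    # character vocabularies most tokens pass through unchanged.
    if token.startswith('##'):
        return token[2:]
    return token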
def evaluate():
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    dev_dataset = Poetry.get_datasets(['dev'])
    attn_id = tokenizer.vocab[
        '[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(tokenizer=tokenizer,
                                 attn_id=attn_id,
                                 tgt_type_id=tgt_type_id,
                                 max_encode_len=args.max_encode_len,
                                 max_decode_len=args.max_decode_len)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    data_loader = DataLoader(dataset=dev_dataset,
                             batch_sampler=dev_batch_sampler,
                             collate_fn=batchify_fn,
                             num_workers=0,
                             return_list=True)

    rouge1 = Rouge1()
    rouge2 = Rouge2()
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    model.eval()
    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)
    evaluated_sentences_ids = []
    reference_sentences_ids = []
    logger.info("Evaluating...")
    for data in tqdm(data_loader):
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data  # never use the target during inference
        # Use greedy_search_infilling or beam_search_infilling to get predictions
        output_ids = beam_search_infilling(model,
                                           src_ids,
                                           src_sids,
                                           eos_id=eos_id,
                                           sos_id=sos_id,
                                           attn_id=attn_id,
                                           pad_id=pad_id,
                                           unk_id=unk_id,
                                           vocab_size=vocab_size,
                                           max_decode_len=args.max_decode_len,
                                           max_encode_len=args.max_encode_len,
                                           beam_width=args.beam_width,
                                           length_penalty=args.length_penalty,
                                           tgt_type_id=tgt_type_id)
        for ids in output_ids.tolist():
            if eos_id in ids:
                ids = ids[:ids.index(eos_id)]
            evaluated_sentences_ids.append(ids)
        for ids in raw_tgt_labels.numpy().tolist():
            ids = ids[:ids.index(eos_id)]
            reference_sentences_ids.append(ids)

    score1 = rouge1.score(evaluated_sentences_ids, reference_sentences_ids)
    score2 = rouge2.score(evaluated_sentences_ids, reference_sentences_ids)
    logger.info("Rouge-1: %.5f, Rouge-2: %.5f" % (score1 * 100, score2 * 100))
class TestRobertaTokenizer(CpuCommonTest):
    def set_attr(self):
        self.do_lower_case = True

    def create_input_file(self):
        vocab = [
            "[UNK]", "[CLS]", "[SEP]", "th", "##is", "is", "simple", "text",
            "a", "an", "for", "easy", "which", "children", "[MASK]", "[PAD]"
        ]
        curr_dir = os.path.dirname(os.path.realpath(__file__))
        vocab_file = os.path.join(curr_dir, "vocab.txt")
        with open(vocab_file, "w") as fw:
            for v in vocab:
                fw.write(v)
                fw.write('\n')
        self.vocab_file = vocab_file
        self.vocab = vocab

    def set_test_case(self):
        self.text = "this is a simple text"
        self.expected_text_array = ['th', '##is', 'is', 'a', 'simple', 'text']

    def setUp(self):
        self.set_attr()
        self.create_input_file()
        self.set_test_case()
        self.tokenizer = RobertaTokenizer(vocab_file=self.vocab_file,
                                          merges_file=None,
                                          do_lower_case=self.do_lower_case)

    def test_tokenize(self):
        text_array = self.tokenizer.tokenize(self.text)
        self.check_output_equal(text_array, self.expected_text_array)
        self.check_output_equal(
            self.tokenizer.convert_tokens_to_string(text_array), self.text)

    def test_call(self):
        expected_input_ids = [1, 3, 4, 5, 8, 6, 7, 2]
        expected_token_type_ids = [0, 0, 0, 0, 0, 0, 0, 0]
        expected_attention_mask = [1] * len(expected_input_ids)
        expected_tokens_mask = [1, 0, 0, 0, 0, 0, 0, 1]
        result = self.tokenizer("This is a simple text",
                                return_attention_mask=True,
                                return_length=True,
                                return_special_tokens_mask=True)
        self.check_output_equal(result['input_ids'], expected_input_ids)
        self.check_output_equal(result['token_type_ids'],
                                expected_token_type_ids)
        self.check_output_equal(result['seq_len'],
                                len(expected_token_type_ids))
        self.check_output_equal(result['attention_mask'],
                                expected_attention_mask)
        self.check_output_equal(result['special_tokens_mask'],
                                expected_tokens_mask)

    def test_call_pair(self):
        expected_input_ids = [1, 3, 4, 5, 8, 6, 7, 2, 12, 5, 11, 10, 13, 2]
        expected_token_type_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        result = self.tokenizer("This is a simple text",
                                "which is easy for children")
        self.check_output_equal(result['input_ids'], expected_input_ids)
        self.check_output_equal(result['token_type_ids'],
                                expected_token_type_ids)

    def test_call_batch(self):
        expected_input_ids = [1, 3, 4, 5, 8, 6, 7, 2]
        expected_token_type_ids = [0, 0, 0, 0, 0, 0, 0, 0]
        results = self.tokenizer(
            ["This is a simple text", "this Is A simple text"])
        for result in results:
            self.check_output_equal(result['input_ids'], expected_input_ids)
            self.check_output_equal(result['token_type_ids'],
                                    expected_token_type_ids)

    def test_call_truncate_seq(self):
        expected_input_ids = [1, 3, 2, 3, 2]
        expected_token_type_ids = [0, 0, 0, 1, 1]
        results = self.tokenizer("This is a simple text",
                                 "this Is A simple text", 5)
        self.check_output_equal(results['input_ids'], expected_input_ids)
        self.check_output_equal(results['token_type_ids'],
                                expected_token_type_ids)

    # Test PretrainedTokenizer
    def test_truncate_only_first(self):
        ids = [1, 3, 4, 5, 8, 6, 7, 2]
        pair_ids = [12, 5, 11, 10, 13, 2]
        truncate_ids, truncate_pair_ids, _ = self.tokenizer.truncate_sequences(
            ids, pair_ids, 3, truncation_strategy='only_first')
        self.check_output_equal(truncate_ids, ids[:-3])
        self.check_output_equal(truncate_pair_ids, pair_ids)

    def test_truncate_only_second(self):
        ids = [1, 3, 4, 5, 8, 6, 7, 2]
        pair_ids = [12, 5, 11, 10, 13, 2]
        truncate_ids, truncate_pair_ids, _ = self.tokenizer.truncate_sequences(
            ids, pair_ids, 3, truncation_strategy='only_second')
        self.check_output_equal(truncate_ids, ids)
        self.check_output_equal(truncate_pair_ids, pair_ids[:-3])

    @assert_raises(ValueError)
    def test_truncate_do_not_truncate(self):
        ids = [1, 3, 4, 5, 8, 6, 7, 2]
        pair_ids = [12, 5, 11, 10, 13, 2]
        self.tokenizer.truncate_sequences(
            ids, pair_ids, 3, truncation_strategy='do_not_truncate')

    @assert_raises(ValueError)
    def test_truncate_error_strategy(self):
        ids = [1, 3, 4, 5, 8, 6, 7, 2]
        pair_ids = [12, 5, 11, 10, 13, 2]
        self.tokenizer.truncate_sequences(
            ids, pair_ids, 1, truncation_strategy='')

    def test_save_pretrained(self):
        curr_dir = os.path.dirname(os.path.realpath(__file__))
        model_path = os.path.join(curr_dir, "pretrained_model")
        if not os.path.exists(model_path):
            os.mkdir(model_path)
        self.tokenizer.save_pretrained(model_path)
        vocab_path = os.path.join(
            model_path, self.tokenizer.resource_files_names['vocab_file'])
        with open(vocab_path, "r") as fr:
            vocabs = [vocab.strip() for vocab in fr.readlines()]
        self.check_output_equal(vocabs, self.vocab)

    @assert_raises(RuntimeError)
    def test_from_pretrained_non_exist(self):
        RobertaTokenizer.from_pretrained("")

    def test_vocab_size(self):
        self.check_output_equal(self.tokenizer.vocab_size, len(self.vocab))

    def test_all_special_tokens(self):
        expected_special_tokens_set = set([
            self.tokenizer.pad_token, self.tokenizer.mask_token,
            self.tokenizer.cls_token, self.tokenizer.unk_token,
            self.tokenizer.sep_token
        ])
        self.check_output_equal(set(self.tokenizer.all_special_tokens),
                                expected_special_tokens_set)
        self.check_output_equal(set(self.tokenizer.all_special_ids),
                                set([0, 1, 2, 14, 15]))

    @assert_raises(ValueError)
    def test_non_exist_vocab_file(self):
        RobertaTokenizer("non_exist.txt", merges_file=None)
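# The tests above rely on ``CpuCommonTest.check_output_equal`` and the
# ``assert_raises`` decorator, both defined elsewhere in the test suite. The
# following stand-ins are assumptions, sketched only so the class can be read
# (or run) in isolation:
import unittest
import functools


class CpuCommonTest(unittest.TestCase):
    def check_output_equal(self, output, expected):
        # Thin wrapper over unittest's equality assertion.
        self.assertEqual(output, expected)


def assert_raises(exception):
    # Decorator form of ``assertRaises``: the wrapped test passes only if it
    # raises ``exception``.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            with self.assertRaises(exception):
                func(self, *args, **kwargs)
        return wrapper
    return decorator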