def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest( f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [f"<extra_id_{i}>" for i in range(100) ] + [AddedToken("<special>", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( pretrained_name, additional_special_tokens=added_tokens, **kwargs) tokenizer_cr = self.rust_tokenizer_class.from_pretrained( pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True) tokenizer_p = self.tokenizer_class.from_pretrained( pretrained_name, additional_special_tokens=added_tokens, **kwargs) p_output = tokenizer_p.encode("Hey this is a <special> token") r_output = tokenizer_r.encode("Hey this is a <special> token") cr_output = tokenizer_cr.encode( "Hey this is a <special> token") special_token_id = tokenizer_r.encode( "<special>", add_special_tokens=False)[0] self.assertEqual(p_output, r_output) self.assertEqual(cr_output, r_output) self.assertTrue(special_token_id in p_output) self.assertTrue(special_token_id in r_output) self.assertTrue(special_token_id in cr_output)
def test_space_encoding(self): tokenizer = self.get_tokenizer() sequence = "Encode this sequence." space_encoding = tokenizer.byte_encoder[" ".encode("utf-8")[0]] # Testing encoder arguments encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=False) first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0] self.assertNotEqual(first_char, space_encoding) encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True) first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0] self.assertEqual(first_char, space_encoding) tokenizer.add_special_tokens({"bos_token": "<s>"}) encoded = tokenizer.encode(sequence, add_special_tokens=True) first_char = tokenizer.convert_ids_to_tokens(encoded[1])[0] self.assertNotEqual(first_char, space_encoding) # Testing spaces after special tokens mask = "<mask>" tokenizer.add_special_tokens({ "mask_token": AddedToken(mask, lstrip=True, rstrip=False) }) # mask token has a left space mask_ind = tokenizer.convert_tokens_to_ids(mask) sequence = "Encode <mask> sequence" sequence_nospace = "Encode <mask>sequence" encoded = tokenizer.encode(sequence) mask_loc = encoded.index(mask_ind) first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0] self.assertEqual(first_char, space_encoding) encoded = tokenizer.encode(sequence_nospace) mask_loc = encoded.index(mask_ind) first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0] self.assertNotEqual(first_char, space_encoding)
def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest( f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("<special>", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( pretrained_name, additional_special_tokens=added_tokens, **kwargs) r_output = tokenizer_r.encode("Hey this is a <special> token") special_token_id = tokenizer_r.encode( "<special>", add_special_tokens=False)[0] self.assertTrue(special_token_id in r_output) if self.test_slow_tokenizer: tokenizer_cr = self.rust_tokenizer_class.from_pretrained( pretrained_name, additional_special_tokens=added_tokens, ** kwargs, # , from_slow=True <- unfortunately too slow to convert ) tokenizer_p = self.tokenizer_class.from_pretrained( pretrained_name, additional_special_tokens=added_tokens, **kwargs) p_output = tokenizer_p.encode( "Hey this is a <special> token") cr_output = tokenizer_cr.encode( "Hey this is a <special> token") self.assertEqual(p_output, r_output) self.assertEqual(cr_output, r_output) self.assertTrue(special_token_id in p_output) self.assertTrue(special_token_id in cr_output)
def test_special_tokens_initialization_with_non_empty_additional_special_tokens( self): tokenizer_list = [] if self.test_slow_tokenizer: tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) if self.test_rust_tokenizer: tokenizer_list.append( (self.rust_tokenizer_class, self.get_rust_tokenizer())) for tokenizer_class, tokenizer_utils in tokenizer_list: with tempfile.TemporaryDirectory() as tmp_dir: tokenizer_utils.save_pretrained(tmp_dir) with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: special_tokens_map = json.load(json_file) with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: tokenizer_config = json.load(json_file) added_tokens_extra_ids = [ f"<extra_id_{i}>" for i in range(125) ] special_tokens_map[ "additional_special_tokens"] = added_tokens_extra_ids + [ "an_additional_special_token" ] tokenizer_config[ "additional_special_tokens"] = added_tokens_extra_ids + [ "an_additional_special_token" ] with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: json.dump(special_tokens_map, outfile) with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: json.dump(tokenizer_config, outfile) # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and # "special_tokens_map.json" files tokenizer_without_change_in_init = tokenizer_class.from_pretrained( tmp_dir, ) self.assertIn( "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens) self.assertEqual( ["an_additional_special_token"], tokenizer_without_change_in_init.convert_ids_to_tokens( tokenizer_without_change_in_init.convert_tokens_to_ids( ["an_additional_special_token"])), ) # Now we test that we can change the value of additional_special_tokens in the from_pretrained new_added_tokens = added_tokens_extra_ids + [ AddedToken("a_new_additional_special_token", lstrip=True) ] tokenizer = tokenizer_class.from_pretrained( tmp_dir, additional_special_tokens=new_added_tokens, ) self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens) self.assertEqual( ["a_new_additional_special_token"], tokenizer.convert_ids_to_tokens( tokenizer.convert_tokens_to_ids( ["a_new_additional_special_token"])), )