def test_new_processor_registration(self): try: AutoConfig.register("custom", CustomConfig) AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor) AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer) AutoProcessor.register(CustomConfig, CustomProcessor) # Trying to register something existing in the Transformers library will raise an error with self.assertRaises(ValueError): AutoProcessor.register(Wav2Vec2Config, Wav2Vec2Processor) # Now that the config is registered, it can be used as any other config with the auto-API feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR) with tempfile.TemporaryDirectory() as tmp_dir: vocab_file = os.path.join(tmp_dir, "vocab.txt") with open(vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens])) tokenizer = CustomTokenizer(vocab_file) processor = CustomProcessor(feature_extractor, tokenizer) with tempfile.TemporaryDirectory() as tmp_dir: processor.save_pretrained(tmp_dir) new_processor = AutoProcessor.from_pretrained(tmp_dir) self.assertIsInstance(new_processor, CustomProcessor) finally: if "custom" in CONFIG_MAPPING._extra_content: del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content: del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig] if CustomConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[CustomConfig] if CustomConfig in PROCESSOR_MAPPING._extra_content: del PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_new_tokenizer_registration(self): try: AutoConfig.register("new-model", NewConfig) AutoTokenizer.register(NewConfig, slow_tokenizer_class=NewTokenizer) # Trying to register something existing in the Transformers library will raise an error with self.assertRaises(ValueError): AutoTokenizer.register(BertConfig, slow_tokenizer_class=BertTokenizer) tokenizer = NewTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER) with tempfile.TemporaryDirectory() as tmp_dir: tokenizer.save_pretrained(tmp_dir) new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir) self.assertIsInstance(new_tokenizer, NewTokenizer) finally: if "new-model" in CONFIG_MAPPING._extra_content: del CONFIG_MAPPING._extra_content["new-model"] if NewConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[NewConfig]
def test_new_tokenizer_fast_registration(self): try: AutoConfig.register("custom", CustomConfig) # Can register in two steps AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer) self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, None)) AutoTokenizer.register(CustomConfig, fast_tokenizer_class=CustomTokenizerFast) self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast)) del TOKENIZER_MAPPING._extra_content[CustomConfig] # Can register in one step AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast) self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast)) # Trying to register something existing in the Transformers library will raise an error with self.assertRaises(ValueError): AutoTokenizer.register(BertConfig, fast_tokenizer_class=BertTokenizerFast) # We pass through a bert tokenizer fast cause there is no converter slow to fast for our new toknizer # and that model does not have a tokenizer.json with tempfile.TemporaryDirectory() as tmp_dir: bert_tokenizer = BertTokenizerFast.from_pretrained( SMALL_MODEL_IDENTIFIER) bert_tokenizer.save_pretrained(tmp_dir) tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir) with tempfile.TemporaryDirectory() as tmp_dir: tokenizer.save_pretrained(tmp_dir) new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir) self.assertIsInstance(new_tokenizer, CustomTokenizerFast) new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, use_fast=False) self.assertIsInstance(new_tokenizer, CustomTokenizer) finally: if "custom" in CONFIG_MAPPING._extra_content: del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[CustomConfig]