def __new__(mcs, name, bases, dct):
    def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class):
        @skipIf(tiny_config is None, "TinyConfig does not exist")
        @skipIf(checkpoint is None, "checkpoint does not exist")
        def test(self):
            model = ModelClass(tiny_config)
            if hasattr(model, "eval"):
                model = model.eval()
            try:
                tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
                tokenizer.model_max_length = model.config.max_position_embeddings
            # Rust Panic exceptions are NOT Exception subclasses.
            # Some test tokenizers contain broken vocabs or a custom PreTokenizer, so we
            # provide a default tokenizer and hope for the best.
            except:  # noqa: E722
                logger.warning(f"Tokenizer cannot be created from checkpoint {checkpoint}")
                tokenizer = get_tiny_tokenizer_from_checkpoint("gpt2")
                tokenizer.model_max_length = model.config.max_position_embeddings
            self.run_pipeline_test(model, tokenizer)

        return test

    mapping = dct.get("model_mapping", {})
    if mapping:
        for configuration, model_architecture in mapping.items():
            checkpoint = get_checkpoint_from_architecture(model_architecture)
            tiny_config = get_tiny_config_from_class(configuration)
            tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
            for tokenizer_class in tokenizer_classes:
                if tokenizer_class is not None and tokenizer_class.__name__.endswith("Fast"):
                    test_name = f"test_pt_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_class.__name__}"
                    dct[test_name] = gen_test(model_architecture, checkpoint, tiny_config, tokenizer_class)

    tf_mapping = dct.get("tf_model_mapping", {})
    if tf_mapping:
        for configuration, model_architecture in tf_mapping.items():
            checkpoint = get_checkpoint_from_architecture(model_architecture)
            tiny_config = get_tiny_config_from_class(configuration)
            tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
            for tokenizer_class in tokenizer_classes:
                if tokenizer_class is not None and tokenizer_class.__name__.endswith("Fast"):
                    test_name = f"test_tf_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_class.__name__}"
                    dct[test_name] = gen_test(model_architecture, checkpoint, tiny_config, tokenizer_class)

    return type.__new__(mcs, name, bases, dct)
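
# The version above injects one `test_*` method per (configuration, architecture,
# tokenizer) combination into the class namespace before `type.__new__` builds the
# class, which is how unittest ends up discovering the generated tests. A minimal,
# self-contained sketch of that dynamic-test-generation pattern (all names below
# are illustrative, not taken from the code above):

import unittest


class DynamicTestMeta(type):
    def __new__(mcs, name, bases, dct):
        def gen_test(value):
            # Bind `value` through the closure so each generated test keeps its own copy.
            def test(self):
                self.assertIsInstance(value, int)

            return test

        for value in dct.get("values", []):
            # unittest collects any attribute whose name starts with "test".
            dct[f"test_value_{value}"] = gen_test(value)
        return type.__new__(mcs, name, bases, dct)


class ExampleTests(unittest.TestCase, metaclass=DynamicTestMeta):
    values = [1, 2, 3]  # produces test_value_1, test_value_2, test_value_3
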
def __new__(mcs, name, bases, dct):
    def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class):
        @skipIf(tiny_config is None, "TinyConfig does not exist")
        @skipIf(checkpoint is None, "checkpoint does not exist")
        def test(self):
            model = ModelClass(tiny_config)
            if hasattr(model, "eval"):
                model = model.eval()
            try:
                tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
                if hasattr(model.config, "max_position_embeddings"):
                    tokenizer.model_max_length = model.config.max_position_embeddings
            # Rust Panic exceptions are NOT Exception subclasses.
            # Some test tokenizers contain broken vocabs or a custom PreTokenizer, so we
            # skip the test rather than failing on them.
            except:  # noqa: E722
                self.skipTest(f"Ignoring {ModelClass}, cannot create a simple tokenizer")
            self.run_pipeline_test(model, tokenizer)

        return test

    for prefix, key in [("pt", "model_mapping"), ("tf", "tf_model_mapping")]:
        mapping = dct.get(key, {})
        if mapping:
            for configuration, model_architectures in mapping.items():
                if not isinstance(model_architectures, tuple):
                    model_architectures = (model_architectures,)
                for model_architecture in model_architectures:
                    checkpoint = get_checkpoint_from_architecture(model_architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
                    for tokenizer_class in tokenizer_classes:
                        if tokenizer_class is not None and tokenizer_class.__name__.endswith("Fast"):
                            test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_class.__name__}"
                            dct[test_name] = gen_test(model_architecture, checkpoint, tiny_config, tokenizer_class)

    return type.__new__(mcs, name, bases, dct)
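
# Usage sketch for the metaclass above: a concrete pipeline suite sets the
# metaclass plus the model mappings and implements `run_pipeline_test`. The
# metaclass name `PipelineTestCaseMeta` and the mapping constant are assumptions
# modeled on the transformers test-suite conventions, not verified against the
# surrounding file:

import unittest

from transformers import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING


class TextClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
    model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING

    def run_pipeline_test(self, model, tokenizer):
        # Exercise the pipeline built from the tiny model/tokenizer pair that
        # the metaclass wired into this generated test.
        ...
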
def __new__(mcs, name, bases, dct):
    def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class, feature_extractor_class):
        @skipIf(tiny_config is None, "TinyConfig does not exist")
        @skipIf(checkpoint is None, "checkpoint does not exist")
        def test(self):
            if ModelClass.__name__.endswith("ForCausalLM"):
                tiny_config.is_encoder_decoder = False
                if hasattr(tiny_config, "encoder_no_repeat_ngram_size"):
                    # specific to blenderbot, which supports both decoder-only and
                    # encoder/decoder, but whose test config only reflects the
                    # encoder/decoder arch
                    tiny_config.encoder_no_repeat_ngram_size = 0
            if ModelClass.__name__.endswith("WithLMHead"):
                tiny_config.is_decoder = True
            try:
                model = ModelClass(tiny_config)
            except ImportError as e:
                self.skipTest(
                    f"Cannot run with {tiny_config} as the model requires a library that isn't installed: {e}"
                )
            if hasattr(model, "eval"):
                model = model.eval()
            if tokenizer_class is not None:
                try:
                    tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
                    # XLNet actually defines it as -1.
                    if isinstance(model.config, (RobertaConfig, IBertConfig)):
                        tokenizer.model_max_length = model.config.max_position_embeddings - 2
                    elif (
                        hasattr(model.config, "max_position_embeddings")
                        and model.config.max_position_embeddings > 0
                    ):
                        tokenizer.model_max_length = model.config.max_position_embeddings
                # Rust Panic exceptions are NOT Exception subclasses.
                # Some test tokenizers contain broken vocabs or a custom PreTokenizer, so we
                # skip the test rather than failing on them.
                except:  # noqa: E722
                    self.skipTest(f"Ignoring {ModelClass}, cannot create a simple tokenizer")
            else:
                tokenizer = None
            feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)

            pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor)
            if pipeline is None:
                # The test can disable itself, but it should be very marginal.
                # Concerns: Wav2Vec2ForCTC without tokenizer test (no FastTokenizer exists)
                return
            self.run_pipeline_test(pipeline, examples)

            def run_batch_test(pipeline, examples):
                # Need to copy because `Conversation` objects are stateful
                if pipeline.tokenizer is not None and pipeline.tokenizer.pad_token_id is None:
                    return  # No batching for this and it's OK

                # 10 examples with batch size 4 means there needs to be an unfinished batch,
                # which is important for the unbatcher
                dataset = [copy.deepcopy(random.choice(examples)) for _ in range(10)]

                for item in pipeline(dataset, batch_size=4):
                    pass

            run_batch_test(pipeline, examples)

        return test

    for prefix, key in [("pt", "model_mapping"), ("tf", "tf_model_mapping")]:
        mapping = dct.get(key, {})
        if mapping:
            for configuration, model_architectures in mapping.items():
                if not isinstance(model_architectures, tuple):
                    model_architectures = (model_architectures,)
                for model_architecture in model_architectures:
                    checkpoint = get_checkpoint_from_architecture(model_architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
                    feature_extractor_class = FEATURE_EXTRACTOR_MAPPING.get(configuration, None)
                    feature_extractor_name = (
                        feature_extractor_class.__name__ if feature_extractor_class else "nofeature_extractor"
                    )
                    if not tokenizer_classes:
                        # We need to test even if there are no tokenizers.
                        tokenizer_classes = [None]
                    for tokenizer_class in tokenizer_classes:
                        if tokenizer_class is not None:
                            tokenizer_name = tokenizer_class.__name__
                        else:
                            tokenizer_name = "notokenizer"

                        test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_name}_{feature_extractor_name}"

                        if tokenizer_class is not None or feature_extractor_class is not None:
                            dct[test_name] = gen_test(
                                model_architecture,
                                checkpoint,
                                tiny_config,
                                tokenizer_class,
                                feature_extractor_class,
                            )

    @abstractmethod
    def inner(self):
        raise NotImplementedError("Not implemented test")

    # Force these 2 methods to exist
    dct["test_small_model_pt"] = dct.get("test_small_model_pt", inner)
    dct["test_small_model_tf"] = dct.get("test_small_model_tf", inner)

    return type.__new__(mcs, name, bases, dct)
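
# Why `run_batch_test` above uses 10 examples with batch_size=4: 10 = 4 + 4 + 2,
# so the final batch is partial, and the unbatching path that splits batched model
# output back into per-example results gets exercised on a batch shorter than
# batch_size. A tiny illustration of that remainder arithmetic (helper name is
# illustrative, not from the code above):

def batch_sizes(n_examples, batch_size):
    """Return the size of each successive batch, including a trailing partial one."""
    full, remainder = divmod(n_examples, batch_size)
    return [batch_size] * full + ([remainder] if remainder else [])


assert batch_sizes(10, 4) == [4, 4, 2]  # the unfinished final batch the comment mentions
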
def __new__(mcs, name, bases, dct):
    def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class, feature_extractor_class):
        @skipIf(tiny_config is None, "TinyConfig does not exist")
        @skipIf(checkpoint is None, "checkpoint does not exist")
        def test(self):
            if ModelClass.__name__.endswith("ForCausalLM"):
                tiny_config.is_encoder_decoder = False
            if ModelClass.__name__.endswith("WithLMHead"):
                tiny_config.is_decoder = True
            model = ModelClass(tiny_config)
            if hasattr(model, "eval"):
                model = model.eval()
            if tokenizer_class is not None:
                try:
                    tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
                    # XLNet actually defines it as -1.
                    if (
                        hasattr(model.config, "max_position_embeddings")
                        and model.config.max_position_embeddings > 0
                    ):
                        tokenizer.model_max_length = model.config.max_position_embeddings
                # Rust Panic exceptions are NOT Exception subclasses.
                # Some test tokenizers contain broken vocabs or a custom PreTokenizer, so we
                # skip the test rather than failing on them.
                except:  # noqa: E722
                    self.skipTest(f"Ignoring {ModelClass}, cannot create a simple tokenizer")
            else:
                tokenizer = None
            feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)
            self.run_pipeline_test(model, tokenizer, feature_extractor)

        return test

    for prefix, key in [("pt", "model_mapping"), ("tf", "tf_model_mapping")]:
        mapping = dct.get(key, {})
        if mapping:
            for configuration, model_architectures in mapping.items():
                if not isinstance(model_architectures, tuple):
                    model_architectures = (model_architectures,)
                for model_architecture in model_architectures:
                    checkpoint = get_checkpoint_from_architecture(model_architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
                    feature_extractor_class = FEATURE_EXTRACTOR_MAPPING.get(configuration, None)
                    feature_extractor_name = (
                        feature_extractor_class.__name__ if feature_extractor_class else "nofeature_extractor"
                    )
                    if not tokenizer_classes:
                        # We need to test even if there are no tokenizers.
                        tokenizer_classes = [None]
                    for tokenizer_class in tokenizer_classes:
                        if tokenizer_class is not None:
                            tokenizer_name = tokenizer_class.__name__
                        else:
                            tokenizer_name = "notokenizer"

                        test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_name}_{feature_extractor_name}"

                        if tokenizer_class is not None or feature_extractor_class is not None:
                            dct[test_name] = gen_test(
                                model_architecture,
                                checkpoint,
                                tiny_config,
                                tokenizer_class,
                                feature_extractor_class,
                            )

    @abstractmethod
    def inner(self):
        raise NotImplementedError("Not implemented test")

    # Force these 2 methods to exist
    dct["test_small_model_pt"] = dct.get("test_small_model_pt", inner)
    dct["test_small_model_tf"] = dct.get("test_small_model_tf", inner)

    return type.__new__(mcs, name, bases, dct)
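
# Note on the `dct.get(name, inner)` fallback above: a method the test class
# already defines is kept, while a missing one is replaced by the stub, so every
# suite built with this metaclass exposes both test_small_model_pt and
# test_small_model_tf, and a forgotten one fails loudly with NotImplementedError
# instead of silently not running. Tiny illustration (dict contents are illustrative):

def inner(self):
    raise NotImplementedError("Not implemented test")


dct = {"test_small_model_pt": lambda self: None}
dct["test_small_model_pt"] = dct.get("test_small_model_pt", inner)
dct["test_small_model_tf"] = dct.get("test_small_model_tf", inner)

assert dct["test_small_model_pt"] is not inner  # user-provided method kept
assert dct["test_small_model_tf"] is inner  # stub installed for the missing one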