Python TOKENIZER_MAPPING 예제들, transformers.TOKENIZER_MAPPING Python 예제들

예제 #1

0

파일 보기

    def __new__(mcs, name, bases, dct):
        """Create the class after injecting one generated test per fast tokenizer.

        For each ``(configuration, model_architecture)`` pair found under the
        ``"model_mapping"`` and ``"tf_model_mapping"`` keys of ``dct``, and for
        each tokenizer class listed for that configuration in
        ``TOKENIZER_MAPPING`` whose class name ends with ``"Fast"``, a method
        named ``test_{pt|tf}_<Config>_<Model>_<Tokenizer>`` is added to ``dct``.

        Args:
            mcs: The metaclass itself.
            name: Name of the class being created.
            bases: Base classes of the class being created.
            dct: Namespace of the class being created; mutated in place.

        Returns:
            The class built by ``type.__new__`` from the augmented namespace.
        """

        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class):
            # `tokenizer_class` is not used inside the generated test: the
            # tokenizer is rebuilt from `checkpoint`. The caller only uses the
            # class to name the test.
            @skipIf(tiny_config is None, "TinyConfig does not exist")
            @skipIf(checkpoint is None, "checkpoint does not exist")
            def test(self):
                model = ModelClass(tiny_config)
                if hasattr(model, "eval"):
                    model = model.eval()
                try:
                    tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
                    # Not every config defines `max_position_embeddings`;
                    # without this guard a usable tokenizer was thrown away
                    # and the fallback below crashed on the same attribute.
                    if hasattr(model.config, "max_position_embeddings"):
                        tokenizer.model_max_length = model.config.max_position_embeddings
                # Rust panic exceptions are NOT Exception subclasses, hence the
                # bare except. Some test tokenizers contain broken vocabs or a
                # custom PreTokenizer, so we fall back to a default tokenizer
                # and hope for the best.
                except:  # noqa: E722
                    logger.warning(
                        f"Tokenizer cannot be created from checkpoint {checkpoint}"
                    )
                    tokenizer = get_tiny_tokenizer_from_checkpoint("gpt2")
                    if hasattr(model.config, "max_position_embeddings"):
                        tokenizer.model_max_length = model.config.max_position_embeddings
                self.run_pipeline_test(model, tokenizer)

            return test

        # The PyTorch and TensorFlow mappings are handled identically; only the
        # namespace key and the prefix of the generated test name differ.
        for prefix, key in (("pt", "model_mapping"),
                            ("tf", "tf_model_mapping")):
            mapping = dct.get(key, {})
            if not mapping:
                continue
            for configuration, model_architecture in mapping.items():
                checkpoint = get_checkpoint_from_architecture(
                    model_architecture)
                tiny_config = get_tiny_config_from_class(configuration)
                tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
                for tokenizer_class in tokenizer_classes:
                    if tokenizer_class is None:
                        continue
                    if not tokenizer_class.__name__.endswith("Fast"):
                        continue
                    test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_class.__name__}"
                    dct[test_name] = gen_test(model_architecture, checkpoint,
                                              tiny_config, tokenizer_class)

        return type.__new__(mcs, name, bases, dct)

예제 #2

0

파일 보기

    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """Instantiate the tokenizer class registered for a checkpoint's config.

        Args:
            pretrained_model_name_or_path: Checkpoint name or path. Names found
                in ``UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP`` are replaced by the
                mapped value when loading the configuration.
            *inputs: Positional arguments forwarded to the selected tokenizer's
                ``from_pretrained``.
            **kwargs: Keyword arguments forwarded likewise. ``config`` and
                ``use_fast`` are consumed here (``use_fast`` defaults to
                ``False``).

        Returns:
            Whatever the selected tokenizer class's ``from_pretrained`` returns.

        Raises:
            ValueError: If the configuration is not an instance of any class
                registered in ``TOKENIZER_MAPPING``.
        """
        config = kwargs.pop("config", None)
        if not isinstance(config, PretrainedConfig):
            # No ready-made config was supplied: resolve where to load it from.
            config_source = (
                UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
                if pretrained_model_name_or_path in UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP
                else pretrained_model_name_or_path
            )
            config = AutoConfig.from_pretrained(config_source, **kwargs)

        # Japanese BERT checkpoints are dispatched on their name alone.
        if "bert-base-japanese" in str(pretrained_model_name_or_path):
            return BertJapaneseTokenizer.from_pretrained(
                pretrained_model_name_or_path, *inputs, **kwargs)

        use_fast = kwargs.pop("use_fast", False)
        registry = transformers.TOKENIZER_MAPPING
        for config_class, (slow_tokenizer, fast_tokenizer) in registry.items():
            if not isinstance(config, config_class):
                continue
            # First matching entry wins; the fast variant is used only when it
            # exists and was explicitly requested.
            if fast_tokenizer and use_fast:
                selected = fast_tokenizer
            else:
                selected = slow_tokenizer
            return selected.from_pretrained(
                pretrained_model_name_or_path, *inputs, **kwargs)

        known_configs = ", ".join(c.__name__ for c in TOKENIZER_MAPPING)
        raise ValueError(
            "Unrecognized configuration class {} to build an AutoTokenizer.\n"
            "Model type should be one of {}.".format(config.__class__,
                                                     known_configs))

예제 #3

0

파일 보기

    def __new__(mcs, name, bases, dct):
        """Build the class after adding a generated test per fast tokenizer.

        The ``"model_mapping"`` and ``"tf_model_mapping"`` entries of ``dct``
        map a configuration class to one model architecture or a tuple of
        them. For each architecture and each tokenizer class listed for the
        configuration in ``TOKENIZER_MAPPING`` whose name ends with ``"Fast"``,
        a ``test_{pt|tf}_<Config>_<Model>_<Tokenizer>`` method is stored in
        ``dct``.

        Args:
            mcs: The metaclass itself.
            name: Name of the class being created.
            bases: Base classes of the class being created.
            dct: Namespace of the class being created; mutated in place.

        Returns:
            The class built by ``type.__new__`` from the augmented namespace.
        """

        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class):
            # `tokenizer_class` is unused in the generated test; the tokenizer
            # is rebuilt from `checkpoint`.
            @skipIf(tiny_config is None, "TinyConfig does not exist")
            @skipIf(checkpoint is None, "checkpoint does not exist")
            def test(self):
                model = ModelClass(tiny_config)
                if hasattr(model, "eval"):
                    model = model.eval()
                try:
                    tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
                    if hasattr(model.config, "max_position_embeddings"):
                        max_positions = model.config.max_position_embeddings
                        tokenizer.model_max_length = max_positions
                # Bare except on purpose: Rust panic exceptions are NOT
                # Exception subclasses. Tokenizers with broken vocabs or a
                # custom PreTokenizer end up here and the test is skipped.
                except:  # noqa: E722
                    self.skipTest(
                        f"Ignoring {ModelClass}, cannot create a simple tokenizer"
                    )
                self.run_pipeline_test(model, tokenizer)

            return test

        framework_keys = (("pt", "model_mapping"), ("tf", "tf_model_mapping"))
        for prefix, key in framework_keys:
            mapping = dct.get(key, {})
            if not mapping:
                continue
            for configuration, architectures in mapping.items():
                # A configuration may map to one architecture or to several.
                if not isinstance(architectures, tuple):
                    architectures = (architectures, )

                for architecture in architectures:
                    checkpoint = get_checkpoint_from_architecture(architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    candidates = TOKENIZER_MAPPING.get(configuration, [])
                    for tokenizer_class in candidates:
                        if tokenizer_class is None:
                            continue
                        if not tokenizer_class.__name__.endswith("Fast"):
                            continue
                        test_name = f"test_{prefix}_{configuration.__name__}_{architecture.__name__}_{tokenizer_class.__name__}"
                        dct[test_name] = gen_test(architecture, checkpoint,
                                                  tiny_config, tokenizer_class)

        return type.__new__(mcs, name, bases, dct)

예제 #4

0

파일 보기

    def __new__(mcs, name, bases, dct):
        """Build the class after injecting generated pipeline tests.

        For every configuration in the ``"model_mapping"`` and
        ``"tf_model_mapping"`` entries of ``dct``, every model architecture
        mapped to it, and every tokenizer class listed in
        ``TOKENIZER_MAPPING`` (or ``None`` when there is none), a test named
        ``test_{pt|tf}_<Config>_<Model>_<Tokenizer>_<FeatureExtractor>`` is
        added to ``dct``, provided a tokenizer class or a feature extractor
        class exists. ``test_small_model_pt`` and ``test_small_model_tf`` are
        also guaranteed to be present in the namespace.

        Args:
            mcs: The metaclass itself.
            name: Name of the class being created.
            bases: Base classes of the class being created.
            dct: Namespace of the class being created; mutated in place.

        Returns:
            The class built by ``type.__new__`` from the augmented namespace.
        """
        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class,
                     feature_extractor_class):
            """Return a test function closed over the given model and config.

            ``feature_extractor_class`` is not read inside the generated test;
            the feature extractor is obtained from ``checkpoint`` and
            ``tiny_config`` instead.
            """
            @skipIf(tiny_config is None, "TinyConfig does not exist")
            @skipIf(checkpoint is None, "checkpoint does not exist")
            def test(self):
                # NOTE: `tiny_config` is mutated in place here, and the same
                # object is passed to every test generated for one
                # (configuration, architecture) pair.
                if ModelClass.__name__.endswith("ForCausalLM"):
                    tiny_config.is_encoder_decoder = False
                    if hasattr(tiny_config, "encoder_no_repeat_ngram_size"):
                        # specific for blenderbot which supports both decoder-only
                        # encoder/decoder but the test config  only reflects
                        # encoder/decoder arch
                        tiny_config.encoder_no_repeat_ngram_size = 0
                if ModelClass.__name__.endswith("WithLMHead"):
                    tiny_config.is_decoder = True
                try:
                    model = ModelClass(tiny_config)
                except ImportError as e:
                    # A missing optional dependency skips the test rather than
                    # failing it.
                    self.skipTest(
                        f"Cannot run with {tiny_config} as the model requires a library that isn't installed: {e}"
                    )
                if hasattr(model, "eval"):
                    model = model.eval()
                if tokenizer_class is not None:
                    try:
                        tokenizer = get_tiny_tokenizer_from_checkpoint(
                            checkpoint)
                        # XLNet actually defines it as -1.
                        if isinstance(model.config,
                                      (RobertaConfig, IBertConfig)):
                            # These two config types get a limit two below the
                            # configured number of position embeddings.
                            tokenizer.model_max_length = model.config.max_position_embeddings - 2
                        elif (hasattr(model.config, "max_position_embeddings")
                              and model.config.max_position_embeddings > 0):
                            tokenizer.model_max_length = model.config.max_position_embeddings
                    # Rust Panic exception are NOT Exception subclass
                    # Some test tokenizer contain broken vocabs or custom PreTokenizer,
                    # so the bare except is deliberate and the test is skipped.
                    except:  # noqa: E722
                        self.skipTest(
                            f"Ignoring {ModelClass}, cannot create a simple tokenizer"
                        )
                else:
                    tokenizer = None
                feature_extractor = get_tiny_feature_extractor_from_checkpoint(
                    checkpoint, tiny_config)
                pipeline, examples = self.get_test_pipeline(
                    model, tokenizer, feature_extractor)
                if pipeline is None:
                    # The test can disable itself, but it should be very marginal
                    # Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist)
                    return
                self.run_pipeline_test(pipeline, examples)

                def run_batch_test(pipeline, examples):
                    """Run the pipeline over a small batched dataset; output is discarded."""
                    # A tokenizer without a pad token id cannot batch.
                    if pipeline.tokenizer is not None and pipeline.tokenizer.pad_token_id is None:
                        return  # No batching for this and it's OK

                    # 10 examples with batch size 4 means there needs to be a unfinished batch
                    # which is important for the unbatcher
                    # Need to copy because `Conversation` are stateful
                    dataset = [
                        copy.deepcopy(random.choice(examples))
                        for i in range(10)
                    ]

                    # Only iterating matters here; the items are not inspected.
                    for item in pipeline(dataset, batch_size=4):
                        pass

                run_batch_test(pipeline, examples)

            return test

        # The PyTorch and TensorFlow mappings are processed the same way; only
        # the namespace key and the test-name prefix differ.
        for prefix, key in [("pt", "model_mapping"),
                            ("tf", "tf_model_mapping")]:
            mapping = dct.get(key, {})
            if mapping:
                for configuration, model_architectures in mapping.items():
                    # A configuration may map to one architecture or a tuple.
                    if not isinstance(model_architectures, tuple):
                        model_architectures = (model_architectures, )

                    for model_architecture in model_architectures:
                        checkpoint = get_checkpoint_from_architecture(
                            model_architecture)
                        tiny_config = get_tiny_config_from_class(configuration)
                        tokenizer_classes = TOKENIZER_MAPPING.get(
                            configuration, [])
                        feature_extractor_class = FEATURE_EXTRACTOR_MAPPING.get(
                            configuration, None)
                        feature_extractor_name = (
                            feature_extractor_class.__name__ if
                            feature_extractor_class else "nofeature_extractor")
                        if not tokenizer_classes:
                            # We need to test even if there are no tokenizers.
                            tokenizer_classes = [None]
                        for tokenizer_class in tokenizer_classes:
                            if tokenizer_class is not None:
                                tokenizer_name = tokenizer_class.__name__
                            else:
                                tokenizer_name = "notokenizer"

                            test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_name}_{feature_extractor_name}"

                            # Nothing is generated when there is neither a
                            # tokenizer class nor a feature extractor class.
                            if tokenizer_class is not None or feature_extractor_class is not None:
                                dct[test_name] = gen_test(
                                    model_architecture,
                                    checkpoint,
                                    tiny_config,
                                    tokenizer_class,
                                    feature_extractor_class,
                                )

        # Placeholder used when the class body does not define the small-model
        # tests itself; it is marked abstract and raises if called.
        @abstractmethod
        def inner(self):
            raise NotImplementedError("Not implemented test")

        # Force these 2 methods to exist
        dct["test_small_model_pt"] = dct.get("test_small_model_pt", inner)
        dct["test_small_model_tf"] = dct.get("test_small_model_tf", inner)

        return type.__new__(mcs, name, bases, dct)

예제 #5

0

파일 보기

파일: test_pipelines_common.py 프로젝트: hSterz/adapter-transformers

    def __new__(mcs, name, bases, dct):
        """Build the class after injecting generated pipeline tests.

        For each configuration in the ``"model_mapping"`` and
        ``"tf_model_mapping"`` entries of ``dct``, each architecture mapped to
        it, and each tokenizer class listed in ``TOKENIZER_MAPPING`` (``None``
        when there is none), a test named
        ``test_{pt|tf}_<Config>_<Model>_<Tokenizer>_<FeatureExtractor>`` is
        stored in ``dct``, provided a tokenizer class or a feature extractor
        class exists. ``test_small_model_pt`` and ``test_small_model_tf`` are
        added when the namespace does not already define them.

        Args:
            mcs: The metaclass itself.
            name: Name of the class being created.
            bases: Base classes of the class being created.
            dct: Namespace of the class being created; mutated in place.

        Returns:
            The class built by ``type.__new__`` from the augmented namespace.
        """

        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class,
                     feature_extractor_class):
            # `feature_extractor_class` is not read by the generated test; the
            # extractor is obtained from `checkpoint` and `tiny_config`.
            @skipIf(tiny_config is None, "TinyConfig does not exist")
            @skipIf(checkpoint is None, "checkpoint does not exist")
            def test(self):
                # Adjust the shared tiny config to the kind of head under test.
                model_name = ModelClass.__name__
                if model_name.endswith("ForCausalLM"):
                    tiny_config.is_encoder_decoder = False
                if model_name.endswith("WithLMHead"):
                    tiny_config.is_decoder = True

                model = ModelClass(tiny_config)
                if hasattr(model, "eval"):
                    model = model.eval()

                if tokenizer_class is None:
                    tokenizer = None
                else:
                    try:
                        tokenizer = get_tiny_tokenizer_from_checkpoint(
                            checkpoint)
                        # XLNet actually defines it as -1, hence the > 0 test.
                        if hasattr(model.config, "max_position_embeddings"):
                            if model.config.max_position_embeddings > 0:
                                tokenizer.model_max_length = model.config.max_position_embeddings
                    # Bare except on purpose: Rust panic exceptions are NOT
                    # Exception subclasses. Tokenizers with broken vocabs or a
                    # custom PreTokenizer end up here and the test is skipped.
                    except:  # noqa: E722
                        self.skipTest(
                            f"Ignoring {ModelClass}, cannot create a simple tokenizer"
                        )

                feature_extractor = get_tiny_feature_extractor_from_checkpoint(
                    checkpoint, tiny_config)
                self.run_pipeline_test(model, tokenizer, feature_extractor)

            return test

        framework_keys = (("pt", "model_mapping"), ("tf", "tf_model_mapping"))
        for prefix, key in framework_keys:
            mapping = dct.get(key, {})
            if not mapping:
                continue
            for configuration, architectures in mapping.items():
                # A configuration may map to one architecture or to several.
                if not isinstance(architectures, tuple):
                    architectures = (architectures, )

                for architecture in architectures:
                    checkpoint = get_checkpoint_from_architecture(architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    tokenizer_classes = TOKENIZER_MAPPING.get(
                        configuration, [])
                    feature_extractor_class = FEATURE_EXTRACTOR_MAPPING.get(
                        configuration, None)
                    if feature_extractor_class:
                        feature_extractor_name = feature_extractor_class.__name__
                    else:
                        feature_extractor_name = "nofeature_extractor"
                    # We need to test even if there are no tokenizers.
                    tokenizer_classes = tokenizer_classes or [None]

                    for tokenizer_class in tokenizer_classes:
                        tokenizer_name = ("notokenizer"
                                          if tokenizer_class is None else
                                          tokenizer_class.__name__)
                        test_name = f"test_{prefix}_{configuration.__name__}_{architecture.__name__}_{tokenizer_name}_{feature_extractor_name}"

                        # With neither a tokenizer nor a feature extractor
                        # there is nothing to test.
                        if tokenizer_class is None and feature_extractor_class is None:
                            continue
                        dct[test_name] = gen_test(
                            architecture,
                            checkpoint,
                            tiny_config,
                            tokenizer_class,
                            feature_extractor_class,
                        )

        # Abstract placeholder for classes that do not define these tests.
        @abstractmethod
        def inner(self):
            raise NotImplementedError("Not implemented test")

        # Force these 2 methods to exist
        for required in ("test_small_model_pt", "test_small_model_tf"):
            dct.setdefault(required, inner)

        return type.__new__(mcs, name, bases, dct)

예제 #6

0

파일 보기

from transformers import CONFIG_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_NAMES_MAPPING, TOKENIZER_MAPPING
from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, BertConverter
from transformers.models.auto.modeling_auto import auto_class_factory

from .modeling.layoutlmv2 import (
    LayoutLMv2Config,
    LayoutLMv2ForTokenClassification,
    LayoutLMv2Tokenizer,
    LayoutLMv2TokenizerFast,
)

# Register LayoutLMv2 in the transformers lookup tables: model type -> config,
# model type -> display name, config -> (slow, fast) tokenizers, and slow
# tokenizer name -> converter.
CONFIG_MAPPING.update({"layoutlmv2": LayoutLMv2Config})
MODEL_NAMES_MAPPING.update({"layoutlmv2": "LayoutLMv2"})
TOKENIZER_MAPPING.update({
    LayoutLMv2Config: (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast),
})
SLOW_TO_FAST_CONVERTERS.update([("LayoutLMv2Tokenizer", BertConverter)])

# Extend the token-classification mapping, then build the auto class from the
# extended mapping.
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.update({
    LayoutLMv2Config: LayoutLMv2ForTokenClassification,
})
AutoModelForTokenClassification = auto_class_factory(
    "AutoModelForTokenClassification",
    MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
    head_doc="token classification",
)

예제 #7

0

파일 보기

def register_bert_model(bert_cls):
    """
    Class decorator that wraps a BertModel-inherited class and automatically:
        1. Creates an associated BertConfig
        2. Creates an associated BertForMaskedLM
        3. Creates an associated BertForSequenceClassification
        4. Creates an associated BertForQuestionAnswering
        5. Registers these classes with Transformers model mappings

    This last step updates the config, tokenizer, masked-LM,
    sequence-classification and question-answering mappings so that the
    resulting config and models may be used by AutoConfig,
    AutoModelForMaskedLM, and AutoModelForSequenceClassification.

    Assumptions are made to auto-name these classes and the corresponding model type.
    For instance, SparseBertModel will have model_type="sparse_bert" and associated
    classes like SparseBertConfig.

    To customize the inputs to the model's config, include the dataclass
    `bert_cls.ConfigKWargs`. This is, in fact, required. Upon initialization of the
    config, the fields of that dataclass will be used to extract extra keyword arguments
    and assign them as attributes to the config.

    Example
    ```
    @register_bert_model
    class SparseBertModel(BertModel):

        @dataclass
        class ConfigKWargs:
            # Keyword arguments to configure sparsity.
            sparsity: float = 0.9

        # Define __init__, etc.
        ...

    # Model is ready to auto load.
    config = AutoConfig.for_model("sparse_bert", sparsity=0.5)
    model = AutoModelForMaskedLM.from_config(config)

    config.sparsity
    >>> 0.5

    type(model)
    >>> SparseBertModelForMaskedLM
    ```

    :param bert_cls: class whose name ends with "BertModel"
    :return: `bert_cls` itself (with `config_class` set), so that the function
             can be applied with the `@register_bert_model` decorator syntax
             shown above without rebinding the class name to None
    """

    assert bert_cls.__name__.endswith("BertModel")

    # Get first part of name e.g. StaticSparseBertModel -> StaticSparse
    name_prefix = bert_cls.__name__.replace("BertModel", "")

    # Create new bert config and models based off of `bert_cls`.
    config_cls = create_config_class(bert_cls, name_prefix)
    masked_lm_cls = create_masked_lm_class(bert_cls, name_prefix)
    seq_classification_cls = create_sequence_classification_class(bert_cls, name_prefix)
    question_answering_cls = create_question_answering_class(bert_cls, name_prefix)

    # Specify the correct config class
    bert_cls.config_class = config_cls
    masked_lm_cls.config_class = config_cls
    seq_classification_cls.config_class = config_cls
    question_answering_cls.config_class = config_cls

    # Update Transformers mappings to auto-load these new models.
    CONFIG_MAPPING.update({
        config_cls.model_type: config_cls
    })
    TOKENIZER_MAPPING.update({
        config_cls: (BertTokenizer, BertTokenizerFast),
    })
    MODEL_FOR_MASKED_LM_MAPPING.update({
        config_cls: masked_lm_cls,
    })
    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.update({
        config_cls: seq_classification_cls
    })
    MODEL_FOR_QUESTION_ANSWERING_MAPPING.update({
        config_cls: question_answering_cls
    })

    # Update the `models` modules so that these classes may be imported.
    __models_dict__.update({
        config_cls.__name__: config_cls,
        masked_lm_cls.__name__: masked_lm_cls,
        seq_classification_cls.__name__: seq_classification_cls,
        question_answering_cls.__name__: question_answering_cls,
    })

    # A decorator's return value replaces the decorated name; hand the class
    # back so `SparseBertModel` in the example still refers to the class.
    return bert_cls

예제 #8

0

파일 보기

    MarkupLMTokenizer,
    MarkupLMForQuestionAnswering,
    MarkupLMForTokenClassification,
    MarkupLMTokenizerFast,
)

# Register MarkupLM in the transformers lookup tables: model type -> config and
# model type -> display name.
CONFIG_MAPPING.update({"markuplm": MarkupLMConfig})
MODEL_NAMES_MAPPING.update({"markuplm": "MarkupLM"})

# Config -> (slow, fast) tokenizer pair, and slow tokenizer name -> converter.
TOKENIZER_MAPPING.update({
    MarkupLMConfig: (MarkupLMTokenizer, MarkupLMTokenizerFast),
})
SLOW_TO_FAST_CONVERTERS.update([("MarkupLMTokenizer", RobertaConverter)])

# Config -> task-specific model classes.
MODEL_FOR_QUESTION_ANSWERING_MAPPING.update({
    MarkupLMConfig: MarkupLMForQuestionAnswering,
})
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.update({
    MarkupLMConfig: MarkupLMForTokenClassification,
})

예제 #9

0

파일 보기

# Licensed under the MIT License.
import transformers
from fastseq.models.unilm_hf.configuration_unilm import (
    UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP, UnilmConfig)
from fastseq.models.unilm_hf.modeling_unilm import UnilmForSeq2Seq
from fastseq.models.unilm_hf.tokenization_unilm import UnilmTokenizer
from fastseq.utils.api_decorator import replace
from transformers import (CONFIG_MAPPING,
                          MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
                          TOKENIZER_MAPPING, AutoConfig, AutoModelForSeq2SeqLM,
                          AutoTokenizer, PretrainedConfig)

# Register UniLM in the transformers lookup tables: model type -> config,
# config -> seq2seq model class, and config -> (slow, fast) tokenizer pair.
# There is no fast tokenizer, hence the None.
CONFIG_MAPPING['unilm'] = UnilmConfig
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING[UnilmConfig] = UnilmForSeq2Seq
TOKENIZER_MAPPING[UnilmConfig] = (UnilmTokenizer, None)
# Push the BertConfig entry to the end of the ordered mapping so that it is
# iterated after the UnilmConfig entry added above.
# NOTE(review): presumably UnilmConfig subclasses BertConfig and lookups take
# the first isinstance match, which would make this ordering required - confirm.
TOKENIZER_MAPPING.move_to_end(transformers.configuration_bert.BertConfig)


@replace(AutoModelForSeq2SeqLM)
class AutoModelForSeq2SeqLMV2(AutoModelForSeq2SeqLM):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
                        **kwargs):
        config = kwargs.pop("config", None)
        if not isinstance(config, PretrainedConfig):
            if pretrained_model_name_or_path in UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP:
                pretrained_config_name_or_path = UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP[
                    pretrained_model_name_or_path]
            else:
                pretrained_config_name_or_path = pretrained_model_name_or_path
            config = AutoConfig.from_pretrained(pretrained_config_name_or_path,