Example #1

import unittest
import warnings
from dataclasses import dataclass

from transformers.convert_slow_tokenizer import SpmConverter
from transformers.testing_utils import get_tests_dir


@dataclass
class FakeOriginalTokenizer:
    # Minimal stand-in for the slow tokenizer that SpmConverter expects; the
    # converter only reads the path to the sentencepiece vocab file from it.
    vocab_file: str


class ConvertSlowTokenizerTest(unittest.TestCase):
    def test_spm_converter_bytefallback_warning(self):
        spm_model_file_without_bytefallback = get_tests_dir(
            "fixtures/test_sentencepiece.model")
        spm_model_file_with_bytefallback = get_tests_dir(
            "fixtures/test_sentencepiece_with_bytefallback.model")

        original_tokenizer_without_bytefallback = FakeOriginalTokenizer(
            vocab_file=spm_model_file_without_bytefallback)

        with warnings.catch_warnings(record=True) as w:
            _ = SpmConverter(original_tokenizer_without_bytefallback)
        self.assertEqual(len(w), 0)

        original_tokenizer_with_bytefallback = FakeOriginalTokenizer(
            vocab_file=spm_model_file_with_bytefallback)

        with warnings.catch_warnings(record=True) as w:
            _ = SpmConverter(original_tokenizer_with_bytefallback)
        self.assertEqual(len(w), 1)
        self.assertIn(
            "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
            " which is not implemented in the fast tokenizers.",
            str(w[0].message),
        )
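
For reference, byte fallback is a property recorded in the sentencepiece model itself. A minimal sketch of checking it directly, assuming the protobuf module that ships with the sentencepiece package:

from sentencepiece import sentencepiece_model_pb2


def uses_byte_fallback(model_path):
    # Parse the serialized ModelProto and read the trainer_spec flag that
    # SpmConverter's warning is based on.
    proto = sentencepiece_model_pb2.ModelProto()
    with open(model_path, "rb") as f:
        proto.ParseFromString(f.read())
    return proto.trainer_spec.byte_fallback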
Example #2

import unittest

from transformers import BertGenerationTokenizer
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_torch, slow
from transformers.utils import cached_property

from ...test_tokenization_common import TokenizerTesterMixin

SPIECE_UNDERLINE = "▁"

SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")


@require_sentencepiece
class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = BertGenerationTokenizer
    test_rust_tokenizer = False
    test_sentencepiece = True

    def setUp(self):
        super().setUp()

        tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)
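
A short usage sketch with the same fixture (keep_accents mirrors the setUp call above):

tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokens = tokenizer.tokenize("This is a test")
ids = tokenizer.convert_tokens_to_ids(tokens)
text = tokenizer.decode(ids)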
Example #3


import unittest
from typing import Tuple

from transformers.models.mluke.tokenization_mluke import MLukeTokenizer
from transformers.testing_utils import get_tests_dir, require_torch, slow

from ...test_tokenization_common import TokenizerTesterMixin


SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
SAMPLE_ENTITY_VOCAB = get_tests_dir("fixtures/test_entity_vocab.json")


class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = MLukeTokenizer
    test_rust_tokenizer = False
    from_pretrained_kwargs = {"cls_token": "<s>"}

    def setUp(self):
        super().setUp()

        self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}

    def get_tokenizer(self, task=None, **kwargs):
        kwargs.update(self.special_tokens_map)
        tokenizer = MLukeTokenizer(vocab_file=SAMPLE_VOCAB, entity_vocab_file=SAMPLE_ENTITY_VOCAB, task=task, **kwargs)
        return tokenizer
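
A usage sketch (assumptions: the task name and the character-offset entity span follow the MLuke API; the span below covers "Tokyo"):

tokenizer = MLukeTokenizer(
    vocab_file=SAMPLE_VOCAB,
    entity_vocab_file=SAMPLE_ENTITY_VOCAB,
    task="entity_classification",
)
encoding = tokenizer("Tokyo is the capital of Japan.", entity_spans=[(0, 5)])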
Example #4
import sys
import tempfile
import unittest
from pathlib import Path

from transformers import (
    AutoProcessor,
    Wav2Vec2Config,
    Wav2Vec2Processor,
)
from transformers.testing_utils import PASS, USER, get_tests_dir, is_staging_test
from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
from transformers.utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available


sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))

from test_module.custom_configuration import CustomConfig  # noqa E402
from test_module.custom_feature_extraction import CustomFeatureExtractor  # noqa E402
from test_module.custom_processing import CustomProcessor  # noqa E402
from test_module.custom_tokenization import CustomTokenizer  # noqa E402


SAMPLE_PROCESSOR_CONFIG = get_tests_dir("fixtures/dummy_feature_extractor_config.json")
SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json")
SAMPLE_PROCESSOR_CONFIG_DIR = get_tests_dir("fixtures")


class AutoFeatureExtractorTest(unittest.TestCase):
    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]

    def test_processor_from_model_shortcut(self):
        processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
        self.assertIsInstance(processor, Wav2Vec2Processor)

    def test_processor_from_local_directory_from_repo(self):
        with tempfile.TemporaryDirectory() as tmpdirname:
            model_config = Wav2Vec2Config()
            processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")

            # save the config and processor into the temp dir, then reload from disk
            model_config.save_pretrained(tmpdirname)
            processor.save_pretrained(tmpdirname)

            processor = AutoProcessor.from_pretrained(tmpdirname)

        self.assertIsInstance(processor, Wav2Vec2Processor)
Example #5

import unittest
from typing import Tuple

from transformers import AddedToken, LukeTokenizer
from transformers.testing_utils import get_tests_dir, require_torch, slow

from ...test_tokenization_common import TokenizerTesterMixin

SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json")
SAMPLE_MERGE_FILE = get_tests_dir("fixtures/merges.txt")
SAMPLE_ENTITY_VOCAB = get_tests_dir("fixtures/test_entity_vocab.json")


class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = LukeTokenizer
    test_rust_tokenizer = False
    from_pretrained_kwargs = {"cls_token": "<s>"}

    def setUp(self):
        super().setUp()

        self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}
Example #6

import importlib.util
import sys
import unittest
from pathlib import Path

import transformers.models.auto
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
from transformers.models.bert.configuration_bert import BertConfig
from transformers.models.roberta.configuration_roberta import RobertaConfig
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, get_tests_dir


sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))

from test_module.custom_configuration import CustomConfig  # noqa E402


SAMPLE_ROBERTA_CONFIG = get_tests_dir("fixtures/dummy-config.json")


class AutoConfigTest(unittest.TestCase):
    def test_module_spec(self):
        self.assertIsNotNone(transformers.models.auto.__spec__)
        self.assertIsNotNone(importlib.util.find_spec("transformers.models.auto"))

    def test_config_from_model_shortcut(self):
        config = AutoConfig.from_pretrained("bert-base-uncased")
        self.assertIsInstance(config, BertConfig)

    def test_config_model_type_from_local_file(self):
        config = AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG)
        self.assertIsInstance(config, RobertaConfig)
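
For context on the CONFIG_MAPPING import above: it maps model-type strings to config classes, which is how AutoConfig resolves the "model_type" field of a local JSON config. A quick sketch:

# "bert" resolves to BertConfig through the same mapping AutoConfig consults
assert CONFIG_MAPPING["bert"] is BertConfig
config = CONFIG_MAPPING["bert"]()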
Example #7

import sys
from pathlib import Path

from transformers import is_torch_available, is_vision_available
from transformers.testing_utils import get_tests_dir

sys.path.append(str(Path(__file__).parent.parent / "utils"))

from test_module.custom_feature_extraction import CustomFeatureExtractor  # noqa E402


if is_torch_available():
    import numpy as np
    import torch

if is_vision_available():
    from PIL import Image


SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures")


def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False):
    """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
    or a list of PyTorch tensors if one specifies torchify=True.
    """

    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"

    if equal_resolution:
        image_inputs = []
        for i in range(feature_extract_tester.batch_size):
            image_inputs.append(
                np.random.randint(
                    255,
                    size=(
                        feature_extract_tester.num_channels,
                        feature_extract_tester.max_resolution,
                        feature_extract_tester.max_resolution,
                    ),
                    dtype=np.uint8,
                )
            )
Example #8

import io
import unittest

try:
    from .utils import calculate_bleu
except ImportError:
    from utils import calculate_bleu

import json

from parameterized import parameterized
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from transformers.testing_utils import get_tests_dir, require_torch, slow, torch_device

filename = get_tests_dir() + "/test_data/fsmt/fsmt_val_data.json"
with io.open(filename, "r", encoding="utf-8") as f:
    bleu_data = json.load(f)


@require_torch
class ModelEvalTester(unittest.TestCase):
    def get_tokenizer(self, mname):
        return FSMTTokenizer.from_pretrained(mname)

    def get_model(self, mname):
        model = FSMTForConditionalGeneration.from_pretrained(mname).to(
            torch_device)
        if torch_device == "cuda":
            model.half()
        return model
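
A sketch of how these helpers typically get combined (assumptions: a real WMT19 checkpoint name, and the calculate_bleu helper imported above taking hypothesis and reference line lists):

mname = "facebook/wmt19-en-ru"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
batch = tokenizer(["Machine learning is great"], return_tensors="pt")
outputs = model.generate(**batch)
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
scores = calculate_bleu(decoded, ["Машинное обучение - это здорово"])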
Example #9

import unittest

from transformers import AlbertTokenizer, AlbertTokenizerFast
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow

from ...test_tokenization_common import TokenizerTesterMixin


SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")


@require_sentencepiece
@require_tokenizers
class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = AlbertTokenizer
    rust_tokenizer_class = AlbertTokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = True
    test_sentencepiece_ignore_case = True

    def setUp(self):
        super().setUp()

        # save a fixture tokenizer so the mixin's tests can reload it from tmpdirname
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)
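
Since test_rust_tokenizer is enabled above, the mixin compares slow and fast output; a sketch of that parity check (assuming the fast class can be built from the same SentencePiece file):

tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
rust_tokenizer = AlbertTokenizerFast(SAMPLE_VOCAB)
text = "I was born in 92000, and this is falsé."
assert tokenizer.tokenize(text) == rust_tokenizer.tokenize(text)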
Example #10
import os
from os.path import dirname

from transformers import is_torch_available
from transformers.testing_utils import (
    get_tests_dir,
    require_deepspeed,
    require_torch_gpu,
    slow,
)
from transformers.trainer_utils import set_seed

if is_torch_available():
    from tests.trainer.test_trainer import (  # noqa
        RegressionModelConfig,
        RegressionPreTrainedModel,
        get_regression_trainer,
    )

set_seed(42)

FIXTURE_DIRECTORY = get_tests_dir("fixtures")
ROOT_DIRECTORY = dirname(get_tests_dir())
DS_TESTS_DIRECTORY = dirname(os.path.abspath(__file__))

# default torch.distributed port
DEFAULT_MASTER_PORT = "10999"

T5_SMALL = "t5-small"

# *** Working Models ***
ALBERT_TINY = "hf-internal-testing/tiny-albert"
BART_TINY = "sshleifer/bart-tiny-random"
BERT_TINY = "hf-internal-testing/tiny-bert"
BIGBIRD_PEGASUS_TINY = "hf-internal-testing/tiny-random-bigbird_pegasus"
BIG_BIRD_TINY = "hf-internal-testing/tiny-random-big_bird"
BLENDERBOT_TINY = "hf-internal-testing/tiny-random-blenderbot"
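
These tiny checkpoints exist to keep CI fast; loading one is the usual hub pattern (sketch assumes network access to the Hugging Face Hub):

from transformers import AutoModel

model = AutoModel.from_pretrained(BERT_TINY)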
Example #11
import sys
import unittest
from pathlib import Path

from transformers import (
    CONFIG_MAPPING,
    FEATURE_EXTRACTOR_MAPPING,
    AutoConfig,
    AutoFeatureExtractor,
    Wav2Vec2Config,
    Wav2Vec2FeatureExtractor,
)
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, get_tests_dir

sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))

from test_module.custom_configuration import CustomConfig  # noqa E402
from test_module.custom_feature_extraction import CustomFeatureExtractor  # noqa E402

SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures")
SAMPLE_FEATURE_EXTRACTION_CONFIG = get_tests_dir(
    "fixtures/dummy_feature_extractor_config.json")
SAMPLE_CONFIG = get_tests_dir("fixtures/dummy-config.json")


class AutoFeatureExtractorTest(unittest.TestCase):
    def test_feature_extractor_from_model_shortcut(self):
        feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
        self.assertIsInstance(feature_extractor, Wav2Vec2FeatureExtractor)

    def test_feature_extractor_from_local_directory_from_key(self):
        feature_extractor = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
        self.assertIsInstance(feature_extractor, Wav2Vec2FeatureExtractor)
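
A save/reload round trip follows the same pattern (sketch; the fixtures directory above holds a dummy feature-extractor config):

import tempfile

feature_extractor = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
with tempfile.TemporaryDirectory() as tmpdirname:
    feature_extractor.save_pretrained(tmpdirname)
    reloaded = AutoFeatureExtractor.from_pretrained(tmpdirname)
assert isinstance(reloaded, Wav2Vec2FeatureExtractor)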