class TestDecode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.decode`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Set up both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharDictTokenizer.decode),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    ),
                    inspect.Parameter(
                        name='token_ids',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=Iterable[int],
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='remove_special_tokens',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=bool,
                        default=False
                    )
                ],
                return_annotation=str
            ),
            msg=msg
        )

    def test_invalid_input_token_ids(self):
        r"""Raise `TypeError` when input `token_ids` is invalid."""
        msg1 = 'Must raise `TypeError` when input `token_ids` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, object(), lambda x: x, type, None,
            NotImplemented, ...,
            [0.0], [1.0], [math.nan], [-math.nan], [math.inf], [-math.inf],
            [0j], [1j], [''], [b''], [()], [[]], [{}], [set()], [object()],
            [lambda x: x], [type], [None], [NotImplemented], [...],
            [0, 0.0], [0, 1.0], [0, math.nan], [0, -math.nan], [0, math.inf],
            [0, -math.inf], [0, 0j], [0, 1j], [0, ''], [0, b''], [0, ()],
            [0, []], [0, {}], [0, set()], [0, object()], [0, lambda x: x],
            [0, type], [0, None], [0, NotImplemented], [0, ...],
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.decode(token_ids=invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`token_ids` must be an instance of `Iterable[int]`.',
                    msg=msg2
                )

    def test_invalid_input_remove_special_tokens(self):
        r"""Raise `TypeError` when `remove_special_tokens` is invalid."""
        msg1 = (
            'Must raise `TypeError` when input `remove_special_tokens` is '
            'invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf,
            0j, 1j, '', b'', (), [], {}, set(), object(), lambda x: x, type,
            None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.decode(
                        token_ids=[],
                        remove_special_tokens=invalid_input
                    )

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`remove_special_tokens` must be an instance of `bool`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            [0, 1, 2, 3],
            [4, 5, 6, 7, 8, 9],
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [0],
            [],
        )

        for token_ids in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.decode(token_ids=token_ids),
                    str,
                    msg=msg
                )

    def test_remove_special_tokens(self):
        r"""Remove special tokens."""
        msg = 'Must remove special tokens.'
        examples = (
            (
                False,
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                '[bos]Hello World![eos][pad]',
                '[bos]hello world![eos][pad]',
            ),
            (
                False,
                [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
                '[bos]I am a [unk][eos][pad][pad]',
                '[bos]i am a [unk][eos][pad][pad]',
            ),
            (
                False,
                [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
                '[bos][unk]legend.[eos]',
                '[bos][unk]legend.[eos]',
            ),
            (
                True,
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                'Hello World!',
                'hello world!',
            ),
            (
                True,
                [0, 14, 5, 9, 15, 5, 9, 5, 3, 1, 2, 2],
                'I am a [unk]',
                'i am a [unk]',
            ),
            (
                True,
                [0, 19, 4, 6, 16, 6, 17, 8, 18, 1],
                '[unk]legend.',
                '[unk]legend.',
            ),
        )

        for (remove_special_tokens, token_ids, cased_sequence,
             uncased_sequence) in examples:
            self.assertEqual(
                self.cased_tokenizer.decode(
                    token_ids=token_ids,
                    remove_special_tokens=remove_special_tokens
                ),
                cased_sequence,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.decode(
                    token_ids=token_ids,
                    remove_special_tokens=remove_special_tokens
                ),
                uncased_sequence,
                msg=msg
            )
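
# For reference, a minimal standalone sketch of the decode behavior the cases
# above exercise. This is an illustration, not part of the test suite; the
# helper name `_decode_usage_sketch` is ours, and it assumes the id layout the
# test data implies when the vocabulary is built from `vocab_source`, i.e.
# `[bos]`=0, `[eos]`=1, `[pad]`=2, `[unk]`=3.
def _decode_usage_sketch():
    from lmp.tokenizer import CharDictTokenizer

    tokenizer = CharDictTokenizer()
    tokenizer.build_vocab(['Hello World!', 'I am a legend.'])

    ids = [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2]

    # Special tokens are kept by default...
    print(tokenizer.decode(token_ids=ids))
    # -> '[bos]Hello World![eos][pad]'

    # ...and stripped when `remove_special_tokens=True`.
    print(tokenizer.decode(token_ids=ids, remove_special_tokens=True))
    # -> 'Hello World!'
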
class TestTokenize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.tokenize`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Set up both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharDictTokenizer.tokenize),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty
                    )
                ],
                return_annotation=List[str]
            ),
            msg=msg
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, b'', 0j, 1j, (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.tokenize(invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `List[str]`."""
        msg = 'Must return `List[str]`.'
        examples = (
            'Hello world!',
            'H',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                tokens = tokenizer.tokenize(sequence)
                self.assertIsInstance(tokens, list, msg=msg)
                for token in tokens:
                    self.assertIsInstance(token, str, msg=msg)

    def test_normalize(self):
        r"""Returned sequence is normalized."""
        msg = 'Returned sequence must be normalized.'
        examples = (
            (
                ' HeLlO WoRlD!',
                ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!'],
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!'],
            ),
            (
                'HeLlO WoRlD! ',
                ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!'],
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!'],
            ),
            (
                ' HeLlO WoRlD! ',
                ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!'],
                ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!'],
            ),
            (
                '0',
                ['0'],
                ['0'],
            ),
            (
                'é',
                [unicodedata.normalize('NFKC', 'é')],
                [unicodedata.normalize('NFKC', 'é')],
            ),
            (
                '0é',
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
                [
                    unicodedata.normalize('NFKC', '0'),
                    unicodedata.normalize('NFKC', 'é'),
                ],
            ),
            (
                '',
                [],
                [],
            ),
        )

        for sequence, cased_tokens, uncased_tokens in examples:
            self.assertEqual(
                self.cased_tokenizer.tokenize(sequence),
                cased_tokens,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.tokenize(sequence),
                uncased_tokens,
                msg=msg
            )
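
# For reference, a minimal standalone sketch of the tokenize behavior the
# cases above exercise. This is an illustration, not part of the test suite;
# the helper name `_tokenize_usage_sketch` is ours. Per the test data,
# sequences are NFKC-normalized, stripped of surrounding whitespace, split
# into individual characters, and lowercased when `is_uncased=True`.
def _tokenize_usage_sketch():
    from lmp.tokenizer import CharDictTokenizer

    cased_tokenizer = CharDictTokenizer()
    uncased_tokenizer = CharDictTokenizer(is_uncased=True)

    # Leading/trailing whitespace is stripped before splitting.
    print(cased_tokenizer.tokenize(' HeLlO WoRlD! '))
    # -> ['H', 'e', 'L', 'l', 'O', ' ', 'W', 'o', 'R', 'l', 'D', '!']

    # The uncased tokenizer additionally lowercases.
    print(uncased_tokenizer.tokenize(' HeLlO WoRlD! '))
    # -> ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
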
class TestEncode(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.encode`."""

    @classmethod
    def setUpClass(cls):
        cls.vocab_source = [
            'Hello World!',
            'I am a legend.',
        ]

    @classmethod
    def tearDownClass(cls):
        del cls.vocab_source
        gc.collect()

    def setUp(self):
        r"""Set up both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.cased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.uncased_tokenizer.build_vocab(self.__class__.vocab_source)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        self.assertEqual(
            inspect.signature(CharDictTokenizer.encode),
            inspect.Signature(
                parameters=[
                    inspect.Parameter(
                        name='self',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    ),
                    inspect.Parameter(
                        name='sequence',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=str,
                        default=inspect.Parameter.empty
                    ),
                    inspect.Parameter(
                        name='max_seq_len',
                        kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                        annotation=int,
                        default=-1
                    )
                ],
                return_annotation=List[int]
            ),
            msg=msg
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as ctx_man:
                    tokenizer.encode(sequence=invalid_input)

                self.assertEqual(
                    ctx_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2
                )

    def test_invalid_input_max_seq_len(self):
        r"""Raise exception when input `max_seq_len` is invalid."""
        msg1 = (
            'Must raise `TypeError` or `ValueError` when input '
            '`max_seq_len` is invalid.'
        )
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -2, 0.0, 1.0, math.nan, -math.nan, math.inf,
            -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
            lambda x: x, type, None, NotImplemented, ...,
        )

        for invalid_input in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(
                        (TypeError, ValueError), msg=msg1) as ctx_man:
                    tokenizer.encode(sequence='', max_seq_len=invalid_input)

                if isinstance(ctx_man.exception, TypeError):
                    self.assertEqual(
                        ctx_man.exception.args[0],
                        '`max_seq_len` must be an instance of `int`.',
                        msg=msg2
                    )
                else:
                    self.assertEqual(
                        ctx_man.exception.args[0],
                        '`max_seq_len` must be greater than `1` or equal to '
                        '`-1`.',
                        msg=msg2
                    )

    def test_return_type(self):
        r"""Return `List[int]`."""
        msg = 'Must return `List[int]`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                token_ids = tokenizer.encode(sequence=sequence)
                self.assertIsInstance(token_ids, list, msg=msg)
                for token_id in token_ids:
                    self.assertIsInstance(token_id, int, msg=msg)

    def test_encode_format(self):
        r"""Follow encode format."""
        msg = 'Must follow encode format: [bos] t1 t2 ... tn [eos].'
        examples = (
            (
                'Hello World!',
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1],
            ),
            (
                'I am a legend.',
                [0, 14, 5, 9, 15, 5, 9, 5, 4, 6, 16, 6, 17, 8, 18, 1],
            ),
            (
                'y = f(x)',
                [0, 3, 5, 3, 5, 3, 3, 3, 3, 1],
            ),
            (
                '',
                [0, 1],
            ),
        )

        for sequence, token_ids in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(sequence=sequence),
                    token_ids,
                    msg=msg
                )

    def test_truncate(self):
        r"""Token ids' length must not exceed `max_seq_len`."""
        msg = 'Token ids\' length must not exceed `max_seq_len`.'
        examples = (
            (
                'Hello World!',
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 1],
                10,
            ),
            (
                'I am a legend.',
                [0, 14, 5, 9, 1],
                5,
            ),
            (
                'y = f(x)',
                [0, 3, 1],
                3,
            ),
            (
                '',
                [0, 1],
                2,
            ),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(
                        sequence=sequence,
                        max_seq_len=max_seq_len
                    ),
                    token_ids,
                    msg=msg
                )

    def test_padding(self):
        r"""Token ids' length must be padded to `max_seq_len`."""
        msg = 'Token ids\' length must be padded to `max_seq_len`.'
        examples = (
            (
                'Hello World!',
                [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2],
                15,
            ),
            (
                'I am a legend.',
                [
                    0, 14, 5, 9, 15, 5, 9, 5, 4, 6, 16, 6, 17, 8, 18, 1,
                    2, 2, 2, 2,
                ],
                20,
            ),
            (
                'y = f(x)',
                [
                    0, 3, 5, 3, 5, 3, 3, 3, 3, 1,
                    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                ],
                20,
            ),
            (
                '',
                [0, 1, 2, 2, 2, 2, 2, 2, 2, 2],
                10,
            ),
        )

        for sequence, token_ids, max_seq_len in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.encode(
                        sequence=sequence,
                        max_seq_len=max_seq_len
                    ),
                    token_ids,
                    msg=msg
                )
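
# For reference, a minimal standalone sketch of the encode format the cases
# above exercise. This is an illustration, not part of the test suite; the
# helper name `_encode_usage_sketch` is ours, and the ids shown assume the
# layout implied by the test data (`[bos]`=0, `[eos]`=1, `[pad]`=2,
# `[unk]`=3). Output follows `[bos] t1 t2 ... tn [eos]`, truncated or padded
# to `max_seq_len` when one is given.
def _encode_usage_sketch():
    from lmp.tokenizer import CharDictTokenizer

    tokenizer = CharDictTokenizer()
    tokenizer.build_vocab(['Hello World!', 'I am a legend.'])

    # Default `max_seq_len=-1`: just `[bos]`, the characters, `[eos]`.
    print(tokenizer.encode(sequence='Hello World!'))
    # -> [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1]

    # Truncate to 10 ids, keeping the trailing `[eos]`.
    print(tokenizer.encode(sequence='Hello World!', max_seq_len=10))
    # -> [0, 10, 6, 4, 4, 7, 5, 11, 7, 1]

    # Pad with `[pad]`=2 up to 15 ids.
    print(tokenizer.encode(sequence='Hello World!', max_seq_len=15))
    # -> [0, 10, 6, 4, 4, 7, 5, 11, 7, 12, 4, 8, 13, 1, 2]
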
class TestVocabSize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.CharDictTokenizer.vocab_size`."""

    def setUp(self):
        r"""Set up both cased and uncased tokenizer instances."""
        self.cased_tokenizer = CharDictTokenizer()
        self.uncased_tokenizer = CharDictTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent property signature.'

        self.assertTrue(
            inspect.isdatadescriptor(CharDictTokenizer.vocab_size),
            msg=msg
        )
        self.assertFalse(
            inspect.isfunction(CharDictTokenizer.vocab_size),
            msg=msg
        )
        self.assertFalse(
            inspect.ismethod(CharDictTokenizer.vocab_size),
            msg=msg
        )

    def test_return_type(self):
        r"""Return `int`."""
        msg = 'Must return `int`.'

        for tokenizer in self.tokenizers:
            self.assertIsInstance(tokenizer.vocab_size, int, msg=msg)

    def test_return_value(self):
        r"""Return vocabulary size."""
        msg = 'Inconsistent vocabulary size.'

        for tokenizer in self.tokenizers:
            self.assertEqual(tokenizer.vocab_size, 4, msg=msg)

    def test_increase_vocab_size(self):
        r"""Increase vocabulary size after `build_vocab`."""
        msg = 'Must increase vocabulary size after `build_vocab`.'
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )
        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg
            )
            self.uncased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg
            )

    def test_reset_vocab_size(self):
        r"""Reset vocabulary size after `reset_vocab`."""
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )
        sp_tokens_size = len(list(CharDictTokenizer.special_tokens()))

        for batch_sequences in examples:
            for tokenizer in self.tokenizers:
                tokenizer.build_vocab(batch_sequences)
                tokenizer.reset_vocab()
                self.assertEqual(
                    tokenizer.vocab_size,
                    sp_tokens_size,
                    msg=msg
                )
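
# For reference, a minimal standalone sketch of the vocab_size behavior the
# cases above exercise. This is an illustration, not part of the test suite;
# the helper name `_vocab_size_usage_sketch` is ours. A fresh tokenizer holds
# only the special tokens, `build_vocab` grows the count (the cased tokenizer
# grows faster than the uncased one), and `reset_vocab` shrinks it back to
# the special tokens alone.
def _vocab_size_usage_sketch():
    from lmp.tokenizer import CharDictTokenizer

    tokenizer = CharDictTokenizer()
    print(tokenizer.vocab_size)  # -> 4, the special tokens only.

    tokenizer.build_vocab(['HeLlO WoRlD!', 'I aM a LeGeNd.'])
    print(tokenizer.vocab_size)  # -> 22: 18 characters + 4 special tokens.

    tokenizer.reset_vocab()
    print(tokenizer.vocab_size)  # -> 4 again.
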