    def test_yield_value(self):
        r"""Return an iterator which yields `str`."""
        msg = 'Must return an iterator which yields `str`.'
        examples = ('[bos]', '[eos]', '[pad]', '[unk]')

        self.assertIsInstance(
            CharListTokenizer.special_tokens(),
            Iterator,
            msg=msg
        )

        out_tokens = list(CharListTokenizer.special_tokens())

        for i, ans_token in enumerate(examples):
            self.assertIsInstance(out_tokens[i], str, msg=msg)
            self.assertEqual(out_tokens[i], ans_token, msg=msg)
    def test_cased_sensitive(self):
        r"""Vocabulary must be case sensitive."""
        msg = 'Vocabulary must be case sensitive.'
        examples = (
            (('ABCD', 'abcd'), 8, 4),
            (('efghi', 'EFGHI'), 10, 5),
        )
        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.reset_vocab()
            self.cased_tokenizer.build_vocab(batch_sequences=batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg
            )

            self.uncased_tokenizer.reset_vocab()
            self.uncased_tokenizer.build_vocab(batch_sequences=batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg
            )
    def test_reset_vocab_size(self):
        r"""Reset vocabulary size after `reset_vocab`."""
        msg = 'Must reset vocabulary size after `reset_vocab`.'
        examples = (
            ('HeLlO WoRlD!', 'I aM a LeGeNd.'),
            ('y = f(x)',),
            ('',),
        )
        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences in examples:
            for tokenizer in self.tokenizers:
                tokenizer.build_vocab(batch_sequences)
                tokenizer.reset_vocab()
                self.assertEqual(
                    tokenizer.vocab_size,
                    sp_tokens_size,
                    msg=msg
                )
    def test_increase_vocab_size(self):
        r"""Increase vocabulary size after `build_vocab`."""
        msg = 'Must increase vocabulary size after `build_vocab`.'
        # `build_vocab` is cumulative: no reset is performed between
        # examples, so the expected sizes carry over characters seen in
        # earlier batches (the empty batch adds nothing, hence 24 and 21
        # repeat).
        examples = (
            (('HeLlO WoRlD!', 'I aM a LeGeNd.'), 18, 15),
            (('y = f(x)',), 24, 21),
            (('',), 24, 21),
        )
        sp_tokens_size = len(list(CharListTokenizer.special_tokens()))

        for batch_sequences, cased_vocab_size, uncased_vocab_size in examples:
            self.cased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.cased_tokenizer.vocab_size,
                cased_vocab_size + sp_tokens_size,
                msg=msg
            )

            self.uncased_tokenizer.build_vocab(batch_sequences)
            self.assertEqual(
                self.uncased_tokenizer.vocab_size,
                uncased_vocab_size + sp_tokens_size,
                msg=msg
            )
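
# The four tests above pin down the interface they exercise: a fixed,
# ordered `special_tokens` iterator, case-sensitive vs. case-insensitive
# vocabularies, `reset_vocab` shrinking back to the special tokens, and
# cumulative growth through `build_vocab`. The class below is a minimal
# sketch of a tokenizer satisfying those behaviours; it is illustrative
# only, not the project's `CharListTokenizer` (the `is_uncased` flag and
# the dict-based vocabulary are assumptions).
class _SketchCharTokenizer:
    def __init__(self, is_uncased: bool = False):
        self.is_uncased = is_uncased
        self.reset_vocab()

    @classmethod
    def special_tokens(cls):
        # Yield special tokens one at a time, in a fixed order.
        yield from ('[bos]', '[eos]', '[pad]', '[unk]')

    def reset_vocab(self):
        # Restart the vocabulary with only the special tokens.
        self.token2id = {t: i for i, t in enumerate(self.special_tokens())}

    def build_vocab(self, batch_sequences):
        # Each character is a token; fold case first when uncased.
        # Building is cumulative: repeated calls only add unseen characters.
        for sequence in batch_sequences:
            if self.is_uncased:
                sequence = sequence.lower()
            for char in sequence:
                if char not in self.token2id:
                    self.token2id[char] = len(self.token2id)

    @property
    def vocab_size(self):
        return len(self.token2id)

# Under this sketch, a `setUp` for the tests above might read (hypothetical):
#
#     def setUp(self):
#         self.cased_tokenizer = _SketchCharTokenizer(is_uncased=False)
#         self.uncased_tokenizer = _SketchCharTokenizer(is_uncased=True)
#         self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]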