def test_invalid_input_experiment(self):
    r"""Raise exception when input `experiment` is invalid."""
    msg1 = (
        'Must raise `TypeError` or `ValueError` when input `experiment` '
        'is invalid.'
    )
    msg2 = 'Inconsistent error message.'
    # Non-`str` values must trigger `TypeError`; the empty string is the
    # one `str` value that must trigger `ValueError` instead.
    examples = (
        False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf,
        -math.inf, 0j, 1j, '', b'', (), [], {}, set(), object(),
        lambda x: x, type, None, NotImplemented, ...,
    )

    for bad_experiment in examples:
        with self.assertRaises(
                (TypeError, ValueError),
                msg=msg1
        ) as ctx_man:
            BaseListTokenizer.load(experiment=bad_experiment)

        # Pick the expected message based on which exception was raised.
        if isinstance(ctx_man.exception, TypeError):
            expected = '`experiment` must be an instance of `str`.'
        else:
            expected = '`experiment` must not be empty.'

        self.assertEqual(ctx_man.exception.args[0], expected, msg=msg2)
def test_yield_value(self):
    r"""Return iterator which yield `str`."""
    msg = 'Must return iterator which yield `str`.'
    examples = ('[bos]', '[eos]', '[pad]', '[unk]')

    self.assertIsInstance(
        BaseListTokenizer.special_tokens(),
        Iterator,
        msg=msg
    )

    out_tokens = list(BaseListTokenizer.special_tokens())

    # Fix: the original loop only inspected the first `len(examples)`
    # yielded items, so an iterator yielding extra trailing tokens would
    # silently pass. Pin the exact count as well.
    self.assertEqual(len(out_tokens), len(examples), msg=msg)

    for out_token, ans_token in zip(out_tokens, examples):
        self.assertIsInstance(out_token, str, msg=msg)
        self.assertEqual(out_token, ans_token, msg=msg)
def test_experiment_does_not_exist(self):
    r"""Raise `FileNotFoundError` when `experiment` does not exist."""
    msg1 = (
        'Must raise `FileNotFoundError` when `experiment` does not exist.'
    )
    msg2 = 'Inconsistent error message.'
    # The class-level experiment name has no `tokenizer.json` on disk, so
    # it must fail the same way an arbitrary bogus name does.
    examples = (self.__class__.experiment, 'I-AM-A-TEST-AND-I-DONT-EXIST')

    for experiment in examples:
        with self.assertRaises(FileNotFoundError, msg=msg1) as ctx_man:
            BaseListTokenizer.load(experiment=experiment)

        test_path = os.path.join(DATA_PATH, experiment, 'tokenizer.json')
        self.assertEqual(
            ctx_man.exception.args[0],
            f'File {test_path} does not exist.',
            msg=msg2
        )
def test_load_result(self):
    r"""Load `tokenizer.json`."""
    msg = 'Inconsistent `tokenizer.json` format.'
    # One cased and one uncased serialized tokenizer state.
    examples = (
        {
            'is_uncased': False,
            'token_to_id': {'A': 0, 'B': 1, 'C': 2},
        },
        {
            'is_uncased': True,
            'token_to_id': {'a': 0, 'b': 1, 'c': 2},
        },
    )
    test_path = os.path.join(self.__class__.test_dir, 'tokenizer.json')

    for obj in examples:
        try:
            # Write the fixture file that `load` is expected to read.
            with open(test_path, 'w', encoding='utf-8') as output_file:
                json.dump(obj, output_file)

            tokenizer = BaseListTokenizer.load(
                experiment=self.__class__.experiment
            )
            self.assertIsInstance(tokenizer, BaseListTokenizer, msg=msg)

            # Every serialized key must round-trip onto an attribute of
            # the same name, type and value.
            for attr_key, attr_value in obj.items():
                self.assertTrue(hasattr(tokenizer, attr_key), msg=msg)
                loaded_value = getattr(tokenizer, attr_key)
                self.assertIsInstance(
                    loaded_value,
                    type(attr_value),
                    msg=msg
                )
                self.assertEqual(loaded_value, attr_value, msg=msg)
        finally:
            # Clean up test file even when assertions fail.
            os.remove(test_path)
def test_abstract_method(self):
    r"""Raise `NotImplementedError` when subclass did not implement."""
    msg1 = (
        'Must raise `NotImplementedError` when subclass did not implement.'
    )
    msg2 = 'Inconsistent error message.'
    examples = (True, False)

    # `encode` depends on the abstract `tokenize`, so calling it on the
    # base class must fail regardless of casing mode.
    for is_uncased in examples:
        with self.assertRaises(NotImplementedError, msg=msg1) as ctx_man:
            BaseListTokenizer(is_uncased=is_uncased).encode('')

        self.assertEqual(
            ctx_man.exception.args[0],
            'In class `BaseListTokenizer`: '
            'method `tokenize` not implemented yet.',
            msg=msg2
        )
def test_invalid_input_is_uncased(self):
    r"""Raise `TypeError` when input `is_uncased` is invalid."""
    msg1 = 'Must raise `TypeError` when input `is_uncased` is invalid.'
    msg2 = 'Inconsistent error message.'
    # Everything except `bool` must be rejected (note: `True`/`False`
    # are deliberately absent from this list).
    examples = (
        0, 1, -1, 0.0, 1.0, math.nan, -math.nan, math.inf, -math.inf,
        0j, 1j, '', b'', (), [], {}, set(), object(), lambda x: x,
        type, None, NotImplemented, ...,
    )

    for bad_value in examples:
        with self.assertRaises(TypeError, msg=msg1) as ctx_man:
            BaseListTokenizer(is_uncased=bad_value)

        self.assertEqual(
            ctx_man.exception.args[0],
            '`is_uncased` must be an instance of `bool`.',
            msg=msg2
        )
def setUp(self):
    r"""Setup both cased and uncased tokenizer instances."""
    # One tokenizer per casing mode; keep both in a list for tests that
    # apply to either mode.
    self.tokenizers = [
        BaseListTokenizer(),
        BaseListTokenizer(is_uncased=True),
    ]
    self.cased_tokenizer, self.uncased_tokenizer = self.tokenizers
class TestNormalize(unittest.TestCase):
    r"""Test case for `lmp.tokenizer.BaseListTokenizer.normalize`."""

    def setUp(self):
        r"""Setup both cased and uncased tokenizer instances."""
        self.cased_tokenizer = BaseListTokenizer()
        self.uncased_tokenizer = BaseListTokenizer(is_uncased=True)
        self.tokenizers = [self.cased_tokenizer, self.uncased_tokenizer]

    def tearDown(self):
        r"""Delete both cased and uncased tokenizer instances."""
        del self.tokenizers
        del self.cased_tokenizer
        del self.uncased_tokenizer
        gc.collect()

    def test_signature(self):
        r"""Ensure signature consistency."""
        msg = 'Inconsistent method signature.'

        # `normalize` must accept exactly `(self, sequence: str) -> str`.
        expected_signature = inspect.Signature(
            parameters=[
                inspect.Parameter(
                    name='self',
                    kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    default=inspect.Parameter.empty
                ),
                inspect.Parameter(
                    name='sequence',
                    kind=inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    annotation=str,
                    default=inspect.Parameter.empty
                ),
            ],
            return_annotation=str
        )
        self.assertEqual(
            inspect.signature(BaseListTokenizer.normalize),
            expected_signature,
            msg=msg
        )

    def test_invalid_input_sequence(self):
        r"""Raise `TypeError` when input `sequence` is invalid."""
        msg1 = 'Must raise `TypeError` when input `sequence` is invalid.'
        msg2 = 'Inconsistent error message.'
        examples = (
            False, True, 0, 1, -1, 0.0, 1.0, math.nan, -math.nan,
            math.inf, -math.inf, 0j, 1j, b'', (), [], {}, set(),
            object(), lambda x: x, type, None, NotImplemented, ...,
        )

        for bad_sequence in examples:
            for tokenizer in self.tokenizers:
                with self.assertRaises(TypeError, msg=msg1) as cxt_man:
                    tokenizer.normalize(sequence=bad_sequence)

                self.assertEqual(
                    cxt_man.exception.args[0],
                    '`sequence` must be an instance of `str`.',
                    msg=msg2
                )

    def test_return_type(self):
        r"""Return `str`."""
        msg = 'Must return `str`.'
        examples = (
            'Hello world!',
            'I am a legend.',
            'y = f(x)',
            '',
        )

        for sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertIsInstance(
                    tokenizer.normalize(sequence=sequence),
                    str,
                    msg=msg
                )

    def test_unicode_normalize(self):
        r"""Return NFKC normalized characters."""
        msg = 'Must return NFKC normalized characters.'
        # NOTE(review): these fixtures originally contained decomposed /
        # compatibility code points that may have been composed in transit;
        # confirm the exact code points against version control.
        examples = (
            ('0', '0', 1),
            ('é', 'é', 1),
            ('0é', '0é', 2),
        )

        for sequence, normalized_sequence, sequence_len in examples:
            for tokenizer in self.tokenizers:
                out_sequence = tokenizer.normalize(sequence=sequence)
                self.assertEqual(out_sequence, normalized_sequence, msg=msg)
                self.assertEqual(len(out_sequence), sequence_len, msg=msg)

    def test_cased_sensitive(self):
        r"""Return cased sensitive sequence."""
        msg = 'Return sequence must be cased sensitive.'
        examples = (
            ('HeLlO WoRlD!', 'HeLlO WoRlD!', 'hello world!'),
            ('HELLO WORLD!', 'HELLO WORLD!', 'hello world!'),
            ('hello world!', 'hello world!', 'hello world!'),
            ('H', 'H', 'h'),
            ('h', 'h', 'h'),
        )

        # Cased tokenizer preserves case; uncased tokenizer lowercases.
        for sequence, cased_sequence, uncased_sequence in examples:
            self.assertEqual(
                self.cased_tokenizer.normalize(sequence),
                cased_sequence,
                msg=msg
            )
            self.assertEqual(
                self.uncased_tokenizer.normalize(sequence),
                uncased_sequence,
                msg=msg
            )

    def test_whitespace_strip(self):
        r"""Strip input sequence."""
        msg = 'Must strip both leading and trailing whitespace characters.'
        # NOTE(review): runs of whitespace in these fixtures were collapsed
        # in transit; multi-space/newline variants reconstructed — confirm
        # against version control.
        examples = (
            (' hello world!', 'hello world!'),
            ('hello world! ', 'hello world!'),
            (' hello world! ', 'hello world!'),
            (' hello world! \n ', 'hello world!'),
            ('\nhello world!\n', 'hello world!'),
            (' ', ''),
            ('', ''),
        )

        for sequence, stripped_sequence in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    stripped_sequence,
                    msg=msg
                )

    def test_whitespace_collapse(self):
        r"""Collapse whitespace characters."""
        msg = ('Must convert consecutive whitespace characters into single '
               'whitespace character.')
        # NOTE(review): runs of whitespace in the input fixtures were
        # collapsed in transit; reconstructed with 2/3/4-space runs —
        # confirm against version control.
        examples = (
            ('hello  world  !', 'hello world !'),
            ('hello   world   !', 'hello world !'),
            ('hello    world    !', 'hello world !'),
            ('hello world\n\n!', 'hello world !'),
        )

        for sequence, ans_tokens in examples:
            for tokenizer in self.tokenizers:
                self.assertEqual(
                    tokenizer.normalize(sequence),
                    ans_tokens,
                    msg=msg
                )