def test_byte_tensors_error_code(self):
    """Numberizing the sample rows must terminate with exit code 1.

    The tensorizer is built with both BOS and EOS tokens enabled, and the
    test expects ``numberize`` to raise ``SystemExit`` while processing the
    rows below.
    """
    # NOTE(review): presumably the '#' / '^' characters in the samples are
    # what trigger the exit -- confirm against ByteTensorizer.numberize.
    sample_texts = (
        "I want some coffee#",
        "This is ^the best show I've ever seen",
    )
    byte_tensorizer = ByteTensorizer(
        text_column="text",
        lower=False,
        add_bos_token=True,
        add_eos_token=True,
    )

    with self.assertRaises(SystemExit) as raised:
        for text in sample_texts:
            byte_tensorizer.numberize({"text": text})

    self.assertEqual(raised.exception.code, 1)
def test_create_byte_tensors(self):
    """Check that ``numberize`` returns ``(byte values, length)`` per row.

    Each expected entry is the list of integer byte values produced by
    ``str.encode()`` (UTF-8 by default) paired with its length. The samples
    include a non-ASCII string, so the byte count differs from the
    character count for that row.
    """
    tensorizer = ByteTensorizer(text_column="text", lower=False)
    # not initializing because initializing is a no-op for ByteTensorizer

    texts = ["I want some coffee", "Turn it up", "我不会说中文"]
    rows = [{"text": text} for text in texts]
    expected = [list(text.encode()) for text in texts]

    tensors = [tensorizer.numberize(row) for row in rows]

    # The loop variable is deliberately not called `bytes`, which would
    # shadow the builtin type inside the comprehension.
    self.assertEqual(
        [(byte_values, len(byte_values)) for byte_values in expected],
        tensors,
    )
def test_create_byte_tensors(self):
    """Verify ``numberize`` output for two ASCII sentences.

    For every row the test unpacks the result into ``(chars, seq_len)`` and
    checks that ``chars`` equals the code points of the sentence and that
    both ``len(chars)`` and ``seq_len`` equal the sentence length.
    """
    # NOTE(review): another test with this exact name appears earlier in the
    # visible source; if both live in the same class, this definition
    # replaces the earlier one -- confirm they belong to different classes.

    # not initializing because initializing is a no-op for ByteTensorizer
    byte_tensorizer = ByteTensorizer(column="text", lower=False)

    sentences = ["I want some coffee", "Turn it up"]
    rows = [{"text": types.Text(sentence)} for sentence in sentences]
    expected = [[ord(ch) for ch in sentence] for sentence in sentences]

    # Rows are numberized one at a time, each immediately followed by its
    # assertions, so a failure points at the first offending row.
    for sentence, row, expected_chars in zip(sentences, rows, expected):
        chars, seq_len = byte_tensorizer.numberize(row)
        self.assertEqual(len(sentence), len(chars))
        self.assertEqual(expected_chars, chars)
        self.assertEqual(len(sentence), seq_len)