def test_tokenize_text_positive02(self):
    """Check that tokenization lowercases tokens when ``lowercase=True``.

    The input mixes Latin and Cyrillic words and a digit, separated by
    spaces, a tab, and line-break characters; the expected result is the
    list of those words in lowercase, in their original order.
    """
    source_text = 'a\t B c Мама мыла \n\r раму 1\n'
    expected_tokens = ['a', 'b', 'c', 'мама', 'мыла', 'раму', '1']
    self.assertEqual(
        Seq2SeqLSTM.tokenize_text(source_text, lowercase=True),
        expected_tokens,
    )
def test_tokenize_text_positive01(self):
    """Check that tokenization keeps letter case when ``lowercase=False``.

    The input mixes Latin and Cyrillic words and a digit, separated by
    spaces, a tab, and line-break characters; the expected result is the
    list of those words with their original capitalization, in order.
    """
    source_text = 'a\t B c Мама мыла \n\r раму 1\n'
    expected_tokens = ['a', 'B', 'c', 'Мама', 'мыла', 'раму', '1']
    self.assertEqual(
        Seq2SeqLSTM.tokenize_text(source_text, lowercase=False),
        expected_tokens,
    )