Example #1
0
 def test_tokenizer(self):
     """Verify English whitespace tokenization splits words and punctuation."""
     sentence = "test is it."
     tokens = whitespace_tokenize(sentence)
     print(tokens)
     # Trailing punctuation is split off as its own token.
     assert tokens == ['test', 'is', 'it', '.']
     expected = [
         'This', 'is', 'a', 'test', 'of', 'the', 'word', 'parser', '.',
         'It', 'should', 'work', 'correctly', '!!!'
     ]
     actual = whitespace_tokenize(
         'This is a test of the word parser. It should work correctly!!!'
     )
     self.assertEqual(actual, expected)
Example #2
0
 def correct(self, text):
     """
     Return the most probable spelling correction for ``text``.

     Tokenizes the input on whitespace and corrects every token longer
     than one character; single-character tokens are passed through
     unchanged.

     :param text: input string to correct
     :return: list of (possibly corrected) tokens
     """
     self.check_init()
     corrected = []
     for token in whitespace_tokenize(text):
         if len(token) > 1:
             corrected.append(self.correct_word(token))
         else:
             corrected.append(token)
     return corrected
Example #3
0
def get_word_freq_dict_from_text(text):
    """Return a Counter mapping each whitespace token of ``text`` to its frequency."""
    tokens = whitespace_tokenize(text)
    return Counter(tokens)