def test_word_tokenzie(self):
    """Tokenizing a clinical sentence keeps measurements, dates, and punctuation intact."""
    text = "A 2.1 cm tumor (right tongue) noted on 2013-11-11."
    expected = [
        "A", " ", "2.1", " ", "cm", " ", "tumor", " ",
        "(", "right", " ", "tongue", ")", " ",
        "noted", " ", "on", " ", "2013-11-11", ".",
    ]
    actual = list(nlptools.word_tokenize(text))
    self.assertEqual(actual, expected)
def normalize_sent(text):
    """Normalize the casing of a sentence and return it re-joined.

    If the sentence looks like a title (see ``is_title``), every token is
    normalized; otherwise only the first token is normalized and the rest
    are kept verbatim.
    """
    tokens = list(word_tokenize(text))
    if is_title(text):
        # Title-like sentence: normalize every token.
        return ''.join(normalize(tok) for tok in tokens)
    # Regular sentence: only the leading token needs normalizing.
    return ''.join([normalize(tokens[0])] + tokens[1:])
def is_title(text):
    """Return True when *text* looks like a title.

    A token is acceptable in a title when it is the first token, a stop
    word, or does not start with a lowercase ASCII letter. The text is a
    title only if every token is acceptable (an empty text counts as one).
    """
    return all(
        idx == 0
        or tok.lower() in stop_words
        or tok[0] not in string.ascii_lowercase
        for idx, tok in enumerate(word_tokenize(text))
    )
def test_word_tokenize_intergration(self):
    """Round-trip check: concatenating the tokens must reproduce each sentence."""
    for sent in self.sentences:
        rebuilt = ''.join(list(nlptools.word_tokenize(sent)))
        self.assertEqual(rebuilt, sent)
def test_word_tokenzie2(self):
    """Numeric forms (negatives, thousands separators, decimals) stay whole tokens."""
    text = '-999 1,234,000 3.1415'
    expected = [
        '-999', ' ',
        '1,234,000', ' ',
        '3.1415',
    ]
    self.assertEqual(list(nlptools.word_tokenize(text)), expected)
def test_word_tokenzie(self):
    """A clinical sentence tokenizes into words, spaces, parens, a date, and a period."""
    text = 'A 2.1 cm tumor (right tongue) noted on 2013-11-11.'
    expected = [
        'A', ' ', '2.1', ' ', 'cm', ' ', 'tumor', ' ',
        '(', 'right', ' ', 'tongue', ')', ' ',
        'noted', ' ', 'on', ' ', '2013-11-11', '.',
    ]
    result = list(nlptools.word_tokenize(text))
    self.assertEqual(result, expected)
def test_word_tokenize_intergration(self):
    """Tokenization must be lossless: joined tokens equal the input sentence."""
    for sentence in self.sentences:
        tokens = list(nlptools.word_tokenize(sentence))
        self.assertEqual("".join(tokens), sentence)
def test_word_tokenzie2(self):
    """Signed, comma-grouped, and decimal numbers each survive as a single token."""
    text = "-999 1,234,000 3.1415"
    expected = ["-999", " ", "1,234,000", " ", "3.1415"]
    tokens = list(nlptools.word_tokenize(text))
    self.assertEqual(tokens, expected)
def test_word_tokenzie(self):
    """A longer clinical sentence with two measurements tokenizes as expected."""
    text = (
        'A 2.1 x 3.3 cm tumor arising from the tongue base '
        '(right side) is noted.'
    )
    expected = [
        'A', ' ', '2.1', ' ', 'x', ' ', '3.3', ' ', 'cm', ' ',
        'tumor', ' ', 'arising', ' ', 'from', ' ', 'the', ' ',
        'tongue', ' ', 'base', ' ', '(', 'right', ' ', 'side', ')',
        ' ', 'is', ' ', 'noted', '.',
    ]
    self.assertEqual(list(nlptools.word_tokenize(text)), expected)