예제 #1
0
 def test_word_tokenzie(self):
     # A clinical-style sentence: decimals, a date, and parentheses must
     # each come out as their own token, with whitespace preserved.
     text = "A 2.1 cm tumor (right tongue) noted on 2013-11-11."
     expected = [
         "A", " ", "2.1", " ", "cm", " ", "tumor", " ",
         "(", "right", " ", "tongue", ")", " ",
         "noted", " ", "on", " ", "2013-11-11", ".",
     ]
     self.assertEqual(list(nlptools.word_tokenize(text)), expected)
예제 #2
0
파일: corpus.py 프로젝트: jeroyang/txttk
def normalize_sent(text):
    """Normalize the casing of a sentence.

    If the whole sentence looks like a title (per ``is_title``), every
    token is normalized; otherwise only the leading token is normalized
    and the rest of the sentence is kept verbatim.

    Args:
        text: The sentence to normalize.

    Returns:
        The normalized sentence as a single string.
    """
    tokens = list(word_tokenize(text))
    # Guard: an empty/whitespace-only input used to raise IndexError
    # on tokens[0] below.
    if not tokens:
        return ''
    if is_title(text):
        return ''.join(normalize(token) for token in tokens)
    # Only the first token is normalized; the remainder (including the
    # separating whitespace tokens) is reassembled unchanged.
    return ''.join([normalize(tokens[0])] + tokens[1:])
예제 #3
0
파일: corpus.py 프로젝트: jeroyang/txttk
def is_title(text):
    """Return True if *text* looks like a title.

    A sentence counts as a title when every token after the first is
    either a stop word or does not start with a lowercase ASCII letter
    (e.g. it is capitalized, numeric, whitespace, or punctuation).

    Args:
        text: The sentence to inspect.

    Returns:
        bool: True for titles (including the empty string), else False.
    """
    for i, token in enumerate(word_tokenize(text)):
        if i == 0:
            # The first token may be anything (titles start capitalized
            # anyway, and we don't want to penalize it).
            continue
        if token.lower() in stop_words:
            continue
        if token[0] not in string.ascii_lowercase:
            continue
        # Found a lowercase non-stop-word token: short-circuit instead
        # of building a full boolean list and all()-ing it afterwards.
        return False
    return True
예제 #4
0
 def test_word_tokenize_intergration(self):
     # Round trip: joining the tokens must reproduce each input exactly,
     # i.e. the tokenizer is lossless.
     for sentence in self.sentences:
         rebuilt = ''.join(nlptools.word_tokenize(sentence))
         self.assertEqual(rebuilt, sentence)
예제 #5
0
 def test_word_tokenzie2(self):
     # Numeric forms — signed integers, thousands separators, decimals —
     # must each survive as a single token.
     text = '-999 1,234,000 3.1415'
     expected = [
         '-999',
         ' ',
         '1,234,000',
         ' ',
         '3.1415',
     ]
     self.assertEqual(list(nlptools.word_tokenize(text)), expected)
예제 #6
0
 def test_word_tokenzie(self):
     # Decimals, a hyphenated date and parentheses are individual tokens;
     # whitespace tokens are kept between words.
     text = 'A 2.1 cm tumor (right tongue) noted on 2013-11-11.'
     expected = [
         'A', ' ', '2.1', ' ', 'cm', ' ', 'tumor', ' ',
         '(', 'right', ' ', 'tongue', ')', ' ',
         'noted', ' ', 'on', ' ', '2013-11-11', '.',
     ]
     self.assertEqual(list(nlptools.word_tokenize(text)), expected)
예제 #7
0
 def test_word_tokenize_intergration(self):
     # Lossless tokenization: concatenating the tokens of every sample
     # sentence must yield the original sentence back.
     for original in self.sentences:
         self.assertEqual("".join(nlptools.word_tokenize(original)), original)
예제 #8
0
 def test_word_tokenzie2(self):
     # Numbers with signs, commas and decimal points stay intact as
     # single tokens, separated by whitespace tokens.
     text = "-999 1,234,000 3.1415"
     expected = [
         "-999",
         " ",
         "1,234,000",
         " ",
         "3.1415",
     ]
     self.assertEqual(list(nlptools.word_tokenize(text)), expected)
예제 #9
0
 def test_word_tokenzie(self):
     # Longer sentence with two measurements and a parenthesized phrase;
     # every word, number, space and punctuation mark is its own token.
     text = 'A 2.1 x 3.3 cm tumor arising from the tongue base (right side) is noted.'
     expected = [
         'A', ' ', '2.1', ' ', 'x', ' ', '3.3', ' ', 'cm', ' ',
         'tumor', ' ', 'arising', ' ', 'from', ' ', 'the', ' ',
         'tongue', ' ', 'base', ' ',
         '(', 'right', ' ', 'side', ')', ' ',
         'is', ' ', 'noted', '.',
     ]
     self.assertEqual(list(nlptools.word_tokenize(text)), expected)