def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.

    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:  # Korean: Kkma morphological analyzer
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:  # Japanese: MeCab
        words = mecab.parse(sent.encode('utf8')).split()
    elif lcode in ['th']:  # Thai: pythai
        words = pythai.split(sent)
    elif lcode in ['vi']:  # Vietnamese: ViTokenizer
        words = ViTokenizer.tokenize(sent).split()
    elif lcode in ['zh']:  # Chinese: jieba
        words = list(jieba.cut(sent, cut_all=False))
    # elif lcode in ['ar']:
    #     words = segmenter.segment(sent).split()
    else:  # Mostly European languages: whitespace is a reliable delimiter
        words = sent.split()
    return words
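# A minimal usage sketch, assuming the tokenizer objects above are
# initialised and that the caller sets the module-level `lcode` global
# before calling; the segmented Thai output shown is illustrative.
if __name__ == '__main__':
    lcode = 'th'
    print(word_segment(u'ฉันกินข้าว'))   # e.g. [u'ฉัน', u'กิน', u'ข้าว']
    lcode = 'en'
    print(word_segment(u'I eat rice'))   # ['I', 'eat', 'rice']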
def analyze_sentence(sentence, is_thai, dictionary):
    # Build an <se> (sentence) element with one <w> child per token; each
    # dictionary analysis becomes an <ana> child carrying lemma, POS,
    # translation and transliteration attributes.
    se = etree.Element(u'se')
    if is_thai:
        tokens = pythai.split(sentence)
        for token in tokens:
            if token:
                word = etree.Element(u'w')
                if token in dictionary:
                    for analysis_number in dictionary[token]:
                        analysis = dictionary[token][analysis_number]
                        ana = etree.Element(u'ana')
                        ana.attrib[u'lex'] = token
                        ana.attrib[u'pos'] = u','.join(analysis[1])
                        ana.attrib[u'trans'] = analysis[0]
                        ana.attrib[u'translit'] = analysis[2]
                        word.append(ana)
                word.text = token
                se.append(word)
    else:
        # Non-Thai text is normalised and kept as a single token with an
        # empty analysis.
        sentence = u' '.join(sentence.split())
        sentence = sentence.replace(u'\t', u'')
        sentence = sentence.replace(u'\r\n', u'')
        sentence = sentence.replace(u'\n', u'')
        if sentence:
            word = etree.Element(u'w')
            ana = etree.Element(u'ana')
            ana.attrib[u'lex'] = u''
            ana.attrib[u'pos'] = u''
            ana.attrib[u'trans'] = u''
            ana.attrib[u'translit'] = u''
            word.append(ana)
            word.text = sentence
            se.append(word)
    return se
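# A minimal usage sketch, assuming etree comes from lxml and that, as the
# attribute assignments above imply, dictionary entries map
# token -> {analysis_number: (translation, pos_tags, transliteration)}.
# The dictionary entry below is hypothetical.
from lxml import etree

dictionary = {u'ฉัน': {0: (u'I', [u'PRON'], u'chan')}}
se = analyze_sentence(u'ฉันกินข้าว', is_thai=True, dictionary=dictionary)
print(etree.tostring(se, encoding='unicode'))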
def tag_text(text, dictionary):
    # Wrap each whitespace-separated chunk in an <se> element and tag every
    # Thai token inside it; tag_word and create_xml are defined elsewhere.
    result = [u"<body>"]
    sents = text.split()
    for i in sents:
        result.append(u"<se>")
        for j in pythai.split(i):
            result.append(tag_word(j, dictionary))
        result.append(u"</se>")
    result.append(u"</body>")
    return create_xml(result)
def test_split(self):
    for sentence in self.test_sentences:
        six.print_(sentence.split, ' '.join(pythai.split(sentence.sentence)))
        self.assertEqual(' '.join(pythai.split(sentence.sentence)),
                         sentence.split)
def test_split(self):
    for sentence in self.test_sentences:
        print(sentence.split, ' '.join(pythai.split(sentence.sentence)))
        self.assertEqual(' '.join(pythai.split(sentence.sentence)),
                         sentence.split)
def pos(string):
    # Despite the name, this only tokenizes: pythai.split returns words,
    # not POS tags.
    tokens = pythai.split(string)
    return tokens
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pythai

print(pythai.split(u"การที่ได้ต้องแสดงว่างานดี"))
print(pythai.split(u"ฉันกินข้าว"))
print(pythai.split(u"ฉันwantกินseeข้าว"))
import pythai

print(pythai.split(u"การที่ได้ต้องแสดงว่างานดี"))
def textToSegmentedList(sentence):
    # change later
    # return sentence.split(u' ')
    return pythai.split(sentence)
def pythai_split(u, limit=1):
    """Split Thai text into words using PyThai.

    Note: the `limit` parameter is currently accepted but unused.
    """
    return pythai.split(u)
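# Usage sketch: because `limit` is ignored by the wrapper, both calls
# below return the same token list.
print(pythai_split(u'ฉันกินข้าว'))
print(pythai_split(u'ฉันกินข้าว', limit=5))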
def token_iterator(sentence):
    tokens = pythai.split(sentence)
    for token in tokens:
        yield token
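# Usage sketch: iterate over the generator to consume Thai tokens one at
# a time.
for token in token_iterator(u'การที่ได้ต้องแสดงว่างานดี'):
    print(token)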