Example #1
def split(sent):
    # Fall back to plain whitespace splitting if the tokenizer is unavailable.
    if Global is None:
        return sent.split()
    # The tokenizer expects GBK input, so re-encode the UTF-8 byte string.
    sent = sent.decode('utf-8', 'ignore').encode('gbk', 'ignore')
    # GetTokenPos yields (word, pos) pairs; convert each word back to UTF-8.
    tuples = [(word.decode('gbk').encode('utf-8'), pos)
              for word, pos in Global.GetTokenPos(sent)]
    # Keep only the words, discarding the part-of-speech tags.
    return [each[0] for each in tuples]
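The decode/encode chains in these examples are the Python 2 idiom for driving a tokenizer that only accepts GBK byte strings: UTF-8 input is re-encoded to GBK on the way in, and the segmented words are converted back to UTF-8 on the way out. A minimal sketch of that round-trip in isolation (the byte literal is chosen only for illustration):

def gbk_round_trip(utf8_text):
    # 'ignore' silently drops characters with no GBK mapping instead of raising.
    gbk_text = utf8_text.decode('utf-8', 'ignore').encode('gbk', 'ignore')
    # ...Global.GetTokenPos(gbk_text) would segment this and return GBK words...
    return gbk_text.decode('gbk').encode('utf-8')   # back to UTF-8 for callers

sample = '\xe4\xbd\xa0\xe5\xa5\xbd hello'           # "你好 hello" as UTF-8 bytes
assert gbk_round_trip(sample) == sample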
Example #2
def tokenize_word(sentence, need_pos=False):
    # Re-encode the UTF-8 input as GBK for the tokenizer, then collect
    # its (word, pos) pairs.
    tuples = [(w, p) for w, p in Global.GetTokenPos(
        sentence.decode('utf-8', 'ignore').encode('gbk', 'ignore'))]
    res = []
    pos = []
    for t in tuples:
        if t[1] in RepSet:
            # Replace tokens whose tag is in RepSet with its placeholder.
            res.append(RepSet[t[1]])
        else:
            # Otherwise convert the GBK word back to UTF-8.
            res.append(t[0].decode('gbk', 'ignore').encode('utf-8', 'ignore'))
        if need_pos:
            pos.append(t[1])
    return res, pos
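Both examples assume two module-level names defined elsewhere: Global, the handle to the GBK-based tokenizer, and RepSet, a mapping from part-of-speech tags to replacement placeholders. A hedged usage sketch with stand-ins for both (StubTokenizer and the RepSet contents are hypothetical, not part of the original code), assuming it runs in the same module as tokenize_word:

class StubTokenizer(object):
    def GetTokenPos(self, sent_gbk):
        # The real backend segments Chinese text; this stub just splits on
        # whitespace and tags digit tokens 'm' (number), the rest 'n' (noun).
        return [(w, 'm' if w.isdigit() else 'n') for w in sent_gbk.split()]

Global = StubTokenizer()          # stand-in for the real tokenizer handle
RepSet = {'m': '<NUM>'}           # hypothetical: mask numbers with a placeholder

words, tags = tokenize_word('price 42 yuan', need_pos=True)
print words   # ['price', '<NUM>', 'yuan']
print tags    # ['n', 'm', 'n']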