def split(sent):
    """Tokenize *sent* (a UTF-8 byte string) into a list of word tokens.

    Falls back to plain whitespace splitting when the external ``Global``
    tokenizer is not available.  The tokenizer operates on GBK text, so
    the input is transcoded UTF-8 -> GBK on the way in and each token is
    transcoded GBK -> UTF-8 on the way out (Python 2 str semantics).
    """
    # Fix: compare against the None singleton with `is`, not `==`.
    if Global is None:
        return sent.split()
    # Tokenizer expects GBK input; undecodable bytes are silently dropped.
    sent = sent.decode('utf-8', 'ignore').encode('gbk', 'ignore')
    # GetTokenPos yields (word, pos) pairs; the POS tag is intentionally
    # discarded here -- only the word surface forms are returned.
    return [word.decode("gbk").encode("utf-8")
            for word, pos in Global.GetTokenPos(sent)]
def tokenize_word(sentence, need_pos=False):
    """Tokenize *sentence* (UTF-8 bytes) and return ``(tokens, pos_tags)``.

    Each token whose POS tag appears in ``RepSet`` is replaced by the
    mapped placeholder; other tokens are transcoded back from GBK to
    UTF-8.  ``pos_tags`` is populated only when *need_pos* is true,
    otherwise it is returned empty.
    """
    # The tokenizer works on GBK text; transcode the UTF-8 input first.
    gbk_sentence = sentence.decode('utf-8', "ignore").encode('gbk', 'ignore')
    tuples = [(w, p) for w, p in Global.GetTokenPos(gbk_sentence)]
    res = []
    pos = []
    for word, tag in tuples:
        if tag in RepSet:
            # Replace the surface form with its placeholder for this tag.
            res.append(RepSet[tag])
        else:
            res.append(word.decode('gbk', 'ignore').encode('utf8', 'ignore'))
        if need_pos:
            pos.append(tag)
    return res, pos