def seg_word(self, word, mode="BASIC"):
    """Segment a GBK-encoded string and return its tokens as a list."""
    ret = wordseg.scw_segment_words(WordSeg.m_dict_handle, self.m_result_handle,
                                    word, len(word), 1)
    if ret < 0:
        return None
    token_count = wordseg.scw_get_token_1(self.m_result_handle, self.m_mode[mode],
                                          self.m_token_handle, self.m_max_term_count)
    token_list = wordseg.tokens_to_list(self.m_token_handle, token_count)
    # token[7] holds the surface string of each token
    ts = []
    for token in token_list:
        ts.append(token[7])
    return ts
def tokenizeString(self, text, encoding='utf8', seg_type='WPCOMP'):
    """
    # @Synopsis  tokenize a given text string, returning each token together
    #            with its part-of-speech tag (pos)
    #
    # @Args text      string to be tokenized
    # @Args encoding  supports utf8, gbk and unicode
    # @Args seg_type  basic or compound (WPCOMP) segmentation mode
    #
    # @Returns dict{'errno': error number, 'data': [(token, pos)]}
    """
    ret = {
        'errno': 0,
        'data': [],
    }
    if len(text) == 0:
        return ret
    try:
        # the underlying wordseg library expects GBK input
        if encoding == 'utf8':
            text = text.decode('utf8', errors='ignore').encode('gbk')
        elif encoding == 'unicode':
            text = text.encode('gbk')
        data = []
        wordseg.scw_segment_words(self.scw_worddict, self.scw_out,
                                  text, len(text), 1)
        token_cnt = wordseg.scw_get_token_1(self.scw_out,
                                            self.SEG_TYPE_DICT[seg_type],
                                            self.tokens, self.MAX_TERM_CNT)
        tokens = wordseg.tokens_to_list(self.tokens, token_cnt)
        # part-of-speech tagging on the segmented tokens
        token_cnt = postag.tag_postag(self.scw_tagdict, self.tokens, token_cnt)
        postag_ret = postag.print_tags(self.tokens, token_cnt)
        for token, pos in postag_ret:
            token = token.decode('gbk', 'ignore')
            data.append((token, pos))
        ret['data'] = data
        return ret
    except Exception as e:
        print e.message
        if encoding == 'unicode':
            print text.encode('utf8')
        else:
            print text.decode(encoding).encode('utf8')
        ret['errno'] = 1
        return ret
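# Hypothetical usage sketch for tokenizeString(): the enclosing wrapper class and
# its handle setup (scw_worddict, scw_tagdict, scw_out, tokens, SEG_TYPE_DICT,
# MAX_TERM_CNT) are not shown above, so the class name below is assumed.
#
#   seg = WordSegWrapper()   # assumed wrapper holding the loaded dictionaries
#   res = seg.tokenizeString("some utf8 text", encoding='utf8', seg_type='WPCOMP')
#   if res['errno'] == 0:
#       for token, pos in res['data']:
#           print token.encode('utf8'), pos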
def seg_word(self, word, mode="BASIC"): ret = wordseg.scw_segment_words(WordSeg.m_dict_handle, self.m_result_handle, word, len(word), 1) if ret < 0: return None token_count = wordseg.scw_get_token_1(self.m_result_handle, self.m_mode[mode], self.m_token_handle, self.m_max_term_count) l = wordseg.tokens_to_list(self.m_token_handle, token_count) ts = [] for token in l: ts.append(token[7]) return ts
def get_words(self, content):
    '''
    @brief  Get the word segmentation result
    @param  content  text content
    @return the segmentation result, returned as a list
    '''
    LANGTYPE_SIMP_CHINESE = 1  # language type, 1 means simplified Chinese; see ul_ccode.h
    succ = 1
    if (succ == wordseg.scw_segment_words(self.dict_handle, self.result_handle,
                                          content, LANGTYPE_SIMP_CHINESE)):
        token_count = wordseg.scw_get_token_1(self.result_handle, wordseg.SCW_BASIC,
                                              self.token_handle, self.max_term_count)
        token_list = wordseg.tokens_to_list(self.token_handle, token_count)
        word_list = [token[7] for token in token_list]
        return word_list
    else:
        log.warning("[Segment Word Fail!] func=scw_segment_words, content=%s", content)
        return []
import os
import sys

import wordseg
import postag
import wordrank


class Tokenizer():

    def __init__(self):
        print >> sys.stderr, "Tokenizer constructed"
        self.maxTermCount = 2048
        dict_ab_url = os.path.dirname(os.path.abspath(__file__)) + "/dict"
        #print dict_ab_url
        # load dictionaries
        #print os.path.join(dict_ab_url, "worddict")
        self.hWordDict = wordseg.scw_load_worddict(
            os.path.join(dict_ab_url, "worddict"))
        self.hTagDict = postag.tag_create(os.path.join(dict_ab_url, "tagdict"))
        # hNerDict = wordner.ner_dict_load(os.path.join(dict_ab_url, "nerdict"))
        self.hRankDict = wordrank.wdr_create(
            os.path.join(dict_ab_url, "rankdict"))
        self.hScwOut = wordseg.scw_create_out(self.maxTermCount * 10)
        # hNerOut = wordner.ner_out_create(hNerDict, self.maxTermCount)
        self.hRanks = wordrank.create_ranks(self.maxTermCount)
        # token buffer
        self.hTokens = wordseg.create_tokens(self.maxTermCount)
        self.hTokens = wordseg.init_tokens(self.hTokens, self.maxTermCount)
        # named-entity whitelist (proper-noun filter)
        self.nerWhiteTags = set([
            "PER",          # person name
            #"LOC",         # place name
            #"ORG",         # organization
            #"SFT",         # software
            "GME",          # game
            "SNG",          # song
            #"NVL",         # novel
            "VDO",          # video
            "BRD",          # brand
            "CTN",          # animation / comics
            "VDO_MVE",      # movie
            "VDO_TV",       # TV series
            "VDO_TVSHOW"    # TV show
        ])

    def __del__(self):
        wordrank.destroy_ranks(self.hRanks)
        wordseg.destroy_tokens(self.hTokens)
        # wordner.ner_out_destroy(Tokenize.hNerOut)
        wordseg.scw_destroy_out(self.hScwOut)
        wordrank.wdr_destroy(self.hRankDict)
        # wordner.ner_dict_destroy(Tokenize.hNerDict)
        postag.tag_destroy(self.hTagDict)
        wordseg.scw_destroy_worddict(self.hWordDict)
        print >> sys.stderr, "Tokenizer destroyed"

    def tokenize_string(self, text, coding="utf8", segType=SEG_DEFAULT):
        ret = {"error": 0, "reason": "", "ret": [], "text": text}
        try:
            # the wordseg library expects GBK input
            if coding == "utf8":
                text = text.decode("utf8").encode("gbk")
            elif coding == 'unicode':
                text = text.encode('gbk')
            segRes = []
            # word segmentation
            if len(text) == 0 or not isinstance(text, str):
                return ret
            wordseg.scw_segment_words(self.hWordDict, self.hScwOut, text,
                                      len(text), 1)
            # NOTE: a malformed Chinese encoding makes the call above throw
            # if 0 > wordseg.scw_segment_words(Tokenize.hWordDict, Tokenize.hScwOut, text, 1):
            #     ret["error"] = 1
            #     ret["reason"] = "scw_segment_words failed"
            #     return ret
        except Exception as e:
            ret["error"] = 1
            ret["reason"] = "scw_segment_words failed"
            return ret
        tokensLen = wordseg.scw_get_token_1(self.hScwOut, segType, self.hTokens,
                                            self.maxTermCount)
        tokensList = wordseg.tokens_to_list(self.hTokens, tokensLen)
        # named-entity recognition (disabled)
        # if 0 > wordner.ner_tag(Tokenize.hNerDict, Tokenize.hTokens, tokensLen, Tokenize.hNerOut, langid):
        #     print >> sys.stderr, "WARNING: ner_tag failed"
        #     return segRes, nerRes
        #
        # gran = 2
        # nerRes = wordner.get_tag_list(Tokenize.hNerOut, Tokenize.hTokens, tokensLen, gran)
        # nerRes = [(term, wordner.get_type_name(Tokenize.hNerDict, langid, nerTag)) for term, nerTag in nerRes]
        # nerRes = [(term, nerTag) for term, nerTag in nerRes if nerTag in Tokenize.nerWhiteTags]
        #tokensLen = wordrank.get_nertokens(Tokenize.hScwOut, Tokenize.hNerOut, Tokenize.hTokens, Tokenize.maxTermCount)
        #tokensList = wordseg.tokens_to_list(Tokenize.hTokens, tokensLen)
        # part-of-speech tagging
        tokensLen = postag.tag_postag(self.hTagDict, self.hTokens, tokensLen)
        postagRes = postag.print_tags(self.hTokens, tokensLen)
        position = 0
        for token, pos in postagRes:
            token = token.decode('gbk', 'ignore')
            segRes.append([token, pos, position])
            position += len(token)
        ret["ret"] = segRes
        #return segRes
        return ret
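# Minimal usage sketch for the Tokenizer class above, assuming the dict/ directory
# (worddict, tagdict, rankdict) sits next to this file and SEG_DEFAULT is defined
# by the surrounding module:
if __name__ == "__main__":
    tokenizer = Tokenizer()
    res = tokenizer.tokenize_string("some utf8 text", coding="utf8")
    if res["error"] == 0:
        for token, pos, position in res["ret"]:
            print token.encode("utf8"), pos, position
    else:
        print >> sys.stderr, res["reason"]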
dict_handle = wordseg.scw_load_worddict("./dict/wordseg_dict/")
result_handle = wordseg.scw_create_out(MAX_TERM_COUNT)
token_handle = wordseg.create_tokens(MAX_TERM_COUNT)
token_handle = wordseg.init_tokens(token_handle, MAX_TERM_COUNT)

for query, title, label in pos:
    for char in strip_chars:
        query = query.strip(char)
        title = title.strip(char)
    query_title = []
    for line in [query, title]:
        wordseg.scw_segment_words(dict_handle, result_handle, line, 1)
        token_count = wordseg.scw_get_token_1(result_handle, wordseg.SCW_WPCOMP,
                                              token_handle, MAX_TERM_COUNT)
        query_title.append([
            token[7] for token in
            wordseg.tokens_to_list(token_handle, token_count)
        ])
    query = " ".join(query_title[0])
    title = " ".join(query_title[1])
    final.append([query, title, label])

wordseg.destroy_tokens(token_handle)
wordseg.scw_destroy_out(result_handle)
wordseg.scw_destroy_worddict(dict_handle)

for query, title, label in final: