# NOTE(review): this fragment starts part-way through a function body.  The
# enclosing ``def`` line and the initialisation of ``stop_words`` are not
# visible here -- presumably this is ``__init_stop_words`` (called just
# below); confirm against the full file.
    # Collect the keys of each stop-word dictionary; the values (``v``) are
    # not used.
    for t, v in get_dict(DICTS.EXT_STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOP_SENTENCE).iteritems():
        stop_words.append(t)
    # Freeze the combined keys into an immutable set.
    STOP_WORDS = frozenset(stop_words)

# Run the initialiser so STOP_WORDS is populated.
__init_stop_words()

# Matches a run of one or more characters in the range U+4E00-U+9FA5
# (CJK unified ideographs).  It is applied with ``.match()`` below, so only
# the start of a word is tested.
accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

# Shared cutter used by ChineseTokenizer.  Stage 1 is given a regex that
# matches runs of digits or runs of ASCII letters; two further stages are
# added for surnames and suffixes.
# NOTE(review): what each stage does exactly is defined by Cuttor,
# SurnameCutting and SuffixCutting, which are not visible here.
_cuttor = Cuttor()
_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))
_cuttor.add_stage(SurnameCutting())
_cuttor.add_stage(SuffixCutting())


# Tokenizer that splits text with the module-level ``_cuttor``.
class ChineseTokenizer(Tokenizer):
    # Tokenize ``text``; extra keyword arguments in ``kargs`` are accepted
    # but not used in the visible part of this method.
    def __call__(self, text, **kargs):
        # Each item is unpacked as (word, start position, stop position).
        words = _cuttor.tokenize(text, search=True)
        # A single Token instance is created once and mutated for every word.
        token = Token()
        for (w, start_pos, stop_pos) in words:
            # Words that do not start with a CJK character are kept only if
            # they are longer than one character; single characters are
            # skipped.
            if not accepted_chars.match(w):
                if len(w) > 1:
                    pass
                else:
                    continue
            token.original = token.text = w
            token.pos = start_pos
            # NOTE(review): the rest of this method lies beyond this fragment.
# NOTE(review): this fragment starts inside a function body whose ``def``
# line is not visible -- presumably ``__init_stop_words`` (called just
# below); confirm against the full file.
    # Rebind the module-level STOP_WORDS rather than a local name.
    global STOP_WORDS
    stop_words = []
    # Collect the keys of each stop-word dictionary; the values (``v``) are
    # not used.
    for t,v in get_dict(DICTS.EXT_STOPWORD).iteritems():
        stop_words.append(t)
    for t,v in get_dict(DICTS.STOPWORD).iteritems():
        stop_words.append(t)
    for t,v in get_dict(DICTS.STOP_SENTENCE).iteritems():
        stop_words.append(t)
    # Freeze the combined keys into an immutable set.
    STOP_WORDS = frozenset(stop_words)

# Run the initialiser so STOP_WORDS is populated.
__init_stop_words()

# Matches a run of one or more characters in the range U+4E00-U+9FA5
# (CJK unified ideographs).  It is applied with ``.match()`` below, so only
# the start of a word is tested.
accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

# Shared cutter used by ChineseTokenizer.  Stage 1 is given a regex that
# matches runs of digits or runs of ASCII letters; two further stages are
# added for surnames and suffixes.
# NOTE(review): what each stage does exactly is defined by Cuttor,
# SurnameCutting and SuffixCutting, which are not visible here.
_cuttor = Cuttor()
_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))
_cuttor.add_stage(SurnameCutting())
_cuttor.add_stage(SuffixCutting())


# Tokenizer that splits text with the module-level ``_cuttor``.
class ChineseTokenizer(Tokenizer):
    # Tokenize ``text``; extra keyword arguments in ``kargs`` are accepted
    # but not used in the visible part of this method.
    def __call__(self,text,**kargs):
        # Each item is unpacked as (word, start position, stop position).
        words = _cuttor.tokenize(text, search=True)
        # A single Token instance is created once and mutated for every word.
        token = Token()
        for (w,start_pos,stop_pos) in words:
            # Words that do not start with a CJK character are kept only if
            # they are longer than one character; single characters are
            # skipped.
            if not accepted_chars.match(w):
                if len(w)>1:
                    pass
                else:
                    continue
            token.original = token.text = w
            # Both the token position and its start character offset are set
            # from the word's start position.
            token.pos = start_pos
            token.startchar = start_pos
            # NOTE(review): the rest of this method lies beyond this fragment.
# Configure the cutting stages of ``cuttor``.
# NOTE(review): ``cuttor`` is created before this fragment (not visible here).
#cuttor.set_topk(3)

# Use stage 1 to cut English words and numbers.
# The pattern is a raw string so that ``\d`` reaches the regex engine
# unchanged instead of relying on an unrecognised string escape.
cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

# Or use stage 2 to cut English words and numbers
#cuttor.add_stage(RegexCutting(re.compile(r'\d+', re.I|re.U)))
#cuttor.add_stage(RegexCutting(re.compile(r'[a-zA-Z]+', re.I|re.U)))

# Use stage 3 to cut Chinese names
#surname = SurnameCutting()
#cuttor.add_stage(surname)

# Or use stage 4 to cut Chinese names
surname = SurnameCutting2()
cuttor.add_stage(surname)

# Use stage 4 to cut Chinese addresses or English names
suffix = SuffixCutting()
cuttor.add_stage(suffix)

#seglist = cuttor.cut(str)
#print '\nCut with name \n%s\n' % ','.join(list(seglist))

#seglist = cuttor.cut_topk(str, 3)
#for seg in seglist:
#    print ','.join(seg)

#for s in cuttor.cut_to_sentence(str):
#    print s
# Register the cutting stages on ``cuttor``.
# NOTE(review): ``cuttor`` is defined earlier in the script, outside this
# fragment.
#cuttor.set_topk(3)

# Use stage 1 to cut English words and numbers.
# A raw string keeps the ``\d`` escape intact for the regex engine; the
# compiled pattern is identical to the non-raw spelling.
cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

# Or use stage 2 to cut English words and numbers
#cuttor.add_stage(RegexCutting(re.compile(r'\d+', re.I|re.U)))
#cuttor.add_stage(RegexCutting(re.compile(r'[a-zA-Z]+', re.I|re.U)))

# Use stage 3 to cut Chinese names
#surname = SurnameCutting()
#cuttor.add_stage(surname)

# Or use stage 4 to cut Chinese names
surname = SurnameCutting2()
cuttor.add_stage(surname)

# Use stage 4 to cut Chinese addresses or English names
suffix = SuffixCutting()
cuttor.add_stage(suffix)

#seglist = cuttor.cut(str)
#print '\nCut with name \n%s\n' % ','.join(list(seglist))

#seglist = cuttor.cut_topk(str, 3)
#for seg in seglist:
#    print ','.join(seg)

#for s in cuttor.cut_to_sentence(str):
#    print s