Exemplo n.º 1
0
    for t, v in get_dict(DICTS.EXT_STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOP_SENTENCE).iteritems():
        stop_words.append(t)
    STOP_WORDS = frozenset(stop_words)


# Build the module-level STOP_WORDS frozenset once, when the module loads.
__init_stop_words()

# Matches a run of characters in the range U+4E00..U+9FA5.
# Spelled u"..." instead of ur"...": the resulting pattern is identical,
# but the ``ur`` prefix is rejected by the Python 3 parser.
accepted_chars = re.compile(u"[\u4E00-\u9FA5]+")

# Shared cutter instance used by ChineseTokenizer below.
_cuttor = Cuttor()
# Stage 1 pattern: runs of digits or runs of ASCII letters.  A raw string
# keeps ``\d`` intact for the regex engine and avoids the invalid-escape
# warning that a plain string literal triggers on newer interpreters.
_cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))
_cuttor.add_stage(SurnameCutting())
_cuttor.add_stage(SuffixCutting())


class ChineseTokenizer(Tokenizer):
    def __call__(self, text, **kargs):
        words = _cuttor.tokenize(text, search=True)
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w):
                if len(w) > 1:
                    pass
                else:
                    continue
            token.original = token.text = w
            token.pos = start_pos
Exemplo n.º 2
0
    global STOP_WORDS
    stop_words = []
    for t,v in get_dict(DICTS.EXT_STOPWORD).iteritems():
        stop_words.append(t)
    for t,v in get_dict(DICTS.STOPWORD).iteritems():
        stop_words.append(t)
    for t,v in get_dict(DICTS.STOP_SENTENCE).iteritems():
        stop_words.append(t)
    STOP_WORDS = frozenset(stop_words)
# Build the module-level STOP_WORDS frozenset once, when the module loads.
__init_stop_words()

# Matches a run of characters in the range U+4E00..U+9FA5.
# Spelled u"..." instead of ur"...": the resulting pattern is identical,
# but the ``ur`` prefix is rejected by the Python 3 parser.
accepted_chars = re.compile(u"[\u4E00-\u9FA5]+")

# Shared cutter instance used by ChineseTokenizer below.
_cuttor = Cuttor()
# Stage 1 pattern: runs of digits or runs of ASCII letters.  A raw string
# keeps ``\d`` intact for the regex engine and avoids the invalid-escape
# warning that a plain string literal triggers on newer interpreters.
_cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))
_cuttor.add_stage(SurnameCutting())
_cuttor.add_stage(SuffixCutting())

class ChineseTokenizer(Tokenizer):
    def __call__(self,text,**kargs):
        words = _cuttor.tokenize(text, search=True)
        token  = Token()
        for (w,start_pos,stop_pos) in words:
            if not accepted_chars.match(w):
                if len(w)>1:
                    pass
                else:
                    continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
Exemplo n.º 3
0
#cuttor.set_topk(3)

# Stage 1: cut English words and numbers with a single regex.
# Raw string literal so that ``\d`` reaches the regex engine unchanged
# (a plain literal raises an invalid-escape warning on newer interpreters).
cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

# Alternative (disabled): cut numbers and English words as separate
# RegexCutting stages instead of the stage-1 regex.
#cuttor.add_stage(RegexCutting(re.compile(r'\d+', re.I|re.U)))
#cuttor.add_stage(RegexCutting(re.compile(r'[a-zA-Z]+', re.I|re.U)))

# Chinese-name cutting, first variant (disabled): SurnameCutting.
#surname = SurnameCutting()
#cuttor.add_stage(surname)

# Chinese-name cutting, second variant (enabled): SurnameCutting2.
surname = SurnameCutting2()
cuttor.add_stage(surname)

# Suffix stage: per the original note, cuts Chinese addresses or
# English names.
suffix = SuffixCutting()
cuttor.add_stage(suffix)

#seglist = cuttor.cut(str)
#print '\nCut with name \n%s\n' % ','.join(list(seglist))

#seglist = cuttor.cut_topk(str, 3)
#for seg in seglist:
#    print ','.join(seg)

#for s in cuttor.cut_to_sentence(str):
#    print s
Exemplo n.º 4
0
#cuttor.set_topk(3)

# Stage 1: cut English words and numbers with a single regex.
# Raw string literal so that ``\d`` reaches the regex engine unchanged
# (a plain literal raises an invalid-escape warning on newer interpreters).
cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

# Alternative (disabled): cut numbers and English words as separate
# RegexCutting stages instead of the stage-1 regex.
#cuttor.add_stage(RegexCutting(re.compile(r'\d+', re.I|re.U)))
#cuttor.add_stage(RegexCutting(re.compile(r'[a-zA-Z]+', re.I|re.U)))

# Chinese-name cutting, first variant (disabled): SurnameCutting.
#surname = SurnameCutting()
#cuttor.add_stage(surname)

# Chinese-name cutting, second variant (enabled): SurnameCutting2.
surname = SurnameCutting2()
cuttor.add_stage(surname)

# Suffix stage: per the original note, cuts Chinese addresses or
# English names.
suffix = SuffixCutting()
cuttor.add_stage(suffix)

#seglist = cuttor.cut(str)
#print '\nCut with name \n%s\n' % ','.join(list(seglist))

#seglist = cuttor.cut_topk(str, 3)
#for seg in seglist:
#    print ','.join(seg)

#for s in cuttor.cut_to_sentence(str):
#    print s