Python Cuttor.add_stage示例

编程语言: Python

命名空间/包名称: yaha

类/类型: Cuttor

方法/功能: add_stage

hotexamples.com的示例: 4

Python Cuttor.add_stage - 共找到 4 个示例。以下是从开源项目中提取的、评价最高的 yaha.Cuttor.add_stage 真实 Python 代码示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示隐藏

Cuttor(8)

set_stage1_regex(7)

cut_to_sentence(4)

cut(3)

add_stage(2)

cut_all(1)

exist(1)

tokenize(1)

word_type(1)

示例#1

显示文件

文件： analyzer.py 项目： zmjm4/yaha

    for t, v in get_dict(DICTS.EXT_STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOP_SENTENCE).iteritems():
        stop_words.append(t)
    STOP_WORDS = frozenset(stop_words)


__init_stop_words()

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

_cuttor = Cuttor()
_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))
_cuttor.add_stage(SurnameCutting())
_cuttor.add_stage(SuffixCutting())


class ChineseTokenizer(Tokenizer):
    def __call__(self, text, **kargs):
        words = _cuttor.tokenize(text, search=True)
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w):
                if len(w) > 1:
                    pass
                else:
                    continue
            token.original = token.text = w
            token.pos = start_pos

示例#2

显示文件

文件： analyzer.py 项目： ZoeyYoung/Bookmarks_Cloud

    global STOP_WORDS
    stop_words = []
    for t,v in get_dict(DICTS.EXT_STOPWORD).iteritems():
        stop_words.append(t)
    for t,v in get_dict(DICTS.STOPWORD).iteritems():
        stop_words.append(t)
    for t,v in get_dict(DICTS.STOP_SENTENCE).iteritems():
        stop_words.append(t)
    STOP_WORDS = frozenset(stop_words)
__init_stop_words()

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

_cuttor = Cuttor()
_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))
_cuttor.add_stage(SurnameCutting())
_cuttor.add_stage(SuffixCutting())

class ChineseTokenizer(Tokenizer):
    def __call__(self,text,**kargs):
        words = _cuttor.tokenize(text, search=True)
        token  = Token()
        for (w,start_pos,stop_pos) in words:
            if not accepted_chars.match(w):
                if len(w)>1:
                    pass
                else:
                    continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos

示例#3

显示文件

文件： test_cuttor.py 项目： ouyanghuangzheng/yaha

#cuttor.set_topk(3)

# Stage 1: cut English words and numbers with a single regex.
# Raw string so that \d is handed to the regex engine verbatim.
cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I|re.U))

# Alternative to stage 1: cut numbers and English words with separate
# RegexCutting stages.
#cuttor.add_stage(RegexCutting(re.compile(r'\d+', re.I|re.U)))
#cuttor.add_stage(RegexCutting(re.compile(r'[a-zA-Z]+', re.I|re.U)))

# Cut Chinese names with SurnameCutting (disabled here).
#surname = SurnameCutting()
#cuttor.add_stage(surname)

# Alternative (enabled): cut Chinese names with SurnameCutting2.
surname = SurnameCutting2()
cuttor.add_stage(surname)

# Finally, add a stage to cut Chinese addresses or English names.
suffix = SuffixCutting()
cuttor.add_stage(suffix)

#seglist = cuttor.cut(str)
#print '\nCut with name \n%s\n' % ','.join(list(seglist))

#seglist = cuttor.cut_topk(str, 3)
#for seg in seglist:
#    print ','.join(seg)

#for s in cuttor.cut_to_sentence(str):
#    print s

示例#4

显示文件

#cuttor.set_topk(3)

# Stage 1: cut English words and numbers with a single regex.
# Raw string so that \d is handed to the regex engine verbatim.
cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

# Alternative to stage 1: cut numbers and English words with separate
# RegexCutting stages.
#cuttor.add_stage(RegexCutting(re.compile(r'\d+', re.I|re.U)))
#cuttor.add_stage(RegexCutting(re.compile(r'[a-zA-Z]+', re.I|re.U)))

# Cut Chinese names with SurnameCutting (disabled here).
#surname = SurnameCutting()
#cuttor.add_stage(surname)

# Alternative (enabled): cut Chinese names with SurnameCutting2.
surname = SurnameCutting2()
cuttor.add_stage(surname)

# Finally, add a stage to cut Chinese addresses or English names.
suffix = SuffixCutting()
cuttor.add_stage(suffix)

#seglist = cuttor.cut(str)
#print '\nCut with name \n%s\n' % ','.join(list(seglist))

#seglist = cuttor.cut_topk(str, 3)
#for seg in seglist:
#    print ','.join(seg)

#for s in cuttor.cut_to_sentence(str):
#    print s