Exemplo n.º 1
0
def __cut_DAG(sentence):
    """Yield ``pair(word, tag)`` items for ``sentence`` along the jieba route.

    The route table is filled in by ``jieba.calc``; ``route[i][1]`` is read
    as the inclusive end index of the chunk that starts at ``i``.
    Single-character chunks are accumulated in a buffer.  When a longer
    chunk is reached (or the sentence ends) the buffer is flushed first:
    a one-character buffer is tagged through ``word_tag_tab`` (default
    ``'x'``), a longer buffer is delegated to ``__cut``.
    """
    dag = jieba.get_DAG(sentence)
    best_route = {}
    jieba.calc(sentence, dag, 0, route=best_route)

    pending = u''
    start = 0
    total = len(sentence)
    while start < total:
        end = best_route[start][1] + 1
        chunk = sentence[start:end]

        if end - start == 1:
            # Lone character: keep collecting until a multi-char word shows up.
            pending += chunk
            start = end
            continue

        # Flush whatever single characters were collected before this word.
        if len(pending) == 1:
            yield pair(pending, word_tag_tab.get(pending, 'x'))
        elif pending:
            for tagged in __cut(pending):
                yield tagged
        pending = u''

        yield pair(chunk, word_tag_tab.get(chunk, 'x'))
        start = end

    # Trailing buffer left over once the sentence is exhausted.
    if len(pending) == 1:
        yield pair(pending, word_tag_tab.get(pending, 'x'))
    elif pending:
        for tagged in __cut(pending):
            yield tagged
Exemplo n.º 2
0
def __cut_DAG(sentence):
    """Segment ``sentence`` along the jieba route and yield tagged pairs.

    Chunks are read from the route computed by ``jieba.calc``
    (``route[i][1]`` is used as the inclusive end index of the chunk
    starting at ``i``).  Runs of single-character chunks are buffered and
    flushed before the next multi-character chunk and at the end of the
    sentence: a one-character run is looked up in ``word_tag_tab``
    (default tag ``'x'``), a longer run is passed to ``__cut_detail``.
    """
    def _flush(run):
        # Emit the tagged items for a non-empty run of buffered characters.
        if len(run) == 1:
            yield pair(run, word_tag_tab.get(run, 'x'))
        else:
            for tagged in __cut_detail(run):
                yield tagged

    dag = jieba.get_DAG(sentence)
    best_route = {}
    jieba.calc(sentence, dag, 0, route=best_route)

    pos = 0
    singles = u''
    length = len(sentence)
    while pos < length:
        nxt = best_route[pos][1] + 1
        word = sentence[pos:nxt]
        if nxt - pos == 1:
            singles += word
        else:
            if singles:
                for item in _flush(singles):
                    yield item
                singles = u''
            yield pair(word, word_tag_tab.get(word, 'x'))
        pos = nxt

    if singles:
        for item in _flush(singles):
            yield item
Exemplo n.º 3
0
    def __cut_DAG_NO_HMM(self, sentence):
        """Yield the segments of ``sentence`` without the HMM step.

        ``self.get_route`` is asked for candidate routes first.  When it
        returns more than one candidate, the ``'list'`` of the first
        candidate with the strictly highest positive ``'freq'`` is yielded
        item by item; if no candidate has a positive ``'freq'`` (or the
        winning list is empty) the sentence and candidates are printed and
        nothing is yielded.

        Otherwise the route from ``jieba.calc`` is walked directly:
        consecutive one-character chunks matching ``self.re_eng`` are
        merged into a single segment, every other chunk is yielded as is.
        """
        dag = jieba.get_DAG(sentence)
        candidates = self.get_route(sentence, dag)

        if len(candidates) > 1:
            best_freq = 0
            best_words = []
            for candidate in candidates:
                freq = candidate['freq']
                if freq > best_freq:
                    best_freq = freq
                    best_words = candidate['list']

            if best_words:
                for word in best_words:
                    yield word
            else:
                # No usable candidate: dump the inputs for diagnosis.
                print('[__cut_DAG_NO_HMM] sentence: ', sentence)
                print('[__cut_DAG_NO_HMM] my_route: ', candidates)

        else:
            best_route = {}
            jieba.calc(sentence, dag, best_route)

            pending = ''
            pos = 0
            length = len(sentence)
            while pos < length:
                nxt = best_route[pos][1] + 1
                chunk = sentence[pos:nxt]
                if self.re_eng.match(chunk) and len(chunk) == 1:
                    # Single matching character: glue it onto the current run.
                    pending += chunk
                else:
                    if pending:
                        yield pending
                        pending = ''
                    yield chunk
                pos = nxt
            if pending:
                yield pending
Exemplo n.º 4
0
def __cut_DAG(sentence):
    """Yield ``pair(word, tag)`` items for ``sentence`` along the jieba route.

    ``route[i][1]`` (filled by ``jieba.calc``) is used as the inclusive end
    index of the chunk starting at ``i``.  Single-character chunks are
    buffered; the buffer is flushed before each multi-character chunk and
    once more at the end of the sentence:

    * one buffered character -> tagged via ``word_tag_tab`` (default ``"x"``)
    * buffer present in ``jieba.FREQ`` -> each character tagged individually
    * otherwise -> delegated to ``__cut_detail``
    """
    dag = jieba.get_DAG(sentence)
    best_route = {}
    jieba.calc(sentence, dag, 0, route=best_route)

    singles = u""
    start = 0
    total = len(sentence)
    while start < total:
        end = best_route[start][1] + 1
        chunk = sentence[start:end]

        if end - start == 1:
            singles += chunk
            start = end
            continue

        # A multi-character word: flush the buffered characters first.
        if len(singles) == 1:
            yield pair(singles, word_tag_tab.get(singles, "x"))
        elif singles:
            if singles in jieba.FREQ:
                for ch in singles:
                    yield pair(ch, word_tag_tab.get(ch, "x"))
            else:
                for tagged in __cut_detail(singles):
                    yield tagged
        singles = u""

        yield pair(chunk, word_tag_tab.get(chunk, "x"))
        start = end

    # Flush whatever is still buffered after the last chunk.
    if len(singles) == 1:
        yield pair(singles, word_tag_tab.get(singles, "x"))
    elif singles:
        if singles in jieba.FREQ:
            for ch in singles:
                yield pair(ch, word_tag_tab.get(ch, "x"))
        else:
            for tagged in __cut_detail(singles):
                yield tagged
Exemplo n.º 5
0
def __cut_DAG(sentence):
    """Segment ``sentence`` along the jieba route, yielding tagged pairs.

    Multi-character chunks from the route are yielded directly with the
    tag found in ``word_tag_tab`` (default ``'x'``).  Single-character
    chunks are collected into a run that is flushed before the next
    multi-character chunk and at the end of the sentence; see ``_flush``
    for how a run is turned into pairs.
    """
    def _flush(run):
        # Turn a non-empty run of buffered characters into tagged pairs.
        if len(run) == 1:
            yield pair(run, word_tag_tab.get(run, 'x'))
        elif run in jieba.FREQ:
            # The run is a known entry: tag character by character.
            for ch in run:
                yield pair(ch, word_tag_tab.get(ch, 'x'))
        else:
            for tagged in __cut_detail(run):
                yield tagged

    dag = jieba.get_DAG(sentence)
    best_route = {}
    jieba.calc(sentence, dag, 0, route=best_route)

    run = ''
    pos = 0
    length = len(sentence)
    while pos < length:
        nxt = best_route[pos][1] + 1
        word = sentence[pos:nxt]
        if nxt - pos == 1:
            run += word
        else:
            if run:
                for item in _flush(run):
                    yield item
                run = ''
            yield pair(word, word_tag_tab.get(word, 'x'))
        pos = nxt

    if run:
        for item in _flush(run):
            yield item
Exemplo n.º 6
0
def __cut_DAG_NO_HMM(sentence):
    """Yield tagged pairs for ``sentence`` using only the jieba route.

    Chunks are taken from the route filled by ``jieba.calc``
    (``route[i][1]`` is the inclusive end index of the chunk at ``i``).
    Consecutive chunks matching ``re_eng1`` are concatenated and emitted
    as one ``pair(text, 'eng')``; every other chunk is emitted with the
    tag found in ``word_tag_tab`` (default ``'x'``).
    """
    dag = jieba.get_DAG(sentence)
    best_route = {}
    jieba.calc(sentence, dag, best_route)

    eng_run = ''
    pos = 0
    length = len(sentence)
    while pos < length:
        nxt = best_route[pos][1] + 1
        piece = sentence[pos:nxt]
        pos = nxt

        if re_eng1.match(piece):
            eng_run += piece
            continue

        # A non-matching chunk ends the current run, if there is one.
        if eng_run:
            yield pair(eng_run, 'eng')
            eng_run = ''
        yield pair(piece, word_tag_tab.get(piece, 'x'))

    if eng_run:
        yield pair(eng_run, 'eng')
Exemplo n.º 7
0
def __cut_DAG_NO_HMM(sentence):
    """Walk the jieba route over ``sentence`` and yield ``pair(word, tag)``.

    Each chunk spans ``sentence[i:route[i][1] + 1]``.  Chunks that match
    ``re_eng1`` are merged with their matching neighbours and yielded once
    with the tag ``'eng'``; all other chunks are yielded individually with
    the tag from ``word_tag_tab`` (``'x'`` when absent).
    """
    graph = jieba.get_DAG(sentence)
    path = {}
    jieba.calc(sentence, graph, path)

    merged = ''
    limit = len(sentence)
    cursor = 0
    while cursor < limit:
        stop = path[cursor][1] + 1
        token = sentence[cursor:stop]
        is_eng = re_eng1.match(token)
        if not is_eng:
            if merged:
                # Close the pending 'eng' run before the unrelated token.
                yield pair(merged, 'eng')
                merged = ''
            yield pair(token, word_tag_tab.get(token, 'x'))
        else:
            merged += token
        cursor = stop

    if merged:
        yield pair(merged, 'eng')
Exemplo n.º 8
0
def __cut_DAG(sentence):
    """Yield ``pair(word, tag)`` items for ``sentence`` along the jieba route.

    Chunks come from the route filled by ``jieba.calc``; the chunk at
    index ``i`` ends (inclusively) at ``route[i][1]``.  One-character
    chunks are buffered and flushed before every longer chunk and at the
    end of the sentence:

    * a single buffered character is tagged via ``word_tag_tab``;
    * a buffer with a truthy ``jieba.FREQ`` entry is tagged per character;
    * any other buffer is handed to ``__cut_detail``.

    The default tag for words missing from ``word_tag_tab`` is ``'x'``.
    """
    graph = jieba.get_DAG(sentence)
    path = {}
    jieba.calc(sentence, graph, path)

    singles = ''
    cursor = 0
    limit = len(sentence)
    while cursor < limit:
        stop = path[cursor][1] + 1
        token = sentence[cursor:stop]
        if stop - cursor != 1:
            if singles:
                if len(singles) == 1:
                    yield pair(singles, word_tag_tab.get(singles, 'x'))
                elif jieba.FREQ.get(singles):
                    for ch in singles:
                        yield pair(ch, word_tag_tab.get(ch, 'x'))
                else:
                    for tagged in __cut_detail(singles):
                        yield tagged
                singles = ''
            yield pair(token, word_tag_tab.get(token, 'x'))
        else:
            singles += token
        cursor = stop

    # Flush the characters still buffered when the sentence ends.
    if singles:
        if len(singles) == 1:
            yield pair(singles, word_tag_tab.get(singles, 'x'))
        elif jieba.FREQ.get(singles):
            for ch in singles:
                yield pair(ch, word_tag_tab.get(ch, 'x'))
        else:
            for tagged in __cut_detail(singles):
                yield tagged
Exemplo n.º 9
0
def __cut_DAG_NO_HMM(sentence):
    """Yield ``pair(word, tag)`` items for ``sentence`` using only the route.

    Chunks are read from the route filled by ``jieba.calc``
    (``route[i][1]`` is the inclusive end index of the chunk at ``i``).
    Consecutive one-character chunks that are ASCII letters or digits are
    merged and yielded once as ``pair(text, 'eng')``; every other chunk is
    yielded with the tag found in ``word_tag_tab`` (default ``'x'``).
    """
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)

    # The original ``ur'...'`` prefix is a SyntaxError on Python 3.  The
    # pattern has no backslashes, so a plain ``u'...'`` literal is the same
    # string and is accepted by both Python 2 and Python 3.3+.
    re_eng = re.compile(u'[a-zA-Z0-9]', re.U)

    x = 0
    N = len(sentence)
    buf = u''
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if re_eng.match(l_word) and len(l_word) == 1:
            buf += l_word
        else:
            if buf:
                yield pair(buf, 'eng')
                buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y

    if buf:
        yield pair(buf, 'eng')
Exemplo n.º 10
0
def __cut_DAG_NO_HMM(sentence):
    """Segment ``sentence`` along the jieba route and yield tagged pairs.

    The chunk starting at index ``i`` is ``sentence[i:route[i][1] + 1]``,
    where ``route`` is filled by ``jieba.calc``.  Runs of one-character
    chunks made of ASCII letters or digits are concatenated and yielded as
    a single ``pair(text, 'eng')``.  Any other chunk is yielded with its
    tag from ``word_tag_tab``, falling back to ``'x'``.
    """
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)

    # ``ur'...'`` literals do not parse on Python 3; ``u'...'`` denotes the
    # same pattern here (no escapes) and works on Python 2 and 3.3+.
    re_eng = re.compile(u'[a-zA-Z0-9]', re.U)

    x = 0
    N = len(sentence)
    buf = u''
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if re_eng.match(l_word) and len(l_word) == 1:
            # Single ASCII alphanumeric character: extend the current run.
            buf += l_word
        else:
            if buf:
                yield pair(buf, 'eng')
                buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y

    if buf:
        yield pair(buf, 'eng')
Exemplo n.º 11
0
 def cutc(self,content):
     # NOTE(review): the visible body only calls jieba.calc() with no
     # arguments and never uses `content`; this snippet looks truncated,
     # so confirm the intended arguments against the full source.
     jieba.calc()