예제 #1
0
파일: __init__.py 프로젝트: 5kg/renren
def __cut_DAG(sentence):
    """Yield the segments of ``sentence`` along the route filled in by ``calc``.

    ``get_DAG`` builds the graph for the sentence and ``calc`` populates
    ``route``; ``route[pos][1]`` is read as the index of the last character
    of the segment that starts at ``pos``.  Segments longer than one
    character are yielded unchanged.  Consecutive one-character segments are
    accumulated and, when a longer segment or the end of the sentence is
    reached, emitted either directly (a single character) or through
    ``finalseg.cut`` (two or more characters).
    """
    route = {}
    calc(sentence, get_DAG(sentence), 0, route=route)
    pending = u''  # run of one-character segments not yet emitted
    start = 0
    length = len(sentence)
    while start < length:
        stop = route[start][1] + 1
        if stop - start == 1:
            pending += sentence[start:stop]
        else:
            if pending:
                if len(pending) > 1:
                    for piece in finalseg.cut(pending):
                        yield piece
                else:
                    yield pending
                pending = u''
            yield sentence[start:stop]
        start = stop

    # Emit whatever one-character segments are still buffered at the end.
    if pending:
        if len(pending) > 1:
            for piece in finalseg.cut(pending):
                yield piece
        else:
            yield pending
예제 #2
0
def __cut_DAG(sentence):
    """Generate the segments of ``sentence`` chosen by ``calc``.

    The route table produced by ``calc`` gives, for each start index, the
    inclusive end index of the segment beginning there (element ``[1]`` of
    the entry).  Multi-character segments are yielded as they are; runs of
    adjacent one-character segments are collected first and then either
    yielded directly (run of length one) or re-segmented with
    ``finalseg.cut`` (longer runs).
    """
    dag = get_DAG(sentence)
    best = {}
    calc(sentence, dag, 0, route=best)

    singles = u''  # collected one-character segments
    pos = 0
    end = len(sentence)
    while pos < end:
        nxt = best[pos][1] + 1
        chunk = sentence[pos:nxt]
        is_single = (nxt - pos == 1)
        pos = nxt
        if is_single:
            singles += chunk
            continue

        # A multi-character segment ends the current run of singles.
        if singles:
            if len(singles) == 1:
                yield singles
            else:
                for token in finalseg.cut(singles):
                    yield token
            singles = u''
        yield chunk

    # Trailing run of singles, if any.
    if not singles:
        return
    if len(singles) == 1:
        yield singles
    else:
        for token in finalseg.cut(singles):
            yield token
예제 #3
0
def cosine_similarity(s1, s2):
    """Return the cosine similarity of two strings over their finalseg terms.

    Byte strings are lower-cased and then decoded as UTF-8; unicode input is
    used as given (it is not lower-cased).  Parentheses, whitespace and ``#``
    are stripped from both strings, each is segmented with ``finalseg.cut``,
    and the union of the resulting tokens forms the vocabulary.  Each string
    is then represented by a 0/1 vector marking which vocabulary terms occur
    in it as a substring of the cleaned string.

    Returns:
        ``0`` when either vector is all zeros (for example, empty input);
        otherwise the dot product divided by the product of the vector
        norms, as a float.
    """
    if not isinstance(s1, unicode):
        s1 = s1.lower().decode('utf-8')
    if not isinstance(s2, unicode):
        s2 = s2.lower().decode('utf-8')

    # The previous pattern  \(|\)|(|)| |\s+|\#  contained an empty group
    # "(|)" that matched the empty string at every position, so the later
    # alternatives (space, \s+, #) could never match and only ASCII
    # parentheses were actually removed.  A character class has no such
    # ordering trap.
    # NOTE(review): the full-width parentheses (U+FF08/U+FF09) and the
    # ideographic space (U+3000) are an assumption about what the empty
    # group and the bare space originally were -- confirm.
    noise = re.compile(u'[()\uff08\uff09\u3000\\s#]+')
    s1 = noise.sub(u'', s1)
    s2 = noise.sub(u'', s2)

    vocabulary = set(finalseg.cut(s1))
    vocabulary.update(finalseg.cut(s2))
    terms = list(vocabulary)

    # Presence is tested against the cleaned string (substring match), not
    # against the token list, exactly as before.
    v1 = [1 if term in s1 else 0 for term in terms]
    v2 = [1 if term in s2 else 0 for term in terms]

    sum_xy = sum(a * b for a, b in zip(v1, v2))
    sum_x2 = sum(a * a for a in v1)
    sum_y2 = sum(b * b for b in v2)

    # Guard against division by zero when a vector has no set entries.
    if sum_x2 == 0 or sum_y2 == 0:
        return 0
    return sum_xy / (math.sqrt(sum_x2) * math.sqrt(sum_y2))
예제 #4
0
def cosine_similarity(s1, s2):
    """Compute a cosine similarity between two strings from finalseg tokens.

    Non-unicode arguments are lower-cased and decoded from UTF-8; unicode
    arguments are left untouched.  After removing parentheses, whitespace
    and ``#``, both strings are segmented with ``finalseg.cut``.  For every
    distinct token, each string gets a 1 if the token occurs in it as a
    substring and a 0 otherwise; the similarity is the cosine of those two
    binary vectors.

    Returns:
        ``0`` if either string matches no token at all, otherwise a float.
    """
    if not isinstance(s1, unicode):
        s1 = s1.lower().decode('utf-8')
    if not isinstance(s2, unicode):
        s2 = s2.lower().decode('utf-8')

    # Fix: the old alternation  \(|\)|(|)| |\s+|\#  had an empty group "(|)"
    # ahead of the space / \s+ / # alternatives.  The empty group succeeds
    # at every position, so those later alternatives were unreachable and
    # whitespace and '#' were never stripped.
    # NOTE(review): U+FF08/U+FF09 (full-width parentheses) and U+3000
    # (ideographic space) are included on the assumption that the empty
    # group and the bare space originally held them -- confirm.
    strip_chars = re.compile(u'[()\uff08\uff09\u3000\\s#]+')
    s1 = strip_chars.sub(u'', s1)
    s2 = strip_chars.sub(u'', s2)

    terms = set(finalseg.cut(s1)) | set(finalseg.cut(s2))

    # The vectors are binary, so the dot product is the number of terms
    # found in both strings and each squared norm is the number of terms
    # found in that string.
    sum_xy = 0
    sum_x2 = 0
    sum_y2 = 0
    for term in terms:
        in_first = term in s1
        in_second = term in s2
        if in_first:
            sum_x2 += 1
        if in_second:
            sum_y2 += 1
        if in_first and in_second:
            sum_xy += 1

    # Avoid dividing by zero when one side has an all-zero vector.
    if sum_x2 == 0 or sum_y2 == 0:
        return 0
    return sum_xy / (math.sqrt(sum_x2) * math.sqrt(sum_y2))
예제 #5
0
def __cut_DAG(sentence):
    """Segment ``sentence`` using the module-level ``trie`` and ``calc``.

    Step 1 scans the sentence against ``trie``: for every start index it
    walks forward character by character while the characters are found in
    the current trie node, and whenever the node reached contains the key
    ``""`` the current index is recorded as a possible segment end for that
    start.  Start indices with no recorded end get themselves as their only
    end.

    Step 2 lets ``calc`` fill ``route`` and follows it: ``route[pos][1]`` is
    used as the inclusive end of the segment starting at ``pos``.
    Multi-character segments are yielded directly; runs of one-character
    segments are yielded as is when the run has length one and passed
    through ``finalseg.cut`` otherwise.
    """
    length = len(sentence)
    dag = {}
    start = scan = 0
    node = trie
    while start < length:
        char = sentence[scan]
        if char in node:
            node = node[char]
            if "" in node:
                dag.setdefault(start, []).append(scan)
            scan += 1
            if scan < length:
                continue
        # Either the character was not in the trie node or the scan ran off
        # the end of the sentence: restart from the next start index.
        start += 1
        scan = start
        node = trie

    for index in xrange(len(sentence)):
        dag.setdefault(index, [index])

    route = {}
    calc(sentence, dag, 0, route=route)

    pending = u""  # run of one-character segments not yet emitted
    pos = 0
    while pos < length:
        stop = route[pos][1] + 1
        if stop - pos == 1:
            pending += sentence[pos:stop]
        else:
            if pending:
                if len(pending) > 1:
                    for piece in finalseg.cut(pending):
                        yield piece
                else:
                    yield pending
                pending = u""
            yield sentence[pos:stop]
        pos = stop

    if pending:
        if len(pending) > 1:
            for piece in finalseg.cut(pending):
                yield piece
        else:
            yield pending
예제 #6
0
def __cut_DAG(sentence):
    """Yield segments of ``sentence`` from a trie scan followed by ``calc``.

    For each start index the characters are looked up in ``trie`` one level
    at a time; every index at which the reached node holds the key ``''`` is
    stored as a candidate end for that start.  Starts without any candidate
    fall back to a one-character candidate.  ``calc`` then fills ``route``,
    whose entries give (at position ``[1]``) the inclusive end index of the
    segment to take from each start.

    Segments longer than one character are yielded unchanged.  Adjacent
    one-character segments are grouped; a group of one is yielded directly,
    a longer group is handed to ``finalseg.cut`` and its tokens are yielded.
    """
    total = len(sentence)
    graph = {}
    for begin in xrange(total):
        node = trie
        for end in xrange(begin, total):
            char = sentence[end]
            if char not in node:
                break
            node = node[char]
            if '' in node:
                if begin not in graph:
                    graph[begin] = []
                graph[begin].append(end)

    # Kept as a separate pass so keys are inserted in the same order as
    # before (starts with candidates first, fall-backs afterwards).
    for begin in xrange(len(sentence)):
        if begin not in graph:
            graph[begin] = [begin]

    route = {}
    calc(sentence, graph, 0, route=route)

    singles = u''
    pos = 0
    while pos < total:
        nxt = route[pos][1] + 1
        chunk = sentence[pos:nxt]
        is_single = (nxt - pos == 1)
        pos = nxt
        if is_single:
            singles += chunk
            continue
        if singles:
            if len(singles) == 1:
                yield singles
            else:
                for token in finalseg.cut(singles):
                    yield token
            singles = u''
        yield chunk

    if not singles:
        return
    if len(singles) == 1:
        yield singles
    else:
        for token in finalseg.cut(singles):
            yield token
예제 #7
0
파일: __init__.py 프로젝트: unbuilt/jieba
def __cut_DAG(sentence):
    """Yield the segments of ``sentence`` selected by ``calc``.

    ``route[pos][1]`` (filled in by ``calc``) is used as the inclusive end
    index of the segment starting at ``pos``.  Multi-character segments are
    yielded unchanged.  Consecutive one-character segments are buffered and
    flushed when a longer segment or the end of the sentence is reached:

    * a buffer of one character is yielded as is;
    * a longer buffer that is a key of ``FREQ`` is yielded one character at
      a time;
    * any other longer buffer is re-segmented with ``finalseg.cut``.
    """
    graph = get_DAG(sentence)
    route = {}
    calc(sentence, graph, 0, route=route)

    loose = u""  # consecutive one-character segments awaiting emission
    cursor = 0
    limit = len(sentence)
    while cursor < limit:
        boundary = route[cursor][1] + 1
        segment = sentence[cursor:boundary]
        if boundary - cursor == 1:
            loose += segment
        else:
            if len(loose) == 1:
                yield loose
            elif loose:
                if loose in FREQ:
                    for char in loose:
                        yield char
                else:
                    for token in finalseg.cut(loose):
                        yield token
            loose = u""
            yield segment
        cursor = boundary

    # Flush the buffer left over after the last segment.
    if len(loose) == 1:
        yield loose
    elif loose:
        if loose in FREQ:
            for char in loose:
                yield char
        else:
            for token in finalseg.cut(loose):
                yield token
예제 #8
0
def __cut_DAG(sentence):
    """Generate segments of ``sentence`` by following the ``calc`` route.

    For each position, ``route[pos][1]`` is taken as the inclusive end of
    the segment starting there.  Segments of more than one character are
    yielded directly.  One-character segments are gathered into a buffer
    that is flushed before the next longer segment and once more at the end
    of the sentence.  Flushing yields the buffer itself when it holds one
    character, its individual characters when the buffer is found in
    ``FREQ``, and the tokens of ``finalseg.cut`` otherwise.
    """
    dag = get_DAG(sentence)
    route = {}
    calc(sentence, dag, 0, route=route)

    singles = u''
    total = len(sentence)
    pos = 0
    while True:
        finished = pos >= total
        if not finished:
            nxt = route[pos][1] + 1
            word = sentence[pos:nxt]
            if nxt - pos == 1:
                singles += word
                pos = nxt
                continue

        # Reached a multi-character word or the end of the sentence:
        # emit the buffered one-character segments first.
        if singles:
            if len(singles) == 1:
                yield singles
            elif singles in FREQ:
                for char in singles:
                    yield char
            else:
                for token in finalseg.cut(singles):
                    yield token
            singles = u''

        if finished:
            break
        yield word
        pos = nxt
예제 #9
0
def post_nlp():
    """Handle a segmentation request and return a JSON response.

    The request body is read from ``request.json`` and is expected to be an
    object with a ``data`` object holding ``method`` and ``text``.  The
    ``method`` value selects the segmenter:

    * ``hmm_participle`` -> ``participle_text(text)``
    * ``default``        -> ``finalseg.cut(text)`` joined with ``"/ "``
    * ``seg_hmm``        -> ``cut_ner.cut_sentence(text)``
    * ``ner_hmm``        -> ``cut_ner.cut_ansj(text)``

    Returns:
        ``{'result': 'success', 'cut_result': ...}`` on success.  A body
        without a ``data`` object, or an unsupported ``method``, yields
        ``{'result': 'error', 'message': ...}`` with HTTP status 400.
        Previously these cases raised ``AttributeError`` or returned
        ``None``, which is not a valid view return value.
    """
    verification(request)

    payload = request.json
    data = payload.get('data') if isinstance(payload, dict) else None
    if not isinstance(data, dict):
        return jsonify({'result': 'error',
                        'message': 'request body must contain a "data" object'}), 400

    method = data.get('method')
    text = data.get('text')

    if method == 'hmm_participle':
        cut_result = participle_text(text)
    elif method == 'default':
        cut_result = "/ ".join(finalseg.cut(text))
    elif method == 'seg_hmm':
        cut_result = cut_ner.cut_sentence(text)
    elif method == 'ner_hmm':
        cut_result = cut_ner.cut_ansj(text)
    else:
        # Without this branch the view fell through and returned None.
        return jsonify({'result': 'error',
                        'message': 'unsupported method'}), 400

    return jsonify({'result': 'success', 'cut_result': cut_result})
예제 #10
0
#encoding=utf-8
import finalseg

# Sample sentences fed to finalseg.cut in both passes below.
sentence_list = [
    "姚晨和老凌离婚了",
    "他说的确实在理",
    "长春市长春节讲话",
]

# Each pass is (heading to print, keyword arguments for finalseg.cut):
# first with the default options, then with find_new_word=True.
passes = (
    (u"=默认效果", {}),
    (u"\n=打开新词发现功能后的效果\n", {"find_new_word": True}),
)

for heading, cut_options in passes:
    print(heading)
    for sentence in sentence_list:
        seg_list = finalseg.cut(sentence, **cut_options)
        print("/ ".join(seg_list))
예제 #11
0
파일: test.py 프로젝트: Masterlvng/finalseg
#encoding=utf-8
import finalseg

# Demo input: each sentence is segmented twice, once per pass below.
sentence_list = ["姚晨和老凌离婚了", "他说的确实在理", "长春市长春节讲话"]

# Pass 1: finalseg.cut with its default options.
print(u"=默认效果")
for sentence in sentence_list:
    seg_list = finalseg.cut(sentence)
    print("/ ".join(seg_list))

# Pass 2: the same sentences with find_new_word=True.
print(u"\n=打开新词发现功能后的效果\n")
for sentence in sentence_list:
    seg_list = finalseg.cut(sentence, find_new_word=True)
    print("/ ".join(seg_list))

예제 #12
0
 def cut(self, sentence):
     """Segment ``sentence`` with ``finalseg.cut`` and join the pieces with "/ "."""
     return "/ ".join(finalseg.cut(sentence))