def __cut_DAG(sentence):
    """Cut *sentence* into words along the best DAG route.

    Runs of single characters are buffered; when a multi-character
    dictionary word is reached (or at end of input) the buffer is
    re-segmented with the HMM model (finalseg.cut) so out-of-dictionary
    words can still be recognized.

    Yields each word as a unicode string.
    """
    DAG = get_DAG(sentence)
    route = {}
    calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        # route[x][1] is the end index of the best word starting at x.
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            # Single character: accumulate; neighbours may form a new word.
            buf += l_word
        else:
            if buf:
                if len(buf) == 1:
                    yield buf
                else:
                    # Re-cut the buffered run with the HMM segmenter.
                    # (fixes misspelled local 'regognized')
                    for t in finalseg.cut(buf):
                        yield t
                buf = u''  # single reset instead of one per branch
            yield l_word
        x = y
    # Flush any trailing buffered single characters.
    if buf:
        if len(buf) == 1:
            yield buf
        else:
            for t in finalseg.cut(buf):
                yield t
def __cut_DAG(sentence):
    """Generator: segment *sentence* using the precomputed DAG route.

    Multi-character dictionary words are yielded directly; consecutive
    single characters are collected in a buffer and handed to the HMM
    segmenter (finalseg.cut) so unseen words are not lost.
    """
    DAG = get_DAG(sentence)
    route = {}
    calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1  # end of the best word starting at x
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word  # buffer lone characters
        else:
            if buf:  # truthiness instead of len(buf) > 0
                if len(buf) == 1:
                    yield buf
                else:
                    # fixed misspelled local 'regognized'
                    recognized = finalseg.cut(buf)
                    for t in recognized:
                        yield t
                buf = u''  # one reset covers both branches
            yield l_word
        x = y
    # Emit whatever single characters remain buffered at the end.
    if buf:
        if len(buf) == 1:
            yield buf
        else:
            recognized = finalseg.cut(buf)
            for t in recognized:
                yield t
def cosine_similarity(s1, s2):
    """Return the cosine similarity of two sentences, in [0, 1].

    Non-unicode inputs are lowercased and decoded as UTF-8. Both strings
    are stripped of parentheses (ASCII and full-width), whitespace and
    '#', then segmented with finalseg.cut. The vectors are binary:
    a term scores 1 if it occurs anywhere in the cleaned string.
    Returns 0 when either vector has zero magnitude (e.g. empty input).
    """
    if not isinstance(s1, unicode):
        s1 = s1.lower().decode('utf-8')
    if not isinstance(s2, unicode):
        s2 = s2.lower().decode('utf-8')
    # Compile once and reuse for both strings (was two re.sub calls).
    punct = re.compile(u'\(|\)|(|)| |\s+|\#')
    s1 = punct.sub(u'', s1)
    s2 = punct.sub(u'', s2)
    t1 = list(finalseg.cut(s1))
    t2 = list(finalseg.cut(s2))
    terms = list(set(t1 + t2))  # was list(set([i for i in t1 + t2]))
    v1 = [0] * len(terms)
    v2 = [0] * len(terms)
    for i in xrange(len(terms)):
        # NOTE: substring membership, not token counts — a term embedded
        # in a longer word still scores; kept for behavior compatibility.
        if terms[i] in s1:
            v1[i] += 1
        if terms[i] in s2:
            v2[i] += 1
    # Dot product and squared magnitudes via sum/zip (was manual loops).
    sum_xy = sum(a * b for a, b in zip(v1, v2))
    sum_x2 = sum(a * a for a in v1)
    sum_y2 = sum(b * b for b in v2)
    if sum_x2 == 0 or sum_y2 == 0:
        return 0
    return sum_xy / (math.sqrt(sum_x2) * math.sqrt(sum_y2))
def cosine_similarity(s1, s2):
    """Cosine similarity between two segmented sentences.

    Inputs that are not unicode are lowercased and UTF-8 decoded.
    Parentheses (half- and full-width), whitespace and '#' are removed
    before segmentation with finalseg.cut. Term vectors are binary
    (presence test against the cleaned raw string, not token counts).
    Returns 0 for zero-magnitude vectors; removed the '#retur' dead
    comment that duplicated the return line.
    """
    if not isinstance(s1, unicode):
        s1 = s1.lower().decode('utf-8')
    if not isinstance(s2, unicode):
        s2 = s2.lower().decode('utf-8')
    s1 = re.sub(u'\(|\)|(|)| |\s+|\#', u'', s1)
    s2 = re.sub(u'\(|\)|(|)| |\s+|\#', u'', s2)
    t1 = list(finalseg.cut(s1))
    t2 = list(finalseg.cut(s2))
    terms = list(set(t1 + t2))  # was set([i for i in t1+t2]) — redundant
    v1 = [0] * len(terms)
    v2 = [0] * len(terms)
    for i in xrange(len(terms)):
        # Presence in the raw cleaned string; substring hits count too.
        if terms[i] in s1:
            v1[i] += 1
        if terms[i] in s2:
            v2[i] += 1
    sum_xy = 0
    sum_x2 = 0
    sum_y2 = 0
    for j in xrange(len(terms)):
        sum_xy += v1[j] * v2[j]
        sum_x2 += v1[j] ** 2
        sum_y2 += v2[j] ** 2
    # Guard against division by zero on empty / fully-stripped input.
    if sum_x2 == 0 or sum_y2 == 0:
        return 0
    return sum_xy / (math.sqrt(sum_x2) * math.sqrt(sum_y2))
def __cut_DAG(sentence):
    """Segment *sentence*: build the word DAG by walking the trie, pick
    the best route with calc(), then emit words, re-cutting runs of
    single characters with the HMM segmenter (finalseg.cut).

    Yields each word as a unicode string.
    """
    N = len(sentence)
    # --- Build DAG: DAG[i] = end indices of dictionary words starting
    # at position i. '""' in a trie node marks a complete word. ---
    i, j = 0, 0
    p = trie
    DAG = {}
    while i < N:
        c = sentence[j]
        if c in p:
            p = p[c]
            if "" in p:
                DAG.setdefault(i, []).append(j)  # was manual key check
            j += 1
            if j >= N:
                i += 1
                j = i
                p = trie
        else:
            # Mismatch: restart matching from the next start position.
            p = trie
            i += 1
            j = i
    # Every position is at least a single-character "word".
    for i in xrange(len(sentence)):
        if i not in DAG:  # was 'not i in DAG'
            DAG[i] = [i]
    route = {}
    calc(sentence, DAG, 0, route=route)
    # --- Walk the best route, buffering lone characters for the HMM ---
    x = 0
    buf = u""
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if buf:
                if len(buf) == 1:
                    yield buf
                else:
                    # fixed misspelled local 'regognized'
                    for t in finalseg.cut(buf):
                        yield t
                buf = u""
            yield l_word
        x = y
    # Flush trailing buffered characters.
    if buf:
        if len(buf) == 1:
            yield buf
        else:
            for t in finalseg.cut(buf):
                yield t
def __cut_DAG(sentence):
    """Trie-based DAG segmentation of *sentence*.

    Phase 1 scans the trie to record, for each start index, the end
    indices of all dictionary words ('' in a node marks word end).
    Phase 2 follows the route chosen by calc(), yielding words and
    passing buffered single-character runs through finalseg.cut.
    """
    N = len(sentence)
    i, j = 0, 0
    p = trie
    DAG = {}
    while i < N:
        c = sentence[j]
        if c in p:
            p = p[c]
            if '' in p:  # complete word ends at j
                DAG.setdefault(i, []).append(j)  # was manual key check
            j += 1
            if j >= N:
                # Ran off the end: restart from the next start index.
                i += 1
                j = i
                p = trie
        else:
            p = trie
            i += 1
            j = i
    # Guarantee every index has at least the single-char fallback.
    for i in xrange(len(sentence)):
        if i not in DAG:  # was 'not i in DAG'
            DAG[i] = [i]
    route = {}
    calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word  # collect lone characters
        else:
            if buf:
                if len(buf) == 1:
                    yield buf
                else:
                    # fixed misspelled local 'regognized'
                    recognized = finalseg.cut(buf)
                    for t in recognized:
                        yield t
                buf = u''  # single reset for both branches
            yield l_word
        x = y
    if buf:
        if len(buf) == 1:
            yield buf
        else:
            recognized = finalseg.cut(buf)
            for t in recognized:
                yield t
def __cut_DAG(sentence):
    """Segment *sentence* along the best DAG route with a FREQ check.

    Buffered single-character runs are handled three ways: a lone char
    is yielded as-is; a run not in FREQ goes through the HMM segmenter
    (finalseg.cut); a run that IS a known dictionary entry is emitted
    character by character.
    """
    DAG = get_DAG(sentence)
    route = {}
    calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u""
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if buf:  # was len(buf) > 0
                if len(buf) == 1:
                    yield buf
                elif buf not in FREQ:  # was 'not (buf in FREQ)'
                    # Unknown run: let the HMM model split it.
                    # (fixed misspelled local 'regognized')
                    for t in finalseg.cut(buf):
                        yield t
                else:
                    # Known entry: emit each character individually.
                    for elem in buf:
                        yield elem
                buf = u""  # single reset instead of one per branch
            yield l_word
        x = y
    # Flush the tail buffer with the same three-way policy.
    if buf:
        if len(buf) == 1:
            yield buf
        elif buf not in FREQ:
            for t in finalseg.cut(buf):
                yield t
        else:
            for elem in buf:
                yield elem
def __cut_DAG(sentence):
    """Yield the words of *sentence* following the best DAG route.

    Consecutive single characters are held back and expanded once a
    multi-character word (or the end of input) is reached: a lone char
    is emitted as-is, a run listed in FREQ is split per character, and
    any other run is re-segmented by the HMM model (finalseg.cut).
    """
    def _expand(chars):
        # Turn a buffered run of single characters into output words.
        if len(chars) == 1:
            return [chars]
        if chars in FREQ:
            return list(chars)  # known entry: one word per character
        return finalseg.cut(chars)  # unknown run: HMM re-segmentation

    DAG = get_DAG(sentence)
    route = {}
    calc(sentence, DAG, 0, route=route)
    pos = 0
    pending = u''
    total = len(sentence)
    while pos < total:
        end = route[pos][1] + 1
        word = sentence[pos:end]
        if end - pos == 1:
            pending += word
        else:
            if pending:
                for piece in _expand(pending):
                    yield piece
                pending = u''
            yield word
        pos = end
    if pending:
        for piece in _expand(pending):
            yield piece
def post_nlp():
    """Dispatch a segmentation request by its 'method' field.

    Expects request JSON shaped {"action": ..., "data": {"method": ...,
    "text": ...}}. Supported methods: 'hmm_participle', 'default',
    'seg_hmm', 'ner_hmm'. Returns a JSON response with the cut result.

    Fix: previously an unrecognized method fell off the end of the
    function and returned None, which Flask turns into a 500 error;
    now an explicit error response is returned.
    """
    verification(request)
    rejs = request.json
    data = rejs.get('data')
    action = rejs.get('action')  # read but unused by current dispatch
    method = data.get('method')
    if method == 'hmm_participle':
        cut_result = participle_text(data.get('text'))
        return jsonify({'result': 'success', 'cut_result': cut_result})
    if method == 'default':
        sentence = data.get('text')
        seg_list = finalseg.cut(sentence)
        res = "/ ".join(seg_list)
        return jsonify({'result': 'success', 'cut_result': res})
    if method == 'seg_hmm':
        sentence = data.get('text')
        seg_str = cut_ner.cut_sentence(sentence)
        return jsonify({'result': 'success', 'cut_result': seg_str})
    if method == 'ner_hmm':
        sentence = data.get('text')
        ner_cut = cut_ner.cut_ansj(sentence)
        return jsonify({'result': 'success', 'cut_result': ner_cut})
    # Fallback for unknown methods (robustness fix).
    return jsonify({'result': 'error', 'message': 'unknown method'})
#encoding=utf-8 import finalseg sentence_list = ["姚晨和老凌离婚了", "他说的确实在理", "长春市长春节讲话"] print u"=默认效果" for sentence in sentence_list: seg_list = finalseg.cut(sentence) print "/ ".join(seg_list) print u"\n=打开新词发现功能后的效果\n" for sentence in sentence_list: seg_list = finalseg.cut(sentence, find_new_word=True) print "/ ".join(seg_list)
#encoding=utf-8 import finalseg sentence_list = [ "姚晨和老凌离婚了", "他说的确实在理", "长春市长春节讲话" ] print u"=默认效果" for sentence in sentence_list: seg_list = finalseg.cut(sentence) print "/ ".join(seg_list) print u"\n=打开新词发现功能后的效果\n" for sentence in sentence_list: seg_list = finalseg.cut(sentence,find_new_word=True) print "/ ".join(seg_list)
def cut(self, sentence):
    """Segment *sentence* with finalseg and join the tokens with '/ '."""
    return "/ ".join(finalseg.cut(sentence))