def f(text):
    """Summarize *text*: return the top 5 TextRank-ranked sentences.

    Splits the text into sentences, segments each sentence into words,
    filters stopwords, ranks the sentences with TextRank, and collects
    the highest-ranked sentences (stripped of surrounding whitespace).
    """
    sentences = sentence.get_sentences(text)
    doc = []
    for sent in sentences:
        words = segment.seg(sent)
        words = swfilter.filter(words)
        doc.append(words)
    rank = TextRank(doc)
    rank.solve()
    ret = []
    for index in rank.top_index(limit=5):
        ret.append(sentences[index].strip())
    # BUG FIX: the original built `ret` but never returned it, making the
    # whole function a no-op for its caller.
    return ret
def handle(self, doc):
    """Segment *doc* into words and strip stopwords."""
    return swfilter.filter(segment.seg(doc))
def text_preprocess(text):
    """Normalize, clean, split, and segment a list of sentences.

    Pipeline: character-level replacements, ordered regex cleanup,
    splitting of sentences longer than ``max_len`` characters at
    punctuation, then word segmentation with ``\\start`` / ``\\end``
    markers.  Words that are neither Chinese nor whitelisted in
    ``allowed_words`` are dropped.
    """
    allowed_words = ['.', ',', '?', '!', '\\', '-', 'n', ';', ' ', 'W']

    # --- 1. character-level normalization --------------------------------
    for sen_id, sentance in enumerate(text):
        text[sen_id] = (sentance.replace('\n', '.')
                                .replace('\t', '.')
                                .replace('。', '.')
                                .replace(',', ',')
                                .replace('?', '?'))

    # --- 2. regex cleanup, applied in this exact order -------------------
    cleanup_rules = [
        (re.compile('[a-zA-Z]+'), ''),    # drop English letters
        (re.compile(r'\[.*?\]'), ''),     # drop [...] spans
        (re.compile(r'\.+'), '.'),        # collapse runs of '.'
        (re.compile(r' +'), '.'),         # spaces -> '.'
        (re.compile(r'…+'), '.'),         # ellipsis -> '.'
        (re.compile(r'\\r'), ''),         # drop literal "\r" sequences
        (re.compile('[0-9]+'), 'n'),      # digits -> placeholder 'n'
        (re.compile(r',+'), ','),         # collapse runs of ','
        (re.compile(r'《.*?》'), ''),      # drop 《...》 spans
        (re.compile(r'。+'), '.'),         # collapse runs of '。'
        (re.compile(r',+'), ','),         # collapse runs of ','
        (re.compile(r'【.*?】'), ''),      # drop 【...】 spans
        (re.compile(r'\.+'), '.'),        # collapse '.' runs once more
    ]
    for pat, repl in cleanup_rules:
        text = [pat.sub(repl, lines) for lines in text]

    # --- 3. split over-long sentences at punctuation ---------------------
    text_id = 0
    max_len = 20
    processed_text = []
    while True:
        if text_id >= len(text):
            break
        if len(text[text_id]) <= max_len:
            text_id += 1
            continue
        for i in range(len(text[text_id]) - max_len):
            if text[text_id][i + max_len] in [',', '.', '?']:
                # Cut at the punctuation mark; append the tail for later.
                text.append(text[text_id][i + max_len + 1:])
                text[text_id] = text[text_id][:i + max_len]
                text_id += 1
                break
            if i == len(text[text_id]) - max_len - 1:
                text_id += 1
                break
            if i == sentence_min_len:
                # Search backwards inside the window for a break point.
                break_sign = 0
                for j in range(max_len - 1, -1, -1):
                    if text[text_id][j] in [',', '.', '。', ',', '\n']:
                        text.append(text[text_id][j + 1:])
                        text[text_id] = text[text_id][:j + 1]
                        text_id += 1
                        break_sign = 1
                        break
                if break_sign:
                    break
                else:
                    # No punctuation found: hard cut at the window edge.
                    text[text_id] = text[text_id][:i + max_len + 1]
                    text_id += 1
                    break

    # --- 4. segment and wrap each sentence with \start / \end ------------
    for sentance in text:
        if if_segment():
            sentance_seg = segment.seg(sentance)
        else:
            sentance_seg = sentance
        sentance_seg2 = ['\start']
        for word in sentance_seg:
            if word in allowed_words or is_chinese(word):
                sentance_seg2.append(word)
        sentance_seg2.append('\end')
        processed_text.append(sentance_seg2)
    return processed_text
return cm,accu def accuracy(cm): row_num = len(cm) right_num = 0 for i in range(row_num): right_num += cm[i][i] total_num = sum(cm.sum(0)) return float(right_num) / float(total_num) if __name__ == "__main__": filename = "/home/chi/PycharmProjects/Seg_AR/data/example" segment.SENSORLIST = tools.getSensorList(filename) true_labels = annotation.origin_annotation(filename) seq_index = [] with open(filename, 'r') as fr: for line in fr: row = line.split() seq_index.append(segment.SENSORLIST.index(row[2])) sizes=range(10,150,5) thetas = [0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5] for size in sizes: for theta in thetas: border_1 = segment.seg(seq_index, size, theta) vote_labels = annotation.seg_labels(true_labels,border_1) cm,accu = confusionmatrix(vote_labels,true_labels) print accu
def select(img):
    """Interactively pick a vessel contour on *img* and one of its parts.

    Runs segmentation, lets the user trace a contour with the mouse,
    splits the traced points into user-chosen parts, and returns the
    selected part together with ``d`` from the segmentation step.
    """
    global ims, para, contour, ps, imggg, C, parts, index, para1, ims1, ps1

    # Segmentation.
    im, cl, d = segment.seg(img)

    # Seed the globals that the mouse callbacks mutate.
    ps = cl.copy()
    imggg = im
    j = imggg.copy()
    ims = imggg.copy()
    C = np.zeros(ims.shape, np.uint8)
    contour = None
    para = True

    # Let the user trace the contour; 'q' finishes.
    cv2.namedWindow('image', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('image', (int(im.shape[1] / 2), int(im.shape[0] / 2)))
    cv2.moveWindow('image', 40, 0)
    cv2.setMouseCallback('image', draw_c)
    while True:
        cv2.imshow('image', ims)
        k = cv2.waitKey(1) & 0xFF
        if k == ord('q'):
            break
    cv2.destroyAllWindows()

    # Collect every drawn contour point.
    Cp = cv2.cvtColor(C, cv2.COLOR_BGR2GRAY)
    point = cv2.findNonZero(Cp)

    num_parts = input(
        "In how many parts you want to divide the selected vessel (Please enter an integer <=5): "
    )
    print("Select the required part and press Q.")

    # Color each part for display (white beyond the 5 preset colors).
    parts = np.array_split(point, int(num_parts), axis=0)
    colbgr = [(193, 182, 255), (255, 0, 102), (255, 128, 0), (0, 255, 255),
              (10, 200, 10)]
    Cparts = np.zeros(C.shape)
    for i, part in enumerate(parts):
        color = (255, 255, 255) if i >= 5 else colbgr[i]
        cv2.drawContours(Cparts, part, -1, color, 5)

    # Globals for the part-selection callback.
    para1 = True
    ps1 = Cparts
    ims1 = ps1.copy()
    parts = np.array_split(point, int(num_parts), axis=0)

    # Outside the supported 1..10 range, fall back to a single part.
    if int(num_parts) <= 1 or int(num_parts) > 10:
        num_parts = 1
        C_parts_selected = parts[0]
        return (C_parts_selected, d)

    # Otherwise let the user click the wanted part; 'q' finishes.
    cv2.namedWindow('image1', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('image1', (int(im.shape[1] / 2), int(im.shape[0] / 2)))
    cv2.moveWindow('image1', 40, 0)
    cv2.setMouseCallback('image1', draw_part)
    while True:
        cv2.imshow('image1', ims1)
        k = cv2.waitKey(1) & 0xFF
        if k == ord('q'):
            break
    cv2.destroyAllWindows()

    C_parts_selected = parts[index]
    return C_parts_selected, d
def handle(self, doc):
    """Segment *doc* into a list of words (stopword filtering disabled)."""
    segmented = segment.seg(doc)
    # swfilter.filter(...) is intentionally not applied in this variant.
    return list(segmented)
#!/usr/bin/env python3
"""Read syllable strings from stdin until '#', printing each segmentation."""
import segment as sg

inputSylla = ""
output = ""

inputSylla = input()
while inputSylla != "#":
    # Print every segment except the trailing one, then a newline.
    pieces = sg.seg(inputSylla)
    for piece in pieces[:-1]:
        print(piece, end="")
    print("")
    inputSylla = input()
# NOTE(review): whitespace-mangled interactive wubi-style input loop.  It is
# truncated mid-iteration (the `while it >= 0 and flag:` loop's control
# updates never appear in this chunk), so the code is left byte-identical
# rather than reconstructed.  Presumably `on_screen` maps digit choices to
# (word, consumed-length) pairs and `sg.wubi`/`sg.uni_map`/`sg.wd_map` are
# the segmenter's code tables -- TODO confirm against the full file.
#print(sg.seg("wygdgdrndhqa")) while True: #read input if inputSylla[-1] == "#": inputSylla = "" inp_ch = input("Please input char:") if inp_ch != "" and inp_ch[0].isdigit(): #if input is a number print("Choose :" + on_screen[int(inp_ch)][0] + str(on_screen[int(inp_ch)][1])) output += on_screen[int(inp_ch)][0] inputSylla = inputSylla[on_screen[int(inp_ch)][1]:] else: #update input Sylla inputSylla += inp_ch print(sg.seg(inputSylla)) #max incorrection pattern screen = [] it = 4 counter = 0 flag = True while it >= 0 and flag: tp_list = [] for wubi_line in sg.wubi: if wubi_line[0].startswith(inputSylla[0:it]): wubi_code = int(wubi_line[1]) if wubi_code in sg.uni_map: tp_list.append([sg.wd_map[wubi_code],it,sg.uni_map[wubi_code]]) counter += 1
pg.pg_init()

# Merge the two attributes of each education record.
'''for ele in segment.seg(pg.get_edu()):
    print len(ele)'''

# Test: dump segmented education records to the output file.
'''for mlist in segment.seg(pg.get_edu()):
    for ele in mlist:
        for x in ele:
            output_file.write(x+' ')
        output_file.write('\t||\t')
    output_file.write('\n')
'''

# Build the LSI topic model over the merged education fields.
raw = list()
people = pg.get_edu(0, 0)
raw = map(lambda x: x[0] + x[1], segment.seg(people))
topic_model.build_lsi(raw)

'''topic_model._build_corpus(raw)
tfidf = topic_model._build_tfidf()
corpus_tfidf = tfidf[topic_model.corpus]
for ele in corpus_tfidf:
    print ele
'''

'''for ele in topic_model.dictionary.token2id:
    output_file.write(ele+' ')'''
# -*- encoding:"utf-8"-*- # coding=utf-8 import tools import segment filename = "G:/Seg_AR/data/annotated" segment.SENSORLIST = tools.getSensorList(filename) seq_index = [] with open(filename, "r") as fr: for line in fr: row = line.split() seq_index.append(segment.SENSORLIST.index(row[2])) print len(seq_index) borders = segment.seg(seq_index) print len(borders) print borders
from __future__ import absolute_import import codecs import segment import pg output_file = codecs.open("data/seg_data1.txt", 'wb', encoding='utf-8') pg.pg_init() # print segment.seg(pg.get_edu()) bias = 1000 counter = 0 raw = '1' while len(raw) != 0: raw = pg.get_edu(bias, counter) for mlist in segment.seg(raw): for ele in mlist: for x in ele: output_file.write(x+' ') output_file.write('\t') output_file.write('\n') counter += 1 '''raw = pg.get_edu(bias, counter) for mlist in segment.seg(raw): for ele in mlist: for x in ele: output_file.write(x+' ') output_file.write('\t||\t') output_file.write('\n')'''