class Address_Acti(object): def __init__(self): self.batch = 10000 self.test_batch = 100 if FLAG_INIT_MODEL: self.addr_tree = Trie() self.dict_tree = Trie() self.num_tree = Trie() #self.init_num_hash() self.graph = my_graph.My_Graph() self.init_model() print("init_model ok") pickle_helper.save( os.path.join(SAVE_PATH, SAVE_FILE), [self.addr_tree, self.dict_tree, self.graph, self.num_tree]) print("pickle save ok") if FLAG_LOAD_MODEL: self.addr_tree, self.dict_tree, self.graph, self.num_tree = pickle_helper.load( os.path.join(SAVE_PATH, SAVE_FILE), [1, 2, 3, 4]) print("pickle load ok") def minEditDist(self, sm, sn): m, n = len(sm) + 1, len(sn) + 1 matrix = [[0] * n for i in range(m)] matrix[0][0] = 0 for i in range(1, m): matrix[i][0] = matrix[i - 1][0] + 1 for j in range(1, n): matrix[0][j] = matrix[0][j - 1] + 1 const = 0 for i in range(1, m): for j in range(1, n): if sm[i - 1] == sn[j - 1]: cost = 0 else: cost = 1 matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + cost) return matrix[m - 1][n - 1] def cut_filter(self, src_sent): cmdin = r"区.+?与.+?交叉口[向东]?[\d+米]?路?[东南西北]?" def init_num_hash(self): stand_lines = open(os.path.join(STD_PATH, STD_FILE)).readlines() for line in stand_lines: """ insert address into addr_tree """ """ insert all addr_tree """ line = utils.clr(line) nums = list(re.findall("\d+", line)) self.num_tree.insert_num_lst(nums, hash(line)) def init_model(self): stand_lines = open(os.path.join(STD_PATH, STD_FILE)).readlines() #stand_lines = pd.read_csv(os.path.join(STD_PATH, STD_FILE)).iloc[:,1] #stand_lines = [stand_lines[np.random.randint(len(stand_lines))] for i in range(self.batch)] for line in stand_lines: """ insert address into addr_tree """ """ insert all addr_tree """ line = utils.clr(line) words = list(jieba.cut(line)) nums = list(re.findall("\d+", line)) self.num_tree.insert_num_lst(nums, hash(line)) self.addr_tree.insert_wd_lst(words) for word in words: self.dict_tree.insert(word) def score_num_lst(self, nums1, nums2): if "" in nums1: nums1.remove("") if "" in nums2: nums2.remove("") """ use num to check weather same or not """ lmin = min(len(nums1), len(nums2)) #lmax = max(len(nums1),len(nums2)) cnt = 0 for i, j in zip(nums1[:lmin], nums2[:lmin]): print(i, j) if i == j: cnt += 1 continue break if cnt == lmin and cnt > 0: print(nums1, " equal ", nums2) return (cnt / lmin) * 100 elif cnt > 0: print(nums1, " equal ", nums2) return (cnt / lmin) * 100 else: print(nums1, " not equal ", nums2) return 0.0 def check_num_lst(self, nums1, nums2): if "" in nums1: nums1.remove("") if "" in nums2: nums2.remove("") """ use num to check weather same or not """ lmin = min(len(nums1), len(nums2)) #lmax = max(len(nums1),len(nums2)) cnt = 0 for i, j in zip(nums1[:lmin], nums2[:lmin]): print(i, j) if i == j: cnt += 1 continue break if cnt == lmin and cnt > 0: print(nums1, " equal ", nums2) return True elif cnt > 1: print(nums1, " equal ", nums2) return True else: print(nums1, " not equal ", nums2) return False def check_num(self, line1, line2): """ use num to check weather same or not """ print("判断数字是否一致", line1, line2) cont = re.split("\d+", line1)[0] base = "" if len(cont) > 0: #base = list(jieba.cut(cont))[-1] base = cont[-3:] nums1 = re.findall("\d+", line1) nums2 = re.findall("\d+", line2) print(line1, line2) print(nums1, nums2) lmin = min(len(nums1), len(nums2)) #lmax = max(len(nums1),len(nums2)) cnt = 0 for i, j in zip(nums1[:lmin], nums2[:lmin]): print(i, j) if i == j: cnt += 1 continue break if lmin > 0: if not base + nums1[0] in line2: print("False", line1, line2) return False if cnt == lmin and cnt > 0: return True elif cnt > 0: return True else: print("False", line1, line2) return False def _check_num(self, line1, line2): """ use num to check weather same or not """ base = "" cont = re.split("\d+", line1)[0] if len(cont) > 0: #base = list(jieba.cut(cont))[-1] base = cont[-3:] nums1 = re.findall("\d+", line1) nums2 = re.findall("\d+", line2) print(line1, line2) print(nums1, nums2) lmin = min(len(nums1), len(nums2)) #lmax = max(len(nums1),len(nums2)) cnt = 0 for i, j in zip(nums1[:lmin], nums2[:lmin]): if i == j: cnt += 1 continue break if lmin > 0: if not base + nums1[0] in line2: print("False", base + nums1[0]) return False if cnt == lmin and cnt > 0: return True elif cnt > 1: return True else: print("False", nums1, nums2) return False def check(self, line1, line2): txts = re.split("\d+", line) if len(txts[0]) < 2: return False nums = re.findall("\d+", line) if len(nums) < 1: return False baseline = "" for i in range(2): if i > len(txts) - 1: break elif i == 0: baseline += txts[i][-3:] else: baseline += "\D+" if i > len(nums) - 1: break else: baseline += nums[i] if len(baseline) < 1: return False print(baseline, " weather in ", line2) if len(re.findall(baseline, line2)) > 0: print(baseline, " bingo in ", line2) print(re.findall(baseline, line2)) return True else: return False return False def _check(self, line1, line2): txts = re.split("\d+", line) if len(txts[0]) < 2: return False nums = re.findall("\d+", line) if len(nums) < 1: return False baseline = "" for i in range(2): if i > len(txts) - 1: return baseline elif i == 0: baseline += txts[i][-3:] else: baseline += ".+?" if i > len(nums) - 1: return baseline else: baseline += nums[i] if len(baseline) < 1: return False if len(re.findall(baseline, line2)) > 0: print(baseline, " is in ", line2) return True else: print(baseline, " not in ", line2) return False def word_filter(self, line_pre): res = [] for word in jieba.cut(line_pre): if self.dict_tree.search(word): res.append(word) return res def common_nbs(self, comm_nbs): result = set() if len(comm_nbs) > 1: result = set(comm_nbs[0]) for i in comm_nbs[1:]: if len(result & set(i)) > 0: print("交集", len(result), len(set(i))) result = result & set(i) print(self.graph.sent[list(result)[0]]) else: return result elif len(comm_nbs) == 1: result = comm_nbs[0] if len(result) > 0: print("最终输出过滤后的标准地址", self.graph.sent[list(result)[0]]) return result def _check_num(self, line1, line2): """ use num to check weather same or not """ base = "" cont = re.split("\d+", line1)[0] if len(cont) > 0: #base = list(jieba.cut(cont))[-1] base = cont[-3:] nums1 = re.findall("\d+", line1) nums2 = re.findall("\d+", line2) print(line1, line2) print(nums1, nums2) lmin = min(len(nums1), len(nums2)) #lmax = max(len(nums1),len(nums2)) cnt = 0 for i, j in zip(nums1[:lmin], nums2[:lmin]): if i == j: cnt += 1 continue break if lmin > 0: if not base + nums1[0] in line2: print("False", base + nums1[0]) return False if cnt == lmin and cnt > 0: return True elif cnt > 1: return True else: print("False", nums1, nums2) return False def check(self, line1, line2): txts = re.split("\d+", line) if len(txts[0]) < 2: return False nums = re.findall("\d+", line) if len(nums) < 1: return False baseline = "" for i in range(2): if i > len(txts) - 1: break elif i == 0: baseline += txts[i][-3:] else: baseline += "\D+" if i > len(nums) - 1: break else: baseline += nums[i] if len(baseline) < 1: return False print(baseline, " weather in ", line2) if len(re.findall(baseline, line2)) > 0: print(baseline, " bingo in ", line2) print(re.findall(baseline, line2)) return True else: return False return False def _check(self, line1, line2): txts = re.split("\d+", line) if len(txts[0]) < 2: return False nums = re.findall("\d+", line) if len(nums) < 1: return False baseline = "" for i in range(2): if i > len(txts) - 1: return baseline elif i == 0: baseline += txts[i][-3:] else: baseline += ".+?" if i > len(nums) - 1: return baseline else: baseline += nums[i] if len(baseline) < 1: return False if len(re.findall(baseline, line2)) > 0: print(baseline, " is in ", line2) return True else: print(baseline, " not in ", line2) return False def _query_one(self, line): output, res, score = [], [], [] min_edit_value = 9999 parts = re.split("\d+", line) my_txt = utils.without_num(line) my_num = re.findall("\d+", line) result, res = self.route_text(my_txt, my_num) print("句子集合数目", len(result), "关键词集合", res) return result, res, 0 def editDist(self, line, result): """get the minist edit distance of line with result""" min_edit_value = 999 minist_one = "" for hs in list(result): standard_addr = self.graph.sent[hs] print("比较数字部分文本", utils.get_nums(line), utils.get_nums(standard_addr)) v = self.minEditDist(utils.get_nums(line), utils.get_nums(standard_addr)) if v < min_edit_value: minist_one = hs min_edit_value = v return minist_one def route_text(self, line, lst): print("过滤掉无用文本 ", line, lst) line = utils.clr(str(line)) #line_pre = utils.before_first_num(line) res = self.word_filter(line) print("经过过滤的词条", res) #res.extend(lst) words_route = [] comm_nbs = [] if len(res) == 1: res.extend(res) for i in range(len(res) - 1): print(res) try: #conn = nx.all_shortest_paths(self.graph.tree_di,res[i],res[i+1]) p_node = res[i] a_node = res[i + 1] if len(words_route) == 0: words_route.append(p_node) try: route = nx.shortest_path(self.graph.tree_di, words_route[-1], a_node) print('是否存在最短路径 ', route) words_route.append(a_node) print("add node", i, a_node) #weight = self.graph.tree_di[words_route[-1]][a_node]['weight'] #weight = self.graph.tree_di[words_route[-1]][a_node]['weight'] except: print( "not connect direct, continue, find the next one, utile to the head of words lst" ) print("过滤复杂文本的词条") #words_route = words_route[:-1] #words_route.append(a_node) continue except: print("networkx error") continue #words_route = words_route[::-1] print("复杂文本", res) print("过滤输出", words_route) if " " in words_route: words_route.remove(" ") if len(words_route) > 0: words_route.insert(0, words_route[0]) for i in range(len(words_route)): try: comm_nbs.extend( list(nx.all_neighbors(self.graph.di, words_route[i]))) except: print("添加邻居出错") print("所有的邻居都添加到列表中,等待计算") print("列表中共有多少个item", len(comm_nbs)) cnt_lst = collections.Counter(comm_nbs) sorted_lst = sorted(cnt_lst.items(), key=lambda d: d[1], reverse=True) if not len(sorted_lst) > 0: return [], words_route max_value = sorted_lst[0][1] #result = self.common_nbs(comm_nbs) #result = self.common_nbs(comm_nbs) result = filter(lambda x: utils.is_max(x, max_value), sorted_lst) result = [i[0] for i in result] print("一共有多少个句子", len(result)) print("公共邻居最多的句子", self.graph.sent[result[0]]) print("公共邻居最少的句子", self.graph.sent[result[-1]]) print("最终关键词", words_route) return result, words_route def format_txt(self, txts): """ date 1114 """ _txts_res = "" for txt in txts: txt = re.sub("号楼", "号", txt) txt = re.sub("号院", "号", txt) txt = re.sub("附(\d+)号", "\1号", txt) _txts_res += txt return re.findall("[\dA-Za-z]+", _txts_res) def handle_num(self, line): nums = re.split("[^0-9a-zA-Z]+", line) #txts = re.split("[0-9a-zA-Z]",line) #_txts = self.format_txt(txts) #output = [] #for i,j in zip(nums,_txts): # output.append(i) # output.append(j) #return output return nums def save_one(self, line, target, f): f.write("%s,%s\n" % (line, "ROOT" + target)) def save_one_txt(self, result, res, score, line, f): if len(result) == 0: f.write("%s,%s\n" % (line, "None")) return for parent_res in result: f.write("%s,%s\n" % (line, "ROOT" + self.graph.sent[parent_res]))
class Address_Acti(object): def __init__(self): self.batch = 6000000 self.test_batch = 300 if FLAG_INIT_MODEL: self.addr_tree = Trie() self.dict_tree = Trie() self.graph = my_graph.My_Graph() self.init_model() print("init_model ok") pickle_helper.save(os.path.join(SAVE_PATH,SAVE_FILE),[self.addr_tree,self.dict_tree,self.graph]) print("pickle save ok") if FLAG_LOAD_MODEL: self.addr_tree, self.dict_tree, self.graph = pickle_helper.load(os.path.join(SAVE_PATH,SAVE_FILE),[1,2,3]) print("pickle load ok") def init_model(self): stand_lines = open(os.path.join(STD_PATH, STD_FILE)).readlines() #stand_lines = pd.read_csv(os.path.join(STD_PATH, STD_FILE)).iloc[:,1] stand_lines = [stand_lines[np.random.randint(len(stand_lines))] for i in range(self.batch)] for line in stand_lines: """ insert address into addr_tree """ """ insert all addr_tree """ line = utils.clr(line) words = list(jieba.cut(line)) self.addr_tree.insert_wd_lst(words) for word in words: self.dict_tree.insert(word) def check_num_lst(self, nums1, nums2): nums1.remove("") nums2.remove("") """ use num to check weather same or not """ lmin = min(len(nums1),len(nums2)) #lmax = max(len(nums1),len(nums2)) cnt = 0 for i,j in zip(nums1[:lmin], nums2[:lmin]): print(i,j) if i == j: cnt+=1 continue break if cnt == lmin and cnt>0: print(nums1 , " equal " ,nums2) return True elif cnt>0: print(nums1 , " equal " ,nums2) return True else: print(nums1, " not equal ", nums2) return False def check_num(self, line1, line2): """ use num to check weather same or not """ print(line1,line2) pdb.set_trace() cont = re.split("\d+",line1)[0] base = "" if len(cont)>0: #base = list(jieba.cut(cont))[-1] base = cont[-3:] nums1 = re.findall("\d+",line1) nums2 = re.findall("\d+",line2) print(line1, line2) print(nums1, nums2) lmin = min(len(nums1),len(nums2)) #lmax = max(len(nums1),len(nums2)) cnt = 0 for i,j in zip(nums1[:lmin], nums2[:lmin]): print(i,j) if i == j: cnt+=1 continue break if lmin>0: if not base+nums1[0] in line2: print("False",line1, line2) return False if cnt == lmin and cnt>0: return True elif cnt>0: return True else: print("False",line1, line2) return False def _check_num(self, line1, line2): """ use num to check weather same or not """ base = "" cont = re.split("\d+",line1)[0] if len(cont)>0: #base = list(jieba.cut(cont))[-1] base = cont[-3:] nums1 = re.findall("\d+",line1) nums2 = re.findall("\d+",line2) print(line1, line2) print(nums1, nums2) lmin = min(len(nums1),len(nums2)) #lmax = max(len(nums1),len(nums2)) cnt = 0 for i,j in zip(nums1[:lmin], nums2[:lmin]): if i == j: cnt+=1 continue break if lmin>0: if not base+nums1[0] in line2: print("False",base+nums1[0]) return False if cnt == lmin and cnt>0: return True elif cnt>1: return True else: print("False",nums1, nums2) return False """ def check(self,line1,line2): txts = re.split("\d+",line) if len(txts[0]) < 2: return False nums = re.findall("\d+",line) if len(nums)<1: return False baseline = "" for i in range(2): if i > len(txts)-1: break elif i == 0: baseline+=txts[i][-3:] else: baseline+="\D+" if i > len(nums)-1: break else: baseline+=nums[i] if len(baseline) < 1: return False print(baseline , " weather in ", line2) if len(re.findall(baseline,line2))>0: print(baseline , " bingo in ", line2) print(re.findall(baseline,line2)) return True else: return False return False def _check(self,line1,line2): txts = re.split("\d+",line) if len(txts[0]) < 2: return False nums = re.findall("\d+",line) if len(nums)<1: return False baseline = "" for i in range(2): if i > len(txts)-1: return baseline elif i == 0: baseline+=txts[i][-3:] else: baseline+=".+?" if i > len(nums)-1: return baseline else: baseline+=nums[i] if len(baseline) < 1: return False if len(re.findall(baseline,line2))>0: print(baseline , " is in ", line2) return True else: print(baseline , " not in ", line2) return False """ def word_filter(self, line_pre): res = [] for word in jieba.cut(line_pre): if self.dict_tree.search(word): res.append(word) return res def common_nbs(self,comm_nbs): result = set() if len(comm_nbs)>1: result = set(comm_nbs[0]) for i in comm_nbs[1:]: if len(result & set(i))>0: result = result & set(i) else: return result elif len(comm_nbs)==1: result = comm_nbs[0] return result def _check_num(self, line1, line2): """ use num to check weather same or not """ base = "" cont = re.split("\d+",line1)[0] if len(cont)>0: #base = list(jieba.cut(cont))[-1] base = cont[-3:] nums1 = re.findall("\d+",line1) nums2 = re.findall("\d+",line2) print(line1, line2) print(nums1, nums2) lmin = min(len(nums1),len(nums2)) #lmax = max(len(nums1),len(nums2)) cnt = 0 for i,j in zip(nums1[:lmin], nums2[:lmin]): if i == j: cnt+=1 continue break if lmin>0: if not base+nums1[0] in line2: print("False",base+nums1[0]) return False if cnt == lmin and cnt>0: return True elif cnt>1: return True else: print("False",nums1, nums2) return False """ def check(self,line1,line2): txts = re.split("\d+",line) if len(txts[0]) < 2: return False nums = re.findall("\d+",line) if len(nums)<1: return False baseline = "" for i in range(2): if i > len(txts)-1: break elif i == 0: baseline+=txts[i][-3:] else: baseline+="\D+" if i > len(nums)-1: break else: baseline+=nums[i] if len(baseline) < 1: return False print(baseline , " weather in ", line2) if len(re.findall(baseline,line2))>0: print(baseline , " bingo in ", line2) print(re.findall(baseline,line2)) return True else: return False return False def _check(self,line1,line2): txts = re.split("\d+",line) if len(txts[0]) < 2: return False nums = re.findall("\d+",line) if len(nums)<1: return False baseline = "" for i in range(2): if i > len(txts)-1: return baseline elif i == 0: baseline+=txts[i][-3:] else: baseline+=".+?" if i > len(nums)-1: return baseline else: baseline+=nums[i] if len(baseline) < 1: return False if len(re.findall(baseline,line2))>0: print(baseline , " is in ", line2) return True else: print(baseline , " not in ", line2) return False """ def word_filter(self, line_pre): res = [] for word in jieba.cut(line_pre): if self.dict_tree.search(word): res.append(word) return res def common_nbs(self,comm_nbs): result = set() if len(comm_nbs)>1: result = set(comm_nbs[0]) for i in comm_nbs[1:]: if len(result & set(i))>0: result = result & set(i) else: return result elif len(comm_nbs)==1: result = comm_nbs[0] return result def _query_one(self, line): output,res = [],[] parts = re.split("[\da-zA-Z]+",line) if len(parts)>1: #result,res = tasks.handle_text.delay(parts[0]) result,res = self.handle_text(parts[0]) for hs in result: try: self.graph.sent[hs] except: continue _line = self.graph.sent[hs] _parts = re.split("[\da-zA-Z]+",line) __line = re.split("[^\da-zA-Z]+",_line) nums = re.split("[^\da-zA-Z]+",line) #nums = self.handle_num(line[len(parts[0]):]) #nums = tasks.handle_num(line[len(parts[0]):]) if True == self.check_num_lst(__line ,nums): if "".join(res)[-2:] == __line[0][-2:]: output.append(_line) else: print("".join(res)[-3:] ," not equal ", _parts[0][-3:]) return output,res def handle_text(self,line): line = utils.clr(str(line)) line_pre = utils.before_first_num(line) res = self.word_filter(line_pre) comm_nbs = [] for i in range(len(res)-2): print(res) try: #conn = nx.all_shortest_paths(self.graph.tree_di,res[i],res[i+1]) comm_nbs.append(list(nx.common_neighbors(self.graph.di,res[i],res[i+1]))) except: print("networkx error") continue #conn = nx.all_shortest_paths(self.graph.tree_di,res[i],res[i+1]) comm_nbs.append(list(nx.common_neighbors(self.graph.di,res[i],res[i+1]))) result = self.common_nbs(comm_nbs) return result,res def format_txt(self, txts): """ date 1114 """ _txts_res = "" for txt in txts: txt = re.sub("号楼","号",txt) txt = re.sub("号院","号",txt) txt = re.sub("附(\d+)号","\1号",txt) _txts_res+=txt return re.findall("[\dA-Za-z]+",_txts_res) def handle_num(self, line): nums = re.split("[^0-9a-zA-Z]+",line) #txts = re.split("[0-9a-zA-Z]",line) #_txts = self.format_txt(txts) #output = [] #for i,j in zip(nums,_txts): # output.append(i) # output.append(j) #return output return nums def query_one(self, line): line = utils.clr(str(line)) line_pre = utils.before_first_num(line) #fir_num= utils.first_numbers(line) res = self.word_filter(line_pre) res.extend(utils.numbers(line)) comm_nbs = [] for i in range(len(res)-1): print(res) try: #conn = nx.all_shortest_paths(self.graph.tree_di,res[i],res[i+1]) comm_nbs.append(list(nx.common_neighbors(self.graph.di,res[i],res[i+1]))) except: print("networkx error") continue #conn = nx.all_shortest_paths(self.graph.tree_di,res[i],res[i+1]) comm_nbs.append(list(nx.common_neighbors(self.graph.di,res[i],res[i+1]))) result = self.common_nbs(comm_nbs) _result = [] for i in result: if not self.check_num(self.graph.sent[i],line): continue _result.append(self.graph.sent[i]) return _result,res def query(self): df = pd.DataFrame() df['map'] = "" df['kw'] = "" df['target'] = "" input_file = [] cnt=0 for _,_,docs in os.walk(TEST_PATH): for doc in docs: lines = open(os.path.join(TEST_PATH, doc)).readlines() #lines = pd.read_csv(os.path.join(TEST_PATH, doc)).iloc[:,1] lines = [lines[np.random.randint(len(lines))] for i in range(self.test_batch)] for line in lines: line = utils.clr(line) print(line) result,res = self._query_one(line) #result = self.addr_tree.words_route(res) if len(result) == 0: df.loc[str(cnt),'map'] = line df.loc[str(cnt),'target'] = "".join([]) df.loc[str(cnt),'kw'] = ",".join(res) cnt+=1 continue else: for parent_res in result: print(line, parent_res) df.loc[str(cnt),'map'] = line df.loc[str(cnt),'target'] = "ROOT"+parent_res df.loc[str(cnt),'kw'] = ",".join(res) cnt+=1 df.to_csv("./record.csv") print(cnt, 'save')