def __init__(self):
    """Create the lookup structures and/or restore them from the pickle file.

    When FLAG_INIT_MODEL is set, three tries are created, ``init_model()``
    is called, a graph object is created and the four objects are pickled.
    When FLAG_LOAD_MODEL is set, a redis client is opened and the address
    trie, dictionary trie and graph are read back from the pickle file
    (``num_tree`` is not restored on this path).
    """
    self.batch, self.test_batch = 10000, 100

    if FLAG_INIT_MODEL:
        # Same creation order as before: addr_tree, dict_tree, num_tree.
        for attr_name in ("addr_tree", "dict_tree", "num_tree"):
            print("init " + attr_name)
            setattr(self, attr_name, Trie())
        print("init model")
        self.init_model()
        print("init graph")
        self.graph = my_graph.My_Graph()
        logger.debug("init_model ok")
        snapshot_path = os.path.join(SAVE_PATH, SAVE_FILE)
        snapshot_items = [
            self.addr_tree, self.dict_tree, self.graph, self.num_tree
        ]
        pickle_helper.save(snapshot_path, snapshot_items)
        logger.debug("pickle save ok")

    if FLAG_LOAD_MODEL:
        self.redis = StrictRedis(host='localhost', port=6379, db=0)
        restored = pickle_helper.load(
            os.path.join(SAVE_PATH, SAVE_FILE), [1, 2, 3])
        self.addr_tree, self.dict_tree, self.graph = restored
        logger.debug("> pickle load ok ===")
def make_goto(self):
    """Build ``self.goto`` and ``self.output`` from ``self.keywords``.

    Each keyword is added to a new Trie. The trie's ``tree`` becomes
    ``self.goto`` (its 'root' entry gets status 0) and the trie's ``output``
    becomes ``self.output``.
    """
    # Timestamps are taken at both ends but not used within this method.
    began_at = time.time()

    keyword_trie = Trie()
    for keyword in self.keywords:
        keyword_trie.add(keyword)

    self.goto = keyword_trie.tree
    root_entry = self.goto['root']
    root_entry['status'] = 0
    self.output = keyword_trie.output

    ended_at = time.time()
def __init__(self):
    """Create the tries and graph and/or restore them from the pickle file.

    When FLAG_INIT_MODEL is set, the address trie, dictionary trie and graph
    are created, ``init_model()`` is called and the three objects are
    pickled. When FLAG_LOAD_MODEL is set, the same three objects are read
    back from the pickle file.
    """
    self.batch, self.test_batch = 6000000, 300

    if FLAG_INIT_MODEL:
        self.addr_tree, self.dict_tree = Trie(), Trie()
        self.graph = my_graph.My_Graph()
        self.init_model()
        print("init_model ok")
        snapshot_items = [self.addr_tree, self.dict_tree, self.graph]
        pickle_helper.save(os.path.join(SAVE_PATH, SAVE_FILE), snapshot_items)
        print("pickle save ok")

    if FLAG_LOAD_MODEL:
        restored = pickle_helper.load(
            os.path.join(SAVE_PATH, SAVE_FILE), [1, 2, 3])
        self.addr_tree, self.dict_tree, self.graph = restored
        print("pickle load ok")
def building_noise_tree(height, total_budget, k, b, delta, data_file,
                        trajectory_file, save_path):
    """Build a trajectory trie, add noise to its counts and pickle both trees.

    Steps, in order: read the CSV and the trajectory mapping, split the
    privacy budget and thresholds per level, insert every trajectory into a
    Trie, pickle the raw trie under ``./raw_tree/``, add noise to the root
    and then level by level through ``addNoise``, collect the resulting
    trajectories with ``output`` and pickle the noisy trie under
    ``./lap_tree/<save_path>/``. Timing figures are printed at the end.
    Existing pickle files are never overwritten.

    Args:
        height: Tree height (root excluded); also used as the insert depth.
        total_budget, k, b, delta: Passed unchanged to
            ``assign_privacy_budget_and_thresholds``; ``total_budget``, ``k``
            and ``b`` also appear in the output file name.
        data_file: CSV path; its "route_station" and "timestamp" columns
            provide the location and time domains.
        trajectory_file: ``.npy`` path whose single item is a mapping whose
            values are trajectories.
        save_path: Sub-directory name created under ``./lap_tree/``.

    Returns:
        None. If either input file is missing a message is printed and the
        function returns early.
    """
    if not os.path.exists(data_file):
        print("原始数据集文件不存在!")
        return
    if not os.path.exists(trajectory_file):
        print("轨迹数据集文件不存在!")
        return

    print('>>>开始构建树' + '>' * 40)
    read_data_start_time = time.time()
    data = pd.read_csv(data_file)
    # NOTE(review): the file holds a pickled mapping. Recent NumPy releases
    # refuse to unpickle unless allow_pickle=True is passed; confirm the
    # target NumPy version and only enable pickling for trusted files.
    information = np.load(trajectory_file).item()
    read_data_end_time = time.time()
    pre_location = list(set(data["route_station"]))  # location domain
    pre_timer = sorted(set(data["timestamp"]))  # time domain, ascending
    read_data_time = cal_time(read_data_start_time, read_data_end_time)

    budget, thresholds = assign_privacy_budget_and_thresholds(
        height, total_budget, k, b, delta)
    print('每层树节点阈值划分:', thresholds)
    print('每层树隐私预算划分:', budget)

    trie = Trie()
    root_epsilon = budget[0]  # privacy budget of the root
    epsilonL = budget[1]  # privacy budget of the first level
    thred = thresholds[0]  # threshold of the first level
    sensitivity = 1

    # Insert every trajectory; the root count tracks how many were inserted.
    for item in information.values():
        trie.insert(item, height)
        trie.root.count += 1
    build_rawtree_start_time = time.time()
    build_rawtree_time = cal_time(read_data_end_time, build_rawtree_start_time)

    # Persist the raw (noise-free) tree once.
    raw_path = './raw_tree/'
    os.makedirs(raw_path, exist_ok=True)
    filename = raw_path + 'raw_tree_data_height_' + str(height) + '_.txt'
    if not os.path.isfile(filename):
        with open(filename, 'wb') as f:
            pkl.dump(trie, f)
        print(">>>成功保存原轨迹树文件!")
    else:
        print(">>>原轨迹树文件已经存在,不需要重复保存!")

    # Add noise to the root, keeping its original count in local_count.
    build_noise_tree_start_time = time.time()
    noise = lap_noise(root_epsilon, sensitivity)
    trie.root.local_count = trie.root.count
    trie.root.count = trie.root.count + noise

    # Add noise below the root, then below every node of levels 1..height-1.
    addNoise(trie.root, pre_timer, pre_location, epsilonL, sensitivity, thred)
    last_usable_index = len(pre_timer) - height  # loop-invariant cutoff
    for i in range(1, height):
        next_epsilon = budget[i + 1]  # budget of the next level
        level_threshold = thresholds[i]
        for item in search_level_nodes(trie.root, i, []):
            # Skip nodes whose timestamp is among the last `height` ones.
            if pre_timer.index(item.value[1]) >= last_usable_index:
                continue
            addNoise(item, pre_timer, pre_location, next_epsilon,
                     sensitivity, level_threshold)
    build_noise_tree_end_time = time.time()
    build_noise_tree_time = cal_time(build_noise_tree_start_time,
                                     build_noise_tree_end_time)

    # Collect the sanitized trajectories from every child of the root.
    sanitized_data = []
    for son_node in trie.root.children.values():
        sanitized_data.extend(output([], son_node, []))
    print('轨迹预览:', sanitized_data[:2])
    print('不重复的轨迹数量:', len(sanitized_data))
    sanitized_data_endtime = time.time()
    sanitized_data_time = cal_time(build_noise_tree_end_time,
                                   sanitized_data_endtime)

    # makedirs also creates the parent './lap_tree' when it is missing;
    # os.mkdir raised FileNotFoundError in that case.
    lap_path = './lap_tree/' + save_path
    os.makedirs(lap_path, exist_ok=True)
    prefix_path = lap_path + '/lap_tree_data_height_'
    suffix = str(height) + '_' + str(total_budget) + '_' + \
        str(k) + '_' + str(b) + '_.txt'
    lap_name = prefix_path + suffix
    if not os.path.isfile(lap_name):
        with open(lap_name, 'wb') as f:
            pkl.dump(trie, f)
        print(">>>成功保存噪声树文件!")
    else:
        print(">>>原噪声树文件已经存在,不需要重复保存!")

    total_time = build_rawtree_time + build_noise_tree_time + sanitized_data_time
    print('=' * 50)
    print("读数据时间[秒]:", read_data_time)
    print("建原始树时间[秒](Read time):", build_rawtree_time)
    print('建噪声树时间[秒](saniztion time):', build_noise_tree_time)
    print('生成数据集时间[秒](writing time):', sanitized_data_time)
    print('total time[秒]:', round(total_time, 4))
    print('=' * 50)
class Address_Acti(object):
    """Match free-text addresses against a set of standard addresses.

    The instance holds three tries (``addr_tree``, ``dict_tree``,
    ``num_tree``) and a graph object (``graph``); they are either built from
    the standard-address file or restored from a pickle, depending on the
    module-level flags FLAG_INIT_MODEL and FLAG_LOAD_MODEL.
    """

    def __init__(self):
        """Build and pickle the model and/or load it from the pickle file."""
        self.batch = 10000
        self.test_batch = 100
        if FLAG_INIT_MODEL:
            self.addr_tree = Trie()
            self.dict_tree = Trie()
            self.num_tree = Trie()
            self.graph = my_graph.My_Graph()
            self.init_model()
            print("init_model ok")
            pickle_helper.save(
                os.path.join(SAVE_PATH, SAVE_FILE),
                [self.addr_tree, self.dict_tree, self.graph, self.num_tree])
            print("pickle save ok")
        if FLAG_LOAD_MODEL:
            (self.addr_tree, self.dict_tree, self.graph,
             self.num_tree) = pickle_helper.load(
                 os.path.join(SAVE_PATH, SAVE_FILE), [1, 2, 3, 4])
            print("pickle load ok")

    def minEditDist(self, sm, sn):
        """Return the Levenshtein edit distance between ``sm`` and ``sn``."""
        m, n = len(sm) + 1, len(sn) + 1
        matrix = [[0] * n for _ in range(m)]
        for i in range(1, m):
            matrix[i][0] = i
        for j in range(1, n):
            matrix[0][j] = j
        for i in range(1, m):
            for j in range(1, n):
                cost = 0 if sm[i - 1] == sn[j - 1] else 1
                matrix[i][j] = min(matrix[i - 1][j] + 1,
                                   matrix[i][j - 1] + 1,
                                   matrix[i - 1][j - 1] + cost)
        return matrix[m - 1][n - 1]

    def cut_filter(self, src_sent):
        """Define the intersection pattern; the visible code does nothing else."""
        # NOTE(review): the pattern is assigned but never applied to
        # src_sent in the visible code, so the method returns None.
        cmdin = r"区.+?与.+?交叉口[向东]?[\d+米]?路?[东南西北]?"

    def init_num_hash(self):
        """Insert the numbers of every standard address into ``num_tree``."""
        with open(os.path.join(STD_PATH, STD_FILE)) as std_file:
            for line in std_file:
                line = utils.clr(line)
                nums = re.findall(r"\d+", line)
                # NOTE(review): hash() of a str changes between interpreter
                # runs unless PYTHONHASHSEED is fixed.
                self.num_tree.insert_num_lst(nums, hash(line))

    def init_model(self):
        """Fill the three tries from the standard-address file."""
        with open(os.path.join(STD_PATH, STD_FILE)) as std_file:
            for line in std_file:
                line = utils.clr(line)
                words = list(jieba.cut(line))
                nums = re.findall(r"\d+", line)
                self.num_tree.insert_num_lst(nums, hash(line))
                self.addr_tree.insert_wd_lst(words)
                for word in words:
                    self.dict_tree.insert(word)

    @staticmethod
    def _drop_one_blank(nums):
        """Remove the first empty string from ``nums`` in place, if any."""
        if "" in nums:
            nums.remove("")

    @staticmethod
    def _leading_matches(nums1, nums2, echo=True):
        """Count how many leading items of the two lists are pairwise equal."""
        cnt = 0
        for i, j in zip(nums1, nums2):  # zip stops at the shorter list
            if echo:
                print(i, j)
            if i != j:
                break
            cnt += 1
        return cnt

    def score_num_lst(self, nums1, nums2):
        """Return the share (0-100) of the shorter list matched as a prefix.

        Both lists lose their first empty string in place before comparing.
        """
        self._drop_one_blank(nums1)
        self._drop_one_blank(nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = self._leading_matches(nums1, nums2)
        if cnt > 0:
            print(nums1, " equal ", nums2)
            return (cnt / lmin) * 100
        print(nums1, " not equal ", nums2)
        return 0.0

    def check_num_lst(self, nums1, nums2):
        """Return True when the lists share a sufficient leading run.

        True when the whole shorter list matches (and is non-empty) or when
        at least two leading items match. Both lists lose their first empty
        string in place before comparing.
        """
        self._drop_one_blank(nums1)
        self._drop_one_blank(nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = self._leading_matches(nums1, nums2)
        if (cnt == lmin and cnt > 0) or cnt > 1:
            print(nums1, " equal ", nums2)
            return True
        print(nums1, " not equal ", nums2)
        return False

    def check_num(self, line1, line2):
        """Return True when the two lines agree on their leading number(s).

        The up-to-3 characters before the first number of ``line1`` plus that
        number must occur in ``line2``, and at least the first numbers of
        both lines must be equal.
        """
        print("判断数字是否一致", line1, line2)
        base = re.split(r"\d+", line1)[0][-3:]
        nums1 = re.findall(r"\d+", line1)
        nums2 = re.findall(r"\d+", line2)
        print(line1, line2)
        print(nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = self._leading_matches(nums1, nums2)
        if lmin > 0 and base + nums1[0] not in line2:
            print("False", line1, line2)
            return False
        if cnt > 0:
            return True
        print("False", line1, line2)
        return False

    def _check_num(self, line1, line2):
        """Stricter variant of ``check_num``.

        Besides the anchor test, either every number of the shorter list or
        at least two leading numbers must match.
        """
        base = re.split(r"\d+", line1)[0][-3:]
        nums1 = re.findall(r"\d+", line1)
        nums2 = re.findall(r"\d+", line2)
        print(line1, line2)
        print(nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = self._leading_matches(nums1, nums2, echo=False)
        if lmin > 0 and base + nums1[0] not in line2:
            print("False", base + nums1[0])
            return False
        if (cnt == lmin and cnt > 0) or cnt > 1:
            return True
        print("False", nums1, nums2)
        return False

    @staticmethod
    def _build_baseline(line1, gap):
        """Build the regex '<text before 1st number><num1><gap>[<num2>]'.

        Returns None when fewer than two characters precede the first number
        or when ``line1`` contains no number. The text part is escaped so
        punctuation in an address cannot be read as regex syntax.
        """
        txts = re.split(r"\d+", line1)
        if len(txts[0]) < 2:
            return None
        nums = re.findall(r"\d+", line1)
        if not nums:
            return None
        baseline = re.escape(txts[0][-3:]) + nums[0] + gap
        if len(nums) > 1:
            baseline += nums[1]
        return baseline

    def check(self, line1, line2):
        """Return True when the anchor pattern of ``line1`` occurs in ``line2``.

        The pattern is the text before the first number, the number, one or
        more non-digits and, if present, the second number.
        """
        # The original read an undefined name `line`; `line1` is the input.
        baseline = self._build_baseline(line1, r"\D+")
        if not baseline:
            return False
        print(baseline, " weather in ", line2)
        found = re.findall(baseline, line2)
        if found:
            print(baseline, " bingo in ", line2)
            print(found)
            return True
        return False

    def _check(self, line1, line2):
        """Like ``check`` but with a lazy any-character gap between numbers.

        Always returns a bool; the original returned the pattern string
        itself when ``line1`` held a single number.
        """
        baseline = self._build_baseline(line1, ".+?")
        if not baseline:
            return False
        if re.findall(baseline, line2):
            print(baseline, " is in ", line2)
            return True
        print(baseline, " not in ", line2)
        return False

    def word_filter(self, line_pre):
        """Return the jieba tokens of ``line_pre`` that ``dict_tree`` finds."""
        return [word for word in jieba.cut(line_pre)
                if self.dict_tree.search(word)]

    def common_nbs(self, comm_nbs):
        """Intersect the neighbour collections in ``comm_nbs``.

        Intersection stops at the first collection that shares nothing with
        the running result, which is then returned unchanged. A single
        collection is returned as is.
        """
        result = set()
        if len(comm_nbs) > 1:
            result = set(comm_nbs[0])
            for nbs in comm_nbs[1:]:
                shared = result & set(nbs)
                if not shared:
                    return result
                print("交集", len(result), len(set(nbs)))
                result = shared
                print(self.graph.sent[list(result)[0]])
        elif len(comm_nbs) == 1:
            result = comm_nbs[0]
        if len(result) > 0:
            print("最终输出过滤后的标准地址", self.graph.sent[list(result)[0]])
        return result

    def _query_one(self, line):
        """Route one query line; return ``(candidates, keywords, 0)``."""
        my_txt = utils.without_num(line)
        my_num = re.findall(r"\d+", line)
        result, res = self.route_text(my_txt, my_num)
        print("句子集合数目", len(result), "关键词集合", res)
        return result, res, 0

    def editDist(self, line, result):
        """Return the key in ``result`` whose numbers are closest to ``line``.

        Closeness is the edit distance between ``utils.get_nums`` of the
        query and of each candidate sentence; "" is returned when there is
        no candidate.
        """
        min_edit_value = 999
        minist_one = ""
        for hs in list(result):
            standard_addr = self.graph.sent[hs]
            print("比较数字部分文本", utils.get_nums(line),
                  utils.get_nums(standard_addr))
            v = self.minEditDist(utils.get_nums(line),
                                 utils.get_nums(standard_addr))
            if v < min_edit_value:
                minist_one = hs
                min_edit_value = v
        return minist_one

    def route_text(self, line, lst):
        """Find the sentences sharing the most neighbours with the keywords.

        Keywords come from ``word_filter``. A chain of keywords connected in
        ``graph.tree_di`` is built, the neighbours in ``graph.di`` of every
        chained keyword are pooled, and the items accepted by
        ``utils.is_max`` for the highest neighbour count are returned.

        Returns:
            ``(result, words_route)``: candidate keys and the keyword chain.
        """
        print("过滤掉无用文本 ", line, lst)
        line = utils.clr(str(line))
        res = self.word_filter(line)
        print("经过过滤的词条", res)
        words_route = []
        comm_nbs = []
        if len(res) == 1:
            res.extend(res)  # duplicate a lone keyword so the loop runs once
        for i in range(len(res) - 1):
            print(res)
            p_node = res[i]
            a_node = res[i + 1]
            if len(words_route) == 0:
                words_route.append(p_node)
            try:
                route = nx.shortest_path(self.graph.tree_di, words_route[-1],
                                         a_node)
            except Exception:
                # No path (or unknown node): skip this keyword.
                print(
                    "not connect direct, continue, find the next one, utile to the head of words lst"
                )
                print("过滤复杂文本的词条")
                continue
            print('是否存在最短路径 ', route)
            words_route.append(a_node)
            print("add node", i, a_node)
        print("复杂文本", res)
        print("过滤输出", words_route)
        if " " in words_route:
            words_route.remove(" ")
        if len(words_route) > 0:
            # The head keyword is counted twice, as in the original.
            words_route.insert(0, words_route[0])
        for word in words_route:
            try:
                comm_nbs.extend(nx.all_neighbors(self.graph.di, word))
            except Exception:
                print("添加邻居出错")
        print("所有的邻居都添加到列表中,等待计算")
        print("列表中共有多少个item", len(comm_nbs))
        cnt_lst = collections.Counter(comm_nbs)
        sorted_lst = sorted(cnt_lst.items(), key=lambda d: d[1], reverse=True)
        if not sorted_lst:
            return [], words_route
        max_value = sorted_lst[0][1]
        result = [item[0] for item in sorted_lst
                  if utils.is_max(item, max_value)]
        print("一共有多少个句子", len(result))
        print("公共邻居最多的句子", self.graph.sent[result[0]])
        print("公共邻居最少的句子", self.graph.sent[result[-1]])
        print("最终关键词", words_route)
        return result, words_route

    def format_txt(self, txts):
        """Normalise building suffixes and return the alphanumeric runs.

        "号楼"/"号院" become "号" and "附<n>号" becomes "<n>号"; the pieces are
        concatenated and every run of digits/ASCII letters is returned.
        """
        parts = []
        for txt in txts:
            txt = re.sub("号楼", "号", txt)
            txt = re.sub("号院", "号", txt)
            # Raw string: "\1" in a normal literal is chr(1), not group 1.
            txt = re.sub(r"附(\d+)号", r"\1号", txt)
            parts.append(txt)
        return re.findall(r"[\dA-Za-z]+", "".join(parts))

    def handle_num(self, line):
        """Split ``line`` on every run of non-alphanumeric characters."""
        return re.split("[^0-9a-zA-Z]+", line)

    def save_one(self, line, target, f):
        """Write one ``line,ROOT<target>`` record to ``f``."""
        f.write("%s,%s\n" % (line, "ROOT" + target))

    def save_one_txt(self, result, res, score, line, f):
        """Write one record per candidate in ``result``, or a "None" record."""
        if len(result) == 0:
            f.write("%s,%s\n" % (line, "None"))
            return
        for parent_res in result:
            f.write("%s,%s\n" % (line, "ROOT" + self.graph.sent[parent_res]))
class Address_Acti(object):
    """Match keyword lines against standard addresses indexed in redis.

    Depending on the module-level flags the instance builds its tries and
    graph and pickles them (FLAG_INIT_MODEL) and/or opens a redis client and
    restores the address trie, dictionary trie and graph from the pickle
    (FLAG_LOAD_MODEL).
    """

    def __init__(self):
        """Build and pickle the model and/or connect to redis and load it."""
        self.batch = 10000
        self.test_batch = 100
        if FLAG_INIT_MODEL:
            print("init addr_tree")
            self.addr_tree = Trie()
            print("init dict_tree")
            self.dict_tree = Trie()
            print("init num_tree")
            self.num_tree = Trie()
            print("init model")
            self.init_model()
            print("init graph")
            self.graph = my_graph.My_Graph()
            logger.debug("init_model ok")
            pickle_helper.save(
                os.path.join(SAVE_PATH, SAVE_FILE),
                [self.addr_tree, self.dict_tree, self.graph, self.num_tree])
            logger.debug("pickle save ok")
        if FLAG_LOAD_MODEL:
            self.redis = StrictRedis(host='localhost', port=6379, db=0)
            # Only three objects are restored; num_tree is not loaded here.
            self.addr_tree, self.dict_tree, self.graph = pickle_helper.load(
                os.path.join(SAVE_PATH, SAVE_FILE), [1, 2, 3])
            logger.debug("> pickle load ok ===")

    def init_redis(self):
        """Bulk-load redis by piping the command file into ``redis-cli``."""
        cmd = "cat redis_cmd_insert_data.txt | redis-cli --pipe"
        result = subprocess.getoutput(cmd)
        print(result)

    def _init_redis(self):
        """Index the first 100000 standard lines word by word in redis.

        Returns:
            0 once all lines have been processed.
        """
        with open(os.path.join(STD_PATH, STD_FILE)) as std_file:
            stand_lines = std_file.readlines()
        for cnt, line in enumerate(stand_lines[:100000], 1):
            # NOTE(review): hash() of a str changes between interpreter runs
            # unless PYTHONHASHSEED is fixed, so these keys are only stable
            # within one process.
            line_key = str(hash(line))
            for word in line.split(" "):
                utils.add_sent_2_word(self.redis, word, line_key)
            if cnt % 1000 == 1:
                print(cnt)  # progress output
        return 0

    def cut_filter(self, src_sent):
        """Define the intersection pattern; the visible code does nothing else."""
        # NOTE(review): the pattern is assigned but never applied to
        # src_sent in the visible code, so the method returns None.
        cmdin = r"区.+?与.+?交叉口[向东]?[\d+米]?路?[东南西北]?"

    def init_num_hash(self):
        """Extract the numbers of every standard address.

        NOTE(review): the insertion into ``num_tree`` is disabled in the
        source, so the extracted numbers are currently discarded.
        """
        stand_lines = pd.read_csv(os.path.join(STD_PATH, STD_FILE)).iloc[:, 1]
        for line in stand_lines:
            line = utils.clr(line)
            nums = list(re.findall(RE_NUMS, line))

    def init_model(self):
        """Insert the space-separated words of the first 100000 lines into ``dict_tree``."""
        with open(os.path.join(STD_PATH, STD_FILE)) as std_file:
            stand_lines = std_file.readlines()
        for cnt, line in enumerate(stand_lines[:100000], 1):
            if cnt % 10000 == 1:
                print(cnt)  # progress output
            # Lines are not stripped, so the last word keeps its newline.
            for word in line.split(" "):
                self.dict_tree.insert(word)

    @staticmethod
    def _drop_one_blank(nums):
        """Remove the first empty string from ``nums`` in place, if any."""
        if "" in nums:
            nums.remove("")

    @staticmethod
    def _leading_matches(nums1, nums2, echo=True):
        """Count how many leading items of the two lists are pairwise equal."""
        cnt = 0
        for i, j in zip(nums1, nums2):  # zip stops at the shorter list
            if echo:
                logger.debug("%s %s", i, j)
            if i != j:
                break
            cnt += 1
        return cnt

    def score_num_lst(self, nums1, nums2):
        """Return the share (0-100) of the shorter list matched as a prefix.

        Both lists lose their first empty string in place before comparing.
        """
        self._drop_one_blank(nums1)
        self._drop_one_blank(nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = self._leading_matches(nums1, nums2)
        if cnt > 0:
            logger.debug("%s equal %s", nums1, nums2)
            return (cnt / lmin) * 100
        logger.debug("%s not equal %s", nums1, nums2)
        return 0.0

    def check_num_lst(self, nums1, nums2):
        """Return True when the lists share a sufficient leading run.

        True when the whole shorter list matches (and is non-empty) or when
        at least two leading items match. Both lists lose their first empty
        string in place before comparing.
        """
        self._drop_one_blank(nums1)
        self._drop_one_blank(nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = self._leading_matches(nums1, nums2)
        if (cnt == lmin and cnt > 0) or cnt > 1:
            logger.debug("%s equal %s", nums1, nums2)
            return True
        logger.debug("%s not equal %s", nums1, nums2)
        return False

    def check_num(self, line1, line2):
        """Return True when the two lines agree on their leading number(s).

        The up-to-3 characters before the first digit run of ``line1`` plus
        its first RE_NUMS match must occur in ``line2``, and at least the
        first matches of both lines must be equal.
        """
        logger.debug("判断数字是否一致 %s %s", line1, line2)
        base = re.split(r"\d+", line1)[0][-3:]
        nums1 = re.findall(RE_NUMS, line1)
        nums2 = re.findall(RE_NUMS, line2)
        logger.debug("%s %s", line1, line2)
        logger.debug("%s %s", nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = self._leading_matches(nums1, nums2)
        if lmin > 0 and base + nums1[0] not in line2:
            logger.debug("False %s %s", line1, line2)
            return False
        if cnt > 0:
            return True
        logger.debug("False %s %s", line1, line2)
        return False

    def _check_num(self, line1, line2):
        """Stricter variant of ``check_num``.

        Besides the anchor test, either every number of the shorter list or
        at least two leading numbers must match.
        """
        base = re.split(r"\d+", line1)[0][-3:]
        nums1 = re.findall(RE_NUMS, line1)
        nums2 = re.findall(RE_NUMS, line2)
        logger.debug("%s %s", line1, line2)
        logger.debug("%s %s", nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = self._leading_matches(nums1, nums2, echo=False)
        if lmin > 0 and base + nums1[0] not in line2:
            logger.debug("False %s", base + nums1[0])
            return False
        if (cnt == lmin and cnt > 0) or cnt > 1:
            return True
        logger.debug("False %s %s", nums1, nums2)
        return False

    @staticmethod
    def _build_baseline(line1, gap):
        """Build the regex '<text before 1st number><num1><gap>[<num2>]'.

        Returns None when fewer than two characters precede the first number
        or when ``line1`` has no RE_NUMS match. The text part is escaped so
        punctuation in an address cannot be read as regex syntax.
        """
        txts = re.split(RE_NUMS, line1)
        if len(txts[0]) < 2:
            return None
        nums = re.findall(RE_NUMS, line1)
        if not nums:
            return None
        baseline = re.escape(txts[0][-3:]) + nums[0] + gap
        if len(nums) > 1:
            baseline += nums[1]
        return baseline

    def _check(self, line1, line2):
        """Like ``check`` but with a lazy any-character gap between numbers.

        Always returns a bool; the original read an undefined name ``line``
        and could return the pattern string itself.
        """
        baseline = self._build_baseline(line1, ".+?")
        if not baseline:
            return False
        if re.findall(baseline, line2):
            logger.debug("%s is in %s", baseline, line2)
            return True
        logger.debug("%s not in %s", baseline, line2)
        return False

    def common_nbs(self, comm_nbs):
        """Intersect the neighbour collections in ``comm_nbs``.

        Intersection stops at the first collection that shares nothing with
        the running result, which is then returned unchanged. A single
        collection is returned as is.
        """
        result = set()
        if len(comm_nbs) > 1:
            result = set(comm_nbs[0])
            for nbs in comm_nbs[1:]:
                shared = result & set(nbs)
                if not shared:
                    return result
                logger.debug("交集 %s %s", len(result), len(set(nbs)))
                result = shared
                logger.debug("%s", self.graph.sent[list(result)[0]])
        elif len(comm_nbs) == 1:
            result = comm_nbs[0]
        if len(result) > 0:
            logger.debug("最终输出过滤后的标准地址 %s",
                         self.graph.sent[list(result)[0]])
        return result

    def check(self, line1, line2):
        """Return True when the anchor pattern of ``line1`` occurs in ``line2``.

        The pattern is the text before the first number, the number, one or
        more non-digits and, if present, the second number.
        """
        # The original read an undefined name `line`; `line1` is the input.
        baseline = self._build_baseline(line1, r"\D+")
        if not baseline:
            return False
        logger.debug("%s weather in %s", baseline, line2)
        found = re.findall(baseline, line2)
        if found:
            logger.debug("%s bingo in %s", baseline, line2)
            logger.debug("%s", found)
            return True
        return False

    def word_filter(self, line_pre):
        """Return the space-separated words of ``line_pre`` (no filtering)."""
        return line_pre.split(" ")

    def _query_one(self, line):
        """Entry point: route one comma-separated keyword line.

        Returns whatever ``route_text`` returns for ``line``.
        """
        return self.route_text(line)

    def editDist(self, line, result):
        """Return the key in ``result`` scoring highest against ``line``.

        The score is ``utils.compare_num(line, sentence)``; keys missing from
        ``graph.sent`` are skipped and "" is returned when nothing scores
        above -1.
        """
        best_score = -1
        best_key = ""
        for hs in list(result):
            try:
                standard_addr = self.graph.sent[hs]
            except Exception:
                continue  # unknown key: best-effort skip, as before
            v = utils.compare_num(line, standard_addr)
            if v > best_score:
                best_key = hs
                best_score = v
        return best_key

    def _route_text(self, line, lst):
        """Legacy search: intersect redis sentence sets, rarest word first.

        Words are ordered by ascending degree in ``graph.di``. The first
        word's sentence set seeds the result; the loop stops at the first
        later word whose intersection is non-empty or leaves the size
        unchanged. The seed set is returned as a list.

        NOTE(review): the source assigned ``res`` only in a commented-out
        line, so the method raised NameError; that call to ``word_filter``
        is restored here. Confirm it is the intended word source.
        """
        line = utils.clr(str(line))
        res = self.word_filter(line)
        if " " in res:
            res.remove(" ")
        degree = self.graph.di.degree()
        key_word_dict = {word: degree[word] for word in res}
        sorted_key_word_dict = sorted(key_word_dict.items(),
                                      key=lambda d: d[1], reverse=False)
        key_word_lst = [item[0] for item in sorted_key_word_dict]
        neighbor = []
        for p_wd in key_word_lst:
            print(p_wd, time.time())
            tmp_neighbor = utils.get_sent_from_word(self.redis, p_wd)
            if len(neighbor) == 0:
                neighbor.append(tmp_neighbor)
            if len(tmp_neighbor) == 0:
                continue
            tmp = neighbor[-1] & tmp_neighbor
            if len(neighbor[-1]) == len(tmp):
                print("查询到高级词召回数量没有变化", len(tmp))
                break
            if len(tmp) > 0:
                print("查询到高级词召回数量没有变化", len(tmp))
                break
        if len(neighbor) == 0:
            return []
        return list(neighbor[-1])

    def route_text(self, line):
        """Return the common neighbours of the first adjacent keyword pair that has any.

        ``line`` is split on ","; each consecutive pair is looked up with
        ``utils.get_common_neighbor`` and the first non-empty result is
        returned. An empty set is returned when no pair has one.

        NOTE(review): the source's indentation was lost; this assumes its
        ``return`` sat inside the loop, right after the append.
        """
        logger.debug("过滤后词组" + line)
        key_word_lst = line.split(",")
        for previous, current in zip(key_word_lst, key_word_lst[1:]):
            tmp_neighbor = utils.get_common_neighbor(self.redis, previous,
                                                     current)
            if len(tmp_neighbor) > 0:
                return tmp_neighbor
        return set()

    def format_txt(self, txts):
        """Normalise building suffixes and return the alphanumeric runs.

        "号楼"/"号院" become "号" and "附<n>号" becomes "<n>号"; the pieces are
        concatenated and every run of digits/ASCII letters is returned.
        """
        parts = []
        for txt in txts:
            txt = re.sub("号楼", "号", txt)
            txt = re.sub("号院", "号", txt)
            # Raw string: "\1" in a normal literal is chr(1), not group 1.
            txt = re.sub(r"附(\d+)号", r"\1号", txt)
            parts.append(txt)
        return re.findall(r"[\dA-Za-z]+", "".join(parts))

    def handle_num(self, line):
        """Split ``line`` on every run of non-alphanumeric characters."""
        return re.split("[^0-9a-zA-Z]+", line)

    def save_one(self, line, target, f):
        """Write, echo and flush one ``line,ROOT<target>`` record."""
        record = "%s,%s\n" % (line, "ROOT" + target)
        f.write(record)
        print(record)
        f.flush()

    def save_one_txt(self, result, res, score, line, f):
        """Write one record per candidate in ``result``, or a "None" record."""
        if len(result) == 0:
            f.write("%s,%s\n" % (line, "None"))
            f.flush()
            return
        for parent_res in result:
            f.write("%s,%s\n" % (line, "ROOT" + self.graph.sent[parent_res]))
            f.flush()

    def filter(self, filter_name, line, output):
        """Narrow the candidate collection ``output`` with the named filter.

        "edit_dis" returns the single best sentence chosen by ``editDist``.
        "num_filter" intersects ``output`` with the redis sentence set of
        each comma-separated item of ``line`` and stops at the first item
        that has no set or no overlap. Any other name returns None.
        """
        if output == []:
            return ["None"]
        if filter_name == "edit_dis":
            target = self.editDist(line, output)
            return [self.graph.sent[target]]
        elif filter_name == "num_filter":
            for num in line.split(","):
                num_set = utils.get_sent_from_word(self.redis, num)
                if len(num_set) == 0:
                    return output
                tmp = output & num_set  # expects set-like candidates
                if len(tmp) == 0:
                    return output
                output = tmp
            return output
def test_trie_has_words(num):
    """Check that a trie built from ``num`` sampled words reports that size."""
    sampled_words = random.sample(FULL_LIST, num)
    assert Trie(sampled_words)._size == num
def trie():
    """Return a new, empty trie."""
    empty_trie = Trie()
    return empty_trie
def test_trie():
    """Return a small trie pre-filled with TEST_WORDS."""
    filled_trie = Trie(TEST_WORDS)
    return filled_trie
class Address_Acti(object):
    """Look up free-text addresses against a set of standard addresses.

    The instance holds two ``Trie`` objects (``addr_tree`` for segmented
    address word lists, ``dict_tree`` for single words) and a
    ``my_graph.My_Graph`` whose ``di`` graph and ``sent`` lookup are used to
    retrieve candidate standard addresses.
    """

    def __init__(self):
        """Build and/or load the model.

        Controlled by the module-level ``FLAG_INIT_MODEL`` and
        ``FLAG_LOAD_MODEL`` switches: the first builds the tries and graph
        and saves them, the second loads them back from the saved file.
        """
        self.batch = 6000000
        self.test_batch = 300
        if FLAG_INIT_MODEL:
            self.addr_tree = Trie()
            self.dict_tree = Trie()
            self.graph = my_graph.My_Graph()
            self.init_model()
            print("init_model ok")
            pickle_helper.save(
                os.path.join(SAVE_PATH, SAVE_FILE),
                [self.addr_tree, self.dict_tree, self.graph])
            print("pickle save ok")
        if FLAG_LOAD_MODEL:
            # NOTE(review): pickle_helper presumably unpickles this file;
            # only load snapshots that come from a trusted location.
            self.addr_tree, self.dict_tree, self.graph = pickle_helper.load(
                os.path.join(SAVE_PATH, SAVE_FILE), [1, 2, 3])
            print("pickle load ok")

    def init_model(self):
        """Fill ``addr_tree`` and ``dict_tree`` from the standard-address file.

        ``self.batch`` lines are drawn at random (with replacement) from
        ``STD_PATH/STD_FILE``. Each line is cleaned with ``utils.clr`` and
        segmented with ``jieba.cut``; the word list goes into ``addr_tree``
        and every single word into ``dict_tree``.

        Raises:
            ValueError: if the standard-address file contains no lines.
        """
        with open(os.path.join(STD_PATH, STD_FILE)) as handle:
            stand_lines = handle.readlines()
        if not stand_lines:
            # np.random.randint(0) would raise ValueError anyway; fail with
            # a message that names the actual problem.
            raise ValueError("standard address file is empty")
        sampled = [stand_lines[np.random.randint(len(stand_lines))]
                   for _ in range(self.batch)]
        for line in sampled:
            words = list(jieba.cut(utils.clr(line)))
            self.addr_tree.insert_wd_lst(words)
            for word in words:
                self.dict_tree.insert(word)

    def check_num_lst(self, nums1, nums2):
        """Tell whether two token lists agree on their leading token(s).

        The first empty string in each list, if present, is removed in place
        (``_query_one`` reads the mutated list afterwards). Tokens are then
        compared pairwise from the front until the first mismatch.

        Returns:
            bool: True when at least the first pair of tokens is equal.
        """
        for tokens in (nums1, nums2):
            # list.remove raises ValueError when the value is absent.
            if "" in tokens:
                tokens.remove("")
        cnt = 0
        for i, j in zip(nums1, nums2):
            print(i, j)
            if i != j:
                break
            cnt += 1
        if cnt > 0:
            print(nums1, " equal ", nums2)
            return True
        print(nums1, " not equal ", nums2)
        return False

    def check_num(self, line1, line2):
        """Use the digit runs of two lines to decide whether they match.

        ``line2`` must contain the (up to three character) text that
        precedes the first number of ``line1`` followed by that number, and
        the first digit runs of both lines must be equal.

        Returns:
            bool: True when the lines are considered the same address.
        """
        print(line1, line2)
        cont = re.split(r"\d+", line1)[0]
        base = cont[-3:] if cont else ""
        nums1 = re.findall(r"\d+", line1)
        nums2 = re.findall(r"\d+", line2)
        print(line1, line2)
        print(nums1, nums2)
        cnt = 0
        for i, j in zip(nums1, nums2):
            print(i, j)
            if i != j:
                break
            cnt += 1
        if nums1 and nums2 and base + nums1[0] not in line2:
            print("False", line1, line2)
            return False
        if cnt > 0:
            return True
        print("False", line1, line2)
        return False

    def _check_num(self, line1, line2):
        """Stricter variant of ``check_num``.

        Same anchor test as ``check_num``; afterwards the lines match only
        when every compared digit run is equal or more than one leading
        digit run is equal.

        Returns:
            bool: True when the lines are considered the same address.
        """
        cont = re.split(r"\d+", line1)[0]
        base = cont[-3:] if cont else ""
        nums1 = re.findall(r"\d+", line1)
        nums2 = re.findall(r"\d+", line2)
        print(line1, line2)
        print(nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = 0
        for i, j in zip(nums1, nums2):
            if i != j:
                break
            cnt += 1
        if lmin > 0 and base + nums1[0] not in line2:
            print("False", base + nums1[0])
            return False
        if (cnt == lmin and cnt > 0) or cnt > 1:
            return True
        print("False", nums1, nums2)
        return False

    def word_filter(self, line_pre):
        """Return the ``jieba`` segments of ``line_pre`` found in ``dict_tree``."""
        return [word for word in jieba.cut(line_pre)
                if self.dict_tree.search(word)]

    def common_nbs(self, comm_nbs):
        """Intersect neighbour collections from left to right.

        Intersection stops as soon as it would become empty; the last
        non-empty intermediate result is returned in that case.

        Args:
            comm_nbs: sequence of iterables of hashable items.

        Returns:
            set: the running intersection (empty when ``comm_nbs`` is empty).
        """
        if not comm_nbs:
            return set()
        result = set(comm_nbs[0])
        for neighbours in comm_nbs[1:]:
            shared = result & set(neighbours)
            if not shared:
                return result
            result = shared
        return result

    def _adjacent_common_neighbors(self, words, pair_count):
        """Collect common neighbours in ``self.graph.di`` of adjacent words.

        Only the first ``pair_count`` pairs ``(words[i], words[i + 1])`` are
        looked at; pairs for which networkx raises are skipped.
        """
        collected = []
        for i in range(pair_count):
            print(words)
            try:
                collected.append(list(
                    nx.common_neighbors(self.graph.di, words[i], words[i + 1])))
            except nx.NetworkXException:
                print("networkx error")
        return collected

    def _query_one(self, line):
        """Find standard addresses matching ``line``.

        The text before the first alphanumeric run is used to retrieve
        candidates via ``handle_text``; each candidate is then kept only if
        ``check_num_lst`` accepts its alphanumeric tokens against those of
        ``line`` and the tail comparison below succeeds.

        Returns:
            tuple: ``(matching standard lines, keywords used)``; both empty
            when ``line`` contains no alphanumeric run.
        """
        output, res = [], []
        parts = re.split(r"[\da-zA-Z]+", line)
        if len(parts) <= 1:
            return output, res
        result, res = self.handle_text(parts[0])
        line_tokens = re.split(r"[^\da-zA-Z]+", line)
        for hs in result:
            try:
                std_line = self.graph.sent[hs]
            except (LookupError, TypeError):
                continue
            std_tokens = re.split(r"[^\da-zA-Z]+", std_line)
            # check_num_lst mutates its arguments, so hand it a fresh copy
            # of the query tokens for every candidate.
            if self.check_num_lst(std_tokens, list(line_tokens)):
                # NOTE(review): compares the keyword tail with the first
                # alphanumeric token of the candidate — confirm intended.
                if "".join(res)[-2:] == std_tokens[0][-2:]:
                    output.append(std_line)
                else:
                    print("".join(res)[-3:], " not equal ", parts[0][-3:])
        return output, res

    def handle_text(self, line):
        """Retrieve candidate ids for the textual part of an address.

        Returns:
            tuple: ``(intersection of common neighbours, filtered keywords)``.
        """
        line = utils.clr(str(line))
        line_pre = utils.before_first_num(line)
        res = self.word_filter(line_pre)
        # NOTE(review): uses len(res) - 2 pairs, one fewer than query_one;
        # kept as in the original — confirm whether that is intentional.
        comm_nbs = self._adjacent_common_neighbors(res, len(res) - 2)
        return self.common_nbs(comm_nbs), res

    def format_txt(self, txts):
        """Normalise building suffixes and return the alphanumeric runs.

        Every text has "号楼"/"号院" reduced to "号" and "附<digits>号" reduced
        to "<digits>号"; the texts are concatenated and all runs of ASCII
        letters and digits are returned.
        """
        normalized = []
        for txt in txts:
            txt = txt.replace("号楼", "号")
            txt = txt.replace("号院", "号")
            # Raw replacement string: "\1" in a normal literal is chr(1),
            # not a backreference to the captured digits.
            txt = re.sub(r"附(\d+)号", r"\1号", txt)
            normalized.append(txt)
        return re.findall(r"[\dA-Za-z]+", "".join(normalized))

    def handle_num(self, line):
        """Split ``line`` on runs of characters that are not ASCII alphanumerics."""
        return re.split(r"[^0-9a-zA-Z]+", line)

    def query_one(self, line):
        """Find standard addresses for ``line`` using words and numbers.

        Keywords are the filtered words before the first number plus
        ``utils.numbers(line)``; candidates from the graph are kept when
        ``check_num`` accepts them against the cleaned line.

        Returns:
            tuple: ``(matching standard lines, keywords used)``.
        """
        line = utils.clr(str(line))
        line_pre = utils.before_first_num(line)
        res = self.word_filter(line_pre)
        res.extend(utils.numbers(line))
        comm_nbs = self._adjacent_common_neighbors(res, len(res) - 1)
        matches = []
        for candidate in self.common_nbs(comm_nbs):
            std_line = self.graph.sent[candidate]
            if self.check_num(std_line, line):
                matches.append(std_line)
        return matches, res

    def query(self):
        """Run ``_query_one`` on sampled test lines and save ``./record.csv``.

        For every file under ``TEST_PATH``, ``self.test_batch`` lines are
        drawn at random (with replacement). One CSV row (``map``, ``kw``,
        ``target``) is written per match, or a single row with an empty
        target when nothing matched. Empty files are skipped.
        """
        rows = []
        for dirpath, _, docs in os.walk(TEST_PATH):
            for doc in docs:
                # Join with the walked directory so files in
                # subdirectories resolve correctly.
                with open(os.path.join(dirpath, doc)) as handle:
                    lines = handle.readlines()
                if not lines:
                    continue
                sampled = [lines[np.random.randint(len(lines))]
                           for _ in range(self.test_batch)]
                for line in sampled:
                    line = utils.clr(line)
                    print(line)
                    result, res = self._query_one(line)
                    keywords = ",".join(res)
                    if not result:
                        rows.append((line, keywords, ""))
                        continue
                    for parent_res in result:
                        print(line, parent_res)
                        rows.append((line, keywords, "ROOT" + parent_res))
        df = pd.DataFrame(rows, columns=["map", "kw", "target"])
        df.to_csv("./record.csv")
        print(len(rows), 'save')