Exemplo n.º 1
0
 def __init__(self):
     """Set batch sizes, then build and/or restore the address model.

     When FLAG_INIT_MODEL is truthy the three tries are created,
     ``init_model`` is run, the graph is created and the four objects are
     handed to ``pickle_helper.save`` for SAVE_PATH/SAVE_FILE.  When
     FLAG_LOAD_MODEL is truthy a Redis client is opened and three objects
     are read back from that same file via ``pickle_helper.load``.
     """
     self.batch, self.test_batch = 10000, 100

     if FLAG_INIT_MODEL:
         print("init addr_tree")
         self.addr_tree = Trie()
         print("init dict_tree")
         self.dict_tree = Trie()
         print("init num_tree")
         self.num_tree = Trie()
         print("init model")
         self.init_model()
         print("init graph")
         self.graph = my_graph.My_Graph()
         logger.debug("init_model ok")
         model_file = os.path.join(SAVE_PATH, SAVE_FILE)
         snapshot = [
             self.addr_tree, self.dict_tree, self.graph, self.num_tree
         ]
         pickle_helper.save(model_file, snapshot)
         logger.debug("pickle save ok")

     if FLAG_LOAD_MODEL:
         self.redis = StrictRedis(host='localhost', port=6379, db=0)
         model_file = os.path.join(SAVE_PATH, SAVE_FILE)
         # NOTE(review): four objects are saved above but only three are
         # unpacked here, so num_tree is not restored -- confirm intent.
         restored = pickle_helper.load(model_file, [1, 2, 3])
         self.addr_tree, self.dict_tree, self.graph = restored
         logger.debug("> pickle load ok ===")
Exemplo n.º 2
0
 def make_goto(self):
     """Build ``self.goto`` and ``self.output`` from ``self.keywords``.

     Every keyword is added to a fresh Trie.  The trie's ``tree`` becomes
     ``self.goto`` (its 'root' entry gets ``status`` 0) and the trie's
     ``output`` becomes ``self.output``.
     """
     # Wall-clock samples are taken at both ends but not used in this block.
     started_at = time.time()
     keyword_trie = Trie()
     for keyword in self.keywords:
         keyword_trie.add(keyword)
     self.goto = keyword_trie.tree
     self.goto['root']['status'] = 0
     self.output = keyword_trie.output
     finished_at = time.time()
Exemplo n.º 3
0
 def __init__(self):
     """Set batch sizes, then build and/or restore the address model.

     With FLAG_INIT_MODEL truthy, two tries and the graph are created,
     ``init_model`` is run and the three objects are handed to
     ``pickle_helper.save``.  With FLAG_LOAD_MODEL truthy, the three
     objects are read back from the same SAVE_PATH/SAVE_FILE location.
     """
     self.batch, self.test_batch = 6000000, 300

     if FLAG_INIT_MODEL:
         self.addr_tree = Trie()
         self.dict_tree = Trie()
         self.graph = my_graph.My_Graph()
         self.init_model()
         print("init_model ok")
         model_file = os.path.join(SAVE_PATH, SAVE_FILE)
         snapshot = [self.addr_tree, self.dict_tree, self.graph]
         pickle_helper.save(model_file, snapshot)
         print("pickle save ok")

     if FLAG_LOAD_MODEL:
         model_file = os.path.join(SAVE_PATH, SAVE_FILE)
         restored = pickle_helper.load(model_file, [1, 2, 3])
         self.addr_tree, self.dict_tree, self.graph = restored
         print("pickle load ok")
Exemplo n.º 4
0
def building_noise_tree(height, total_budget, k, b, delta, data_file,
                        trajectory_file, save_path):
    """Build a trajectory trie, add noise to its counts and pickle both trees.

    Steps, in order:
      1. read ``data_file`` (CSV with ``route_station`` and ``timestamp``
         columns) and ``trajectory_file`` (a NumPy file whose single item is
         a mapping of id -> trajectory);
      2. insert every trajectory into a fresh ``Trie`` and pickle the raw
         tree under ``./raw_tree/`` unless that file already exists;
      3. add noise to the root and then level by level via ``addNoise``;
      4. collect the sanitized trajectories with ``output`` and pickle the
         noisy tree under ``./lap_tree/<save_path>/`` unless it exists;
      5. print a timing summary.

    Args:
        height: tree height (root excluded); also passed to ``Trie.insert``.
        total_budget, k, b, delta: forwarded to
            ``assign_privacy_budget_and_thresholds``.
        data_file: path of the raw CSV data set.
        trajectory_file: path of the ``.npy`` trajectory data set.
        save_path: sub-directory of ``./lap_tree/`` for the noisy tree.

    Returns:
        None.  If either input file is missing a message is printed and the
        function returns early.
    """
    if not os.path.exists(data_file):
        print("原始数据集文件不存在!")
        return
    if not os.path.exists(trajectory_file):
        print("轨迹数据集文件不存在!")
        return
    print('>>>开始构建树' + '>' * 40)
    read_data_start_time = time.time()
    data = pd.read_csv(data_file)
    # The file stores a pickled mapping {id: trajectory}; NumPy >= 1.16.3
    # refuses to unpickle object arrays unless allow_pickle is set.
    # SECURITY: unpickling executes code -- only load trusted files here.
    information = np.load(trajectory_file, allow_pickle=True).item()
    read_data_end_time = time.time()
    pre_location = list(set(data["route_station"]))  # location domain
    pre_timer = sorted(set(data["timestamp"]))  # time domain
    read_data_time = cal_time(read_data_start_time, read_data_end_time)

    # Per-level privacy budgets and thresholds (lists).
    budget, thresholds = assign_privacy_budget_and_thresholds(
        height, total_budget, k, b, delta)
    print('每层树节点阈值划分:', thresholds)
    print('每层树隐私预算划分:', budget)
    trie = Trie()
    root_epsilon = budget[0]  # budget of the root node
    epsilonL = budget[1]  # budget of the first level
    thred = thresholds[0]  # threshold of the first level
    sensitivity = 1

    # Insert every trajectory; the root counts the number of trajectories.
    for item in information.values():
        trie.insert(item, height)
        trie.root.count += 1
    build_rawtree_start_time = time.time()
    build_rawtree_time = cal_time(read_data_end_time, build_rawtree_start_time)

    # Persist the raw (noise-free) tree once.
    raw_path = './raw_tree/'
    os.makedirs(raw_path, exist_ok=True)
    filename = raw_path + 'raw_tree_data_height_' + str(height) + '_.txt'
    if not os.path.isfile(filename):
        with open(filename, 'wb') as f:
            pkl.dump(trie, f)
        print(">>>成功保存原轨迹树文件!")
    else:
        print(">>>原轨迹树文件已经存在,不需要重复保存!")

    # Add noise to the root, keeping its original count in local_count.
    build_noise_tree_start_time = time.time()
    noise = lap_noise(root_epsilon, sensitivity)
    trie.root.local_count = trie.root.count
    trie.root.count = trie.root.count + noise
    # Add noise below the root, then level by level.
    addNoise(trie.root, pre_timer, pre_location, epsilonL, sensitivity, thred)
    for i in range(1, height):
        next_epsilon = budget[i + 1]  # budget of the next level
        level_threshold = thresholds[i]
        for item in search_level_nodes(trie.root, i, []):
            # Skip nodes whose timestamp is among the last `height` ones.
            if pre_timer.index(item.value[1]) >= len(pre_timer) - height:
                continue
            addNoise(item, pre_timer, pre_location, next_epsilon, sensitivity,
                     level_threshold)
    build_noise_tree_end_time = time.time()
    build_noise_tree_time = cal_time(build_noise_tree_start_time,
                                     build_noise_tree_end_time)

    sanitized_data = []  # sanitized trajectories
    for son_node in trie.root.children.values():
        sanitized_data.extend(output([], son_node, []))
    print('轨迹预览:', sanitized_data[:2])
    print('不重复的轨迹数量:', len(sanitized_data))
    sanitized_data_endtime = time.time()
    sanitized_data_time = cal_time(build_noise_tree_end_time,
                                   sanitized_data_endtime)

    # makedirs (not mkdir): './lap_tree' itself may not exist yet and
    # save_path may contain several path components.
    lap_path = './lap_tree/' + save_path
    os.makedirs(lap_path, exist_ok=True)
    prefix_path = lap_path + '/lap_tree_data_height_'
    suffix = str(height) + '_' + str(total_budget) + '_' + \
                str(k) + '_' + str(b) + '_.txt'
    lap_name = prefix_path + suffix
    if not os.path.isfile(lap_name):
        with open(lap_name, 'wb') as f:
            pkl.dump(trie, f)
        print(">>>成功保存噪声树文件!")
    else:
        print(">>>原噪声树文件已经存在,不需要重复保存!")
    total_time = build_rawtree_time + build_noise_tree_time + sanitized_data_time
    print('=' * 50)
    print("读数据时间[秒]:", read_data_time)
    print("建原始树时间[秒](Read time):", build_rawtree_time)
    print('建噪声树时间[秒](saniztion time):', build_noise_tree_time)
    print('生成数据集时间[秒](writing time):', sanitized_data_time)
    print('total time[秒]:', round(total_time, 4))
    print('=' * 50)
Exemplo n.º 5
0
class Address_Acti(object):
    def __init__(self):
        """Set batch sizes, then build and/or restore the address model.

        With FLAG_INIT_MODEL truthy, three tries and the graph are created,
        ``init_model`` is run and the four objects are handed to
        ``pickle_helper.save``.  With FLAG_LOAD_MODEL truthy, four objects
        are read back from the same SAVE_PATH/SAVE_FILE location.
        """
        self.batch, self.test_batch = 10000, 100

        if FLAG_INIT_MODEL:
            self.addr_tree = Trie()
            self.dict_tree = Trie()
            self.num_tree = Trie()
            self.graph = my_graph.My_Graph()
            self.init_model()
            print("init_model ok")
            model_file = os.path.join(SAVE_PATH, SAVE_FILE)
            snapshot = [
                self.addr_tree, self.dict_tree, self.graph, self.num_tree
            ]
            pickle_helper.save(model_file, snapshot)
            print("pickle save ok")

        if FLAG_LOAD_MODEL:
            model_file = os.path.join(SAVE_PATH, SAVE_FILE)
            restored = pickle_helper.load(model_file, [1, 2, 3, 4])
            (self.addr_tree, self.dict_tree, self.graph,
             self.num_tree) = restored
            print("pickle load ok")

    def minEditDist(self, sm, sn):
        m, n = len(sm) + 1, len(sn) + 1
        matrix = [[0] * n for i in range(m)]
        matrix[0][0] = 0
        for i in range(1, m):
            matrix[i][0] = matrix[i - 1][0] + 1
        for j in range(1, n):
            matrix[0][j] = matrix[0][j - 1] + 1
        const = 0
        for i in range(1, m):
            for j in range(1, n):
                if sm[i - 1] == sn[j - 1]:
                    cost = 0
                else:
                    cost = 1
                matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1,
                                   matrix[i - 1][j - 1] + cost)
        return matrix[m - 1][n - 1]

    def cut_filter(self, src_sent):
        cmdin = r"区.+?与.+?交叉口[向东]?[\d+米]?路?[东南西北]?"

    def init_num_hash(self):
        """Insert the digit runs of every standard address into ``num_tree``.

        Each line of STD_PATH/STD_FILE is cleaned with ``utils.clr``; its
        digit runs are inserted into ``self.num_tree`` together with the
        hash of the cleaned line.
        """
        # Context manager so the file handle is closed even on error.
        with open(os.path.join(STD_PATH, STD_FILE)) as std_file:
            for line in std_file:
                line = utils.clr(line)
                nums = re.findall(r"\d+", line)
                self.num_tree.insert_num_lst(nums, hash(line))

    def init_model(self):
        """Populate ``num_tree``, ``addr_tree`` and ``dict_tree``.

        Each line of STD_PATH/STD_FILE is cleaned with ``utils.clr`` and
        segmented with jieba.  The digit runs go into ``self.num_tree``
        (with the hash of the cleaned line), the word list goes into
        ``self.addr_tree`` and every single word into ``self.dict_tree``.
        """
        # Context manager so the file handle is closed even on error.
        with open(os.path.join(STD_PATH, STD_FILE)) as std_file:
            for line in std_file:
                line = utils.clr(line)
                words = list(jieba.cut(line))
                nums = re.findall(r"\d+", line)
                self.num_tree.insert_num_lst(nums, hash(line))
                self.addr_tree.insert_wd_lst(words)
                for word in words:
                    self.dict_tree.insert(word)

    def score_num_lst(self, nums1, nums2):
        if "" in nums1:
            nums1.remove("")
        if "" in nums2:
            nums2.remove("")
        """ use num to check weather same or not """
        lmin = min(len(nums1), len(nums2))
        #lmax = max(len(nums1),len(nums2))
        cnt = 0
        for i, j in zip(nums1[:lmin], nums2[:lmin]):
            print(i, j)
            if i == j:
                cnt += 1
                continue
            break
        if cnt == lmin and cnt > 0:
            print(nums1, " equal ", nums2)
            return (cnt / lmin) * 100
        elif cnt > 0:
            print(nums1, " equal ", nums2)
            return (cnt / lmin) * 100
        else:
            print(nums1, " not equal ", nums2)
            return 0.0

    def check_num_lst(self, nums1, nums2):
        if "" in nums1:
            nums1.remove("")
        if "" in nums2:
            nums2.remove("")
        """ use num to check weather same or not """
        lmin = min(len(nums1), len(nums2))
        #lmax = max(len(nums1),len(nums2))
        cnt = 0
        for i, j in zip(nums1[:lmin], nums2[:lmin]):
            print(i, j)
            if i == j:
                cnt += 1
                continue
            break
        if cnt == lmin and cnt > 0:
            print(nums1, " equal ", nums2)
            return True
        elif cnt > 1:
            print(nums1, " equal ", nums2)
            return True
        else:
            print(nums1, " not equal ", nums2)
            return False

    def check_num(self, line1, line2):
        """ use num to check weather same or not """
        print("判断数字是否一致", line1, line2)
        cont = re.split("\d+", line1)[0]
        base = ""
        if len(cont) > 0:
            #base = list(jieba.cut(cont))[-1]
            base = cont[-3:]
        nums1 = re.findall("\d+", line1)
        nums2 = re.findall("\d+", line2)
        print(line1, line2)
        print(nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        #lmax = max(len(nums1),len(nums2))
        cnt = 0
        for i, j in zip(nums1[:lmin], nums2[:lmin]):
            print(i, j)
            if i == j:
                cnt += 1
                continue
            break
        if lmin > 0:
            if not base + nums1[0] in line2:
                print("False", line1, line2)
                return False
        if cnt == lmin and cnt > 0:
            return True
        elif cnt > 0:
            return True
        else:
            print("False", line1, line2)
            return False

    def _check_num(self, line1, line2):
        """Check whether the numbers of two address lines are compatible.

        Returns False when both lines contain numbers but the last three
        characters before line1's first number, followed by that number,
        do not occur in line2.  Otherwise returns True when every number of
        the shorter number list matched (and it is not empty) or when more
        than one leading number matched.

        NOTE(review): this class defines ``_check_num`` a second time further
        down with an identical body; the later definition replaces this one,
        so this copy is dead code.
        """
        base = ""
        # Text that precedes the first digit run of line1.
        cont = re.split("\d+", line1)[0]
        if len(cont) > 0:
            #base = list(jieba.cut(cont))[-1]
            # Keep at most the last three characters as context.
            base = cont[-3:]
        nums1 = re.findall("\d+", line1)
        nums2 = re.findall("\d+", line2)
        print(line1, line2)
        print(nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        #lmax = max(len(nums1),len(nums2))
        # Count equal leading numbers, stopping at the first mismatch.
        cnt = 0
        for i, j in zip(nums1[:lmin], nums2[:lmin]):
            if i == j:
                cnt += 1
                continue
            break
        if lmin > 0:
            # Context + first number must appear verbatim in line2.
            if not base + nums1[0] in line2:
                print("False", base + nums1[0])
                return False
        if cnt == lmin and cnt > 0:
            return True
        elif cnt > 1:
            return True
        else:
            print("False", nums1, nums2)
            return False

    def check(self, line1, line2):
        """Build a small regex from an address line and search it in line2.

        NOTE(review): the body reads a name ``line`` that is neither a
        parameter nor a local, so a call raises NameError unless a
        module-level ``line`` exists; ``line1`` is unused.  This class also
        defines ``check`` a second time further down; the later definition
        replaces this one, so this copy is dead code.
        """
        txts = re.split("\d+", line)
        # Require at least two characters of text before the first number.
        if len(txts[0]) < 2:
            return False
        nums = re.findall("\d+", line)
        if len(nums) < 1:
            return False
        # Pattern: last 3 chars of leading text, first number, a non-digit
        # run and, when present, the second number.
        baseline = ""
        for i in range(2):
            if i > len(txts) - 1:
                break
            elif i == 0:
                baseline += txts[i][-3:]
            else:
                baseline += "\D+"
            if i > len(nums) - 1:
                break
            else:
                baseline += nums[i]
        if len(baseline) < 1:
            return False
        print(baseline, " weather in ", line2)
        if len(re.findall(baseline, line2)) > 0:
            print(baseline, " bingo in ", line2)
            print(re.findall(baseline, line2))
            return True
        else:
            return False
        # Unreachable: both branches above return.
        return False

    def _check(self, line1, line2):
        """Build a small regex from an address line and search it in line2.

        NOTE(review): the body reads a name ``line`` that is neither a
        parameter nor a local, so a call raises NameError unless a
        module-level ``line`` exists; ``line1`` is unused.  Two exits inside
        the loop return the partial pattern string instead of a bool.  This
        class also defines ``_check`` a second time further down; the later
        definition replaces this one, so this copy is dead code.
        """
        txts = re.split("\d+", line)
        # Require at least two characters of text before the first number.
        if len(txts[0]) < 2:
            return False
        nums = re.findall("\d+", line)
        if len(nums) < 1:
            return False
        # Pattern: last 3 chars of leading text, first number, a lazy
        # wildcard and, when present, the second number.
        baseline = ""
        for i in range(2):
            if i > len(txts) - 1:
                return baseline
            elif i == 0:
                baseline += txts[i][-3:]
            else:
                baseline += ".+?"
            if i > len(nums) - 1:
                return baseline
            else:
                baseline += nums[i]
        if len(baseline) < 1:
            return False
        if len(re.findall(baseline, line2)) > 0:
            print(baseline, " is in ", line2)
            return True
        else:
            print(baseline, " not in ", line2)
            return False

    def word_filter(self, line_pre):
        """Segment ``line_pre`` with jieba and keep the known words.

        Returns:
            list of the segments for which ``self.dict_tree.search`` is
            truthy, in their original order.
        """
        return [
            word for word in jieba.cut(line_pre)
            if self.dict_tree.search(word)
        ]

    def common_nbs(self, comm_nbs):
        result = set()
        if len(comm_nbs) > 1:
            result = set(comm_nbs[0])
            for i in comm_nbs[1:]:
                if len(result & set(i)) > 0:
                    print("交集", len(result), len(set(i)))
                    result = result & set(i)
                    print(self.graph.sent[list(result)[0]])
                else:
                    return result
        elif len(comm_nbs) == 1:
            result = comm_nbs[0]
        if len(result) > 0:
            print("最终输出过滤后的标准地址", self.graph.sent[list(result)[0]])
        return result

    def _check_num(self, line1, line2):
        """ use num to check weather same or not """
        base = ""
        cont = re.split("\d+", line1)[0]
        if len(cont) > 0:
            #base = list(jieba.cut(cont))[-1]
            base = cont[-3:]
        nums1 = re.findall("\d+", line1)
        nums2 = re.findall("\d+", line2)
        print(line1, line2)
        print(nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        #lmax = max(len(nums1),len(nums2))
        cnt = 0
        for i, j in zip(nums1[:lmin], nums2[:lmin]):
            if i == j:
                cnt += 1
                continue
            break
        if lmin > 0:
            if not base + nums1[0] in line2:
                print("False", base + nums1[0])
                return False
        if cnt == lmin and cnt > 0:
            return True
        elif cnt > 1:
            return True
        else:
            print("False", nums1, nums2)
            return False

    def check(self, line1, line2):
        txts = re.split("\d+", line)
        if len(txts[0]) < 2:
            return False
        nums = re.findall("\d+", line)
        if len(nums) < 1:
            return False
        baseline = ""
        for i in range(2):
            if i > len(txts) - 1:
                break
            elif i == 0:
                baseline += txts[i][-3:]
            else:
                baseline += "\D+"
            if i > len(nums) - 1:
                break
            else:
                baseline += nums[i]
        if len(baseline) < 1:
            return False
        print(baseline, " weather in ", line2)
        if len(re.findall(baseline, line2)) > 0:
            print(baseline, " bingo in ", line2)
            print(re.findall(baseline, line2))
            return True
        else:
            return False
        return False

    def _check(self, line1, line2):
        txts = re.split("\d+", line)
        if len(txts[0]) < 2:
            return False
        nums = re.findall("\d+", line)
        if len(nums) < 1:
            return False
        baseline = ""
        for i in range(2):
            if i > len(txts) - 1:
                return baseline
            elif i == 0:
                baseline += txts[i][-3:]
            else:
                baseline += ".+?"
            if i > len(nums) - 1:
                return baseline
            else:
                baseline += nums[i]
        if len(baseline) < 1:
            return False
        if len(re.findall(baseline, line2)) > 0:
            print(baseline, " is in ", line2)
            return True
        else:
            print(baseline, " not in ", line2)
            return False

    def _query_one(self, line):
        """Look up one address line.

        The text of ``line`` (via ``utils.without_num``) and its digit runs
        are passed to ``route_text``.

        Returns:
            tuple ``(result, res, 0)`` where ``result`` and ``res`` are the
            two values returned by ``route_text``; the trailing 0 is a
            constant.
        """
        my_txt = utils.without_num(line)
        my_num = re.findall(r"\d+", line)
        result, res = self.route_text(my_txt, my_num)
        print("句子集合数目", len(result), "关键词集合", res)
        return result, res, 0

    def editDist(self, line, result):
        """Pick the candidate whose numeric part is closest to ``line``.

        For every key in ``result`` the stored sentence is looked up in
        ``self.graph.sent`` and the edit distance between
        ``utils.get_nums(line)`` and ``utils.get_nums(sentence)`` is taken.

        Returns:
            The key with the smallest distance below 999 (first one wins on
            ties), or "" when there is none.
        """
        best_distance = 999
        best_key = ""
        for candidate in list(result):
            standard_addr = self.graph.sent[candidate]
            print("比较数字部分文本", utils.get_nums(line),
                  utils.get_nums(standard_addr))
            distance = self.minEditDist(utils.get_nums(line),
                                        utils.get_nums(standard_addr))
            if distance >= best_distance:
                continue
            best_key = candidate
            best_distance = distance
        return best_key

    def route_text(self, line, lst):
        """Find the stored sentences that share the most keywords with ``line``.

        ``line`` is cleaned with ``utils.clr`` and reduced to known words by
        ``word_filter``.  Consecutive words are chained into ``words_route``
        whenever ``nx.shortest_path`` finds a path in ``self.graph.tree_di``
        from the last accepted word.  The neighbours of every routed word in
        ``self.graph.di`` are then counted, and the entries selected by
        ``utils.is_max`` against the highest count are returned.

        Args:
            line: text to look up (converted with str()).
            lst: only printed here; not used for the lookup.

        Returns:
            tuple ``(result, words_route)``: the selected neighbour keys and
            the routed keyword list.  ``result`` is [] when no neighbour
            was found.
        """
        print("过滤掉无用文本 ", line, lst)
        line = utils.clr(str(line))
        res = self.word_filter(line)
        print("经过过滤的词条", res)
        words_route = []
        comm_nbs = []
        if len(res) == 1:
            # Duplicate a lone keyword so the pairwise loop runs once.
            res.extend(res)
        for i in range(len(res) - 1):
            print(res)
            # `except Exception` (not bare except) keeps the best-effort
            # behaviour without swallowing KeyboardInterrupt/SystemExit.
            try:
                p_node = res[i]
                a_node = res[i + 1]
                if len(words_route) == 0:
                    words_route.append(p_node)
                try:
                    route = nx.shortest_path(self.graph.tree_di,
                                             words_route[-1], a_node)
                    print('是否存在最短路径 ', route)
                    words_route.append(a_node)
                    print("add node", i, a_node)
                except Exception:
                    # No path from the last accepted word: skip this word.
                    print(
                        "not connect direct, continue, find the next one, utile to the head of words lst"
                    )
                    print("过滤复杂文本的词条")
                    continue
            except Exception:
                print("networkx error")
                continue
        print("复杂文本", res)
        print("过滤输出", words_route)
        if " " in words_route:
            words_route.remove(" ")
        if len(words_route) > 0:
            # The first keyword is listed twice, so its neighbours are
            # counted twice below.
            words_route.insert(0, words_route[0])
        for word in words_route:
            try:
                comm_nbs.extend(
                    list(nx.all_neighbors(self.graph.di, word)))
            except Exception:
                print("添加邻居出错")
        print("所有的邻居都添加到列表中,等待计算")
        print("列表中共有多少个item", len(comm_nbs))
        cnt_lst = collections.Counter(comm_nbs)
        sorted_lst = sorted(cnt_lst.items(), key=lambda d: d[1], reverse=True)
        if not sorted_lst:
            return [], words_route
        max_value = sorted_lst[0][1]
        result = [
            entry[0] for entry in sorted_lst
            if utils.is_max(entry, max_value)
        ]
        print("一共有多少个句子", len(result))
        if result:
            # Guard: indexing an empty selection would raise IndexError.
            print("公共邻居最多的句子", self.graph.sent[result[0]])
            print("公共邻居最少的句子", self.graph.sent[result[-1]])
        print("最终关键词", words_route)
        return result, words_route

    def format_txt(self, txts):
        """ date 1114 """
        _txts_res = ""
        for txt in txts:
            txt = re.sub("号楼", "号", txt)
            txt = re.sub("号院", "号", txt)
            txt = re.sub("附(\d+)号", "\1号", txt)
            _txts_res += txt
        return re.findall("[\dA-Za-z]+", _txts_res)

    def handle_num(self, line):
        nums = re.split("[^0-9a-zA-Z]+", line)
        #txts = re.split("[0-9a-zA-Z]",line)
        #_txts = self.format_txt(txts)
        #output = []
        #for i,j in zip(nums,_txts):
        #    output.append(i)
        #    output.append(j)
        #return output
        return nums

    def save_one(self, line, target, f):
        f.write("%s,%s\n" % (line, "ROOT" + target))

    def save_one_txt(self, result, res, score, line, f):
        if len(result) == 0:
            f.write("%s,%s\n" % (line, "None"))
            return
        for parent_res in result:
            f.write("%s,%s\n" % (line, "ROOT" + self.graph.sent[parent_res]))
Exemplo n.º 6
0
class Address_Acti(object):
    def __init__(self):
        """Set batch sizes, then build and/or restore the address model.

        When FLAG_INIT_MODEL is truthy the three tries are created,
        ``init_model`` is run, the graph is created and the four objects
        are handed to ``pickle_helper.save`` for SAVE_PATH/SAVE_FILE.  When
        FLAG_LOAD_MODEL is truthy a Redis client is opened and three
        objects are read back from that same file.
        """
        self.batch, self.test_batch = 10000, 100

        if FLAG_INIT_MODEL:
            print("init addr_tree")
            self.addr_tree = Trie()
            print("init dict_tree")
            self.dict_tree = Trie()
            print("init num_tree")
            self.num_tree = Trie()
            print("init model")
            self.init_model()
            print("init graph")
            self.graph = my_graph.My_Graph()
            logger.debug("init_model ok")
            model_file = os.path.join(SAVE_PATH, SAVE_FILE)
            snapshot = [
                self.addr_tree, self.dict_tree, self.graph, self.num_tree
            ]
            pickle_helper.save(model_file, snapshot)
            logger.debug("pickle save ok")

        if FLAG_LOAD_MODEL:
            self.redis = StrictRedis(host='localhost', port=6379, db=0)
            model_file = os.path.join(SAVE_PATH, SAVE_FILE)
            # NOTE(review): four objects are saved above but only three are
            # unpacked here, so num_tree is not restored -- confirm intent.
            restored = pickle_helper.load(model_file, [1, 2, 3])
            self.addr_tree, self.dict_tree, self.graph = restored
            logger.debug("> pickle load ok ===")

    def init_redis(self):
        """Pipe ``redis_cmd_insert_data.txt`` into ``redis-cli --pipe``.

        The command runs through the shell and its combined output is
        printed.
        """
        pipeline = "cat redis_cmd_insert_data.txt | redis-cli --pipe"
        print(subprocess.getoutput(pipeline))

    def _init_redis(self):
        """Register the words of the first 100000 standard addresses in Redis.

        Every line of STD_PATH/STD_FILE is split on single spaces and each
        word is passed to ``utils.add_sent_2_word`` together with the hash
        of the line (as a string).  Progress is printed every 1000 lines.

        Returns:
            int: always 0.
        """
        # Context manager so the file handle is closed even on error.
        with open(os.path.join(STD_PATH, STD_FILE)) as std_file:
            stand_lines = std_file.readlines()
        for cnt, line in enumerate(stand_lines[:100000], start=1):
            line_key = str(hash(line))
            for word in line.split(" "):
                utils.add_sent_2_word(self.redis, word, line_key)
            if cnt % 1000 == 1:
                print(cnt)
        return 0

    def cut_filter(self, src_sent):
        cmdin = r"区.+?与.+?交叉口[向东]?[\d+米]?路?[东南西北]?"

    def init_num_hash(self):
        """Walk the second CSV column of the standard-address file.

        Each value is cleaned with ``utils.clr`` and matched against
        RE_NUMS.  The matches are not stored anywhere: the insertion into
        ``num_tree`` is disabled in this version.
        """
        csv_path = os.path.join(STD_PATH, STD_FILE)
        address_column = pd.read_csv(csv_path).iloc[:, 1]
        for raw_address in address_column:
            cleaned = utils.clr(raw_address)
            nums = list(re.findall(RE_NUMS, cleaned))

    def init_model(self):
        """Insert the words of the first 100000 standard addresses into
        ``self.dict_tree``.

        Every line of STD_PATH/STD_FILE is split on single spaces and each
        piece is inserted.  Progress is printed every 10000 lines.
        """
        # Context manager so the file handle is closed even on error.
        with open(os.path.join(STD_PATH, STD_FILE)) as std_file:
            stand_lines = std_file.readlines()
        for cnt, line in enumerate(stand_lines[:100000], start=1):
            if cnt % 10000 == 1:
                print(cnt)
            for word in line.split(" "):
                self.dict_tree.insert(word)

    def score_num_lst(self, nums1, nums2):
        """Score how far two number lists agree, as a percentage.

        The first "" entry (if any) is removed from each list in place.
        Entries are then compared pairwise from the front until the first
        mismatch.

        Returns:
            ``matched / shorter_length * 100`` when at least one leading
            entry matches, otherwise 0.0.
        """
        if "" in nums1:
            nums1.remove("")
        if "" in nums2:
            nums2.remove("")
        lmin = min(len(nums1), len(nums2))
        cnt = 0
        for i, j in zip(nums1[:lmin], nums2[:lmin]):
            # logging takes a format string first; passing the data as the
            # message makes the handler fail to format the record.
            logger.debug("%s %s", i, j)
            if i != j:
                break
            cnt += 1
        if cnt > 0:
            logger.debug("%s equal %s", nums1, nums2)
            return (cnt / lmin) * 100
        logger.debug("%s not equal %s", nums1, nums2)
        return 0.0

    def check_num_lst(self, nums1, nums2):
        """Decide whether two number lists agree on their leading entries.

        The first "" entry (if any) is removed from each list in place.
        Entries are compared pairwise from the front until the first
        mismatch.

        Returns:
            True when every entry of the shorter list matched (and it is
            not empty) or when more than one leading entry matched;
            False otherwise.
        """
        if "" in nums1:
            nums1.remove("")
        if "" in nums2:
            nums2.remove("")
        lmin = min(len(nums1), len(nums2))
        cnt = 0
        for i, j in zip(nums1[:lmin], nums2[:lmin]):
            # logging takes a format string first; passing the data as the
            # message makes the handler fail to format the record.
            logger.debug("%s %s", i, j)
            if i != j:
                break
            cnt += 1
        if (cnt > 0 and cnt == lmin) or cnt > 1:
            logger.debug("%s equal %s", nums1, nums2)
            return True
        logger.debug("%s not equal %s", nums1, nums2)
        return False

    def check_num(self, line1, line2):
        """Check whether the numbers of two address lines are compatible.

        Numbers are extracted with RE_NUMS.  Returns False when both lines
        contain numbers but the last three characters before line1's first
        digit run, followed by line1's first number, do not occur in line2.
        Otherwise returns True if at least the first number of both lines
        is equal, else False.
        """
        # logging takes a format string first; the original passed the data
        # as the message, which makes the handler fail to format the record.
        logger.debug("判断数字是否一致 %s %s", line1, line2)
        cont = re.split(r"\d+", line1)[0]
        base = cont[-3:] if len(cont) > 0 else ""
        nums1 = re.findall(RE_NUMS, line1)
        nums2 = re.findall(RE_NUMS, line2)
        logger.debug("%s %s", line1, line2)
        logger.debug("%s %s", nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = 0
        for i, j in zip(nums1[:lmin], nums2[:lmin]):
            logger.debug("%s %s", i, j)
            if i != j:
                break
            cnt += 1
        if lmin > 0 and base + nums1[0] not in line2:
            logger.debug("False %s %s", line1, line2)
            return False
        if cnt > 0:
            return True
        logger.debug("False %s %s", line1, line2)
        return False

    def _check_num(self, line1, line2):
        """Check whether the numbers of two address lines are compatible.

        Numbers are extracted with RE_NUMS.  Returns False when both lines
        contain numbers but the last three characters before line1's first
        digit run, followed by line1's first number, do not occur in line2.
        Otherwise returns True when every number of the shorter number list
        matched (and it is not empty) or when more than one leading number
        matched; False in all other cases.
        """
        cont = re.split(r"\d+", line1)[0]
        base = cont[-3:] if len(cont) > 0 else ""
        nums1 = re.findall(RE_NUMS, line1)
        nums2 = re.findall(RE_NUMS, line2)
        # logging takes a format string first; the original passed the data
        # as the message, which makes the handler fail to format the record.
        logger.debug("%s %s", line1, line2)
        logger.debug("%s %s", nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = 0
        for i, j in zip(nums1[:lmin], nums2[:lmin]):
            if i != j:
                break
            cnt += 1
        if lmin > 0 and base + nums1[0] not in line2:
            logger.debug("False %s", base + nums1[0])
            return False
        if (cnt > 0 and cnt == lmin) or cnt > 1:
            return True
        logger.debug("False %s %s", nums1, nums2)
        return False

    def _check(self, line1, line2):
        """Check whether the leading "text + numbers" of line1 occurs in line2.

        line1 is split and scanned with RE_NUMS.  A regex is built from the
        last three characters of its first text piece, its first number, a
        lazy wildcard and its second number, and is then searched in line2.

        Returns:
            False when the first text piece is shorter than two characters
            or no number is found; True/False for the regex search when the
            pattern could be completed.
            NOTE(review): the two early exits inside the loop return the
            partial pattern string instead of a bool (kept from the original
            control flow) -- confirm whether a search was intended.
        """
        # The original read an undefined name `line`; the pattern source is
        # line1 (line2 is the search target).
        txts = re.split(RE_NUMS, line1)
        if len(txts[0]) < 2:
            return False
        nums = re.findall(RE_NUMS, line1)
        if len(nums) < 1:
            return False
        baseline = ""
        for i in range(2):
            if i > len(txts) - 1:
                return baseline
            elif i == 0:
                baseline += txts[i][-3:]
            else:
                baseline += ".+?"
            if i > len(nums) - 1:
                return baseline
            else:
                baseline += nums[i]
        if len(baseline) < 1:
            return False
        # logging takes a format string first, then the values.
        if re.findall(baseline, line2):
            logger.debug("%s is in %s", baseline, line2)
            return True
        logger.debug("%s not in %s", baseline, line2)
        return False

    def common_nbs(self, comm_nbs):
        """Intersect neighbour collections until the intersection would be empty.

        Args:
            comm_nbs: sequence of collections of sentence keys.

        Returns:
            An empty set for empty input; the single collection unchanged
            when only one is given; otherwise the running intersection,
            stopping before the first collection that would empty it.
        """
        result = set()
        if len(comm_nbs) > 1:
            result = set(comm_nbs[0])
            for i in comm_nbs[1:]:
                candidate = set(i)
                overlap = result & candidate
                if len(overlap) == 0:
                    return result
                # logging takes a format string first, then the values.
                logger.debug("交集 %s %s", len(result), len(candidate))
                result = overlap
                logger.debug("%s", self.graph.sent[list(result)[0]])
        elif len(comm_nbs) == 1:
            result = comm_nbs[0]
        if len(result) > 0:
            logger.debug("最终输出过滤后的标准地址 %s",
                         self.graph.sent[list(result)[0]])
        return result

    def check(self, line1, line2):
        """Check whether the leading "text + numbers" of line1 occurs in line2.

        line1 is split and scanned with RE_NUMS.  A regex is built from the
        last three characters of its first text piece, its first number, a
        run of non-digits and, when present, its second number; the regex
        is then searched in line2.

        Returns:
            bool: True when the pattern matches somewhere in line2.  False
            when the first text piece is shorter than two characters, no
            number is found, or the pattern does not match.
        """
        # The original read an undefined name `line`; the pattern source is
        # line1 (line2 is the search target).
        txts = re.split(RE_NUMS, line1)
        if len(txts[0]) < 2:
            return False
        nums = re.findall(RE_NUMS, line1)
        if len(nums) < 1:
            return False
        baseline = ""
        for i in range(2):
            if i > len(txts) - 1:
                break
            elif i == 0:
                baseline += txts[i][-3:]
            else:
                baseline += r"\D+"
            if i > len(nums) - 1:
                break
            else:
                baseline += nums[i]
        if len(baseline) < 1:
            return False
        # logging takes a format string first, then the values.
        logger.debug("%s weather in %s", baseline, line2)
        matches = re.findall(baseline, line2)
        if matches:
            logger.debug("%s bingo in %s", baseline, line2)
            logger.debug("%s", matches)
            return True
        return False

    def word_filter(self, line_pre):
        res = []
        for word in line_pre.split(" "):
            #if self.dict_tree.search(word):
            #    res.append(word)
            res.append(word)
        return res

    def _query_one(self, line):
        """
        入口接口
        line 待比对内容,为一行文本
        地址可以看作三个部分
        文本部分 数字部分 和 量纲部分
        """
        #line = utils.clr(line)
        output, res, score = [], [], []
        #min_edit_value = 9999 #最小编辑距离
        #my_txt = utils.without_num(line)#取出第一个数字前的文本部分
        #my_num = re.findall(RE_NUMS,line)#取出数字部分
        result = self.route_text(line)  #根据文本部分和数字部分完成索引
        return result

    def editDist(self, line, result):
        """Pick the candidate that scores highest against ``line``.

        For every key in ``result`` the stored sentence is looked up in
        ``self.graph.sent`` and scored with ``utils.compare_num``.  Keys
        whose lookup fails are skipped.

        Returns:
            The key with the highest score above -1 (first one wins on
            ties), or "" when there is none.
        """
        best_score = -1
        best_key = ""
        for hs in list(result):
            try:
                standard_addr = self.graph.sent[hs]
            except Exception:
                # Best effort: skip keys without a stored sentence.  Not a
                # bare except, so KeyboardInterrupt/SystemExit propagate.
                continue
            score = utils.compare_num(line, standard_addr)
            if score > best_score:
                best_key = hs
                best_score = score
        return best_key

    def _route_text(self, line, lst):
        """Narrow the candidate sentences by intersecting per-keyword hits.

        The cleaned line is split into words with ``word_filter``; the words
        are ordered by ascending degree in ``self.graph.di``; then the
        sentence sets returned by ``utils.get_sent_from_word`` are
        intersected one word at a time:

        * a word with no sentences is skipped;
        * the first word with sentences seeds the candidate set;
        * when a word does not shrink the candidate set, the search stops;
        * when a word would empty the candidate set, that word is ignored.

        Args:
            line: Query text; cleaned with ``utils.clr`` before use.
            lst: Unused; kept so existing callers keep working.

        Returns:
            A list with the surviving candidates, or [] when no word
            produced any candidate.
        """
        line = utils.clr(str(line))
        res = self.word_filter(line)
        if " " in res:
            res.remove(" ")
        degrees = self.graph.di.degree()
        key_word_dict = {word: degrees[word] for word in res}
        key_word_lst = [
            word for word, _ in sorted(key_word_dict.items(), key=lambda d: d[1])
        ]
        candidates = None
        for p_wd in key_word_lst:
            print(p_wd, time.time())
            hits = utils.get_sent_from_word(self.redis, p_wd)
            if len(hits) == 0:
                continue
            if candidates is None:
                # Seed with the first non-empty hit set; intersecting a set
                # with itself would always look like "no change".
                candidates = hits
                continue
            narrowed = candidates & hits
            if len(narrowed) == len(candidates):
                print("查询到高级词召回数量没有变化", len(narrowed))
                break
            if len(narrowed) > 0:
                candidates = narrowed
        if candidates is None:
            return []
        return list(candidates)

    def route_text(self, line):
        """Return the first non-empty common-neighbour set of adjacent keywords.

        ``line`` is split on commas. Each pair of consecutive keywords is
        passed to ``utils.get_common_neighbor`` in order, and the first
        result with a non-zero length is returned as-is.

        Args:
            line: Comma-separated keywords.

        Returns:
            The first non-empty result of ``utils.get_common_neighbor``, or
            an empty set when no adjacent pair yields one.
        """
        logger.debug("过滤后词组" + line)
        keywords = line.split(",")
        for previous, current in zip(keywords, keywords[1:]):
            shared = utils.get_common_neighbor(self.redis, previous, current)
            if len(shared) != 0:
                return shared
        return set()

    def format_txt(self, txts):
        """Normalise address suffixes and extract the alphanumeric tokens.

        In every text, "号楼" and "号院" are shortened to "号" and
        "附<digits>号" is rewritten to "<digits>号". The normalised texts are
        concatenated and all runs of digits/ASCII letters are returned.

        Args:
            txts: Iterable of text fragments.

        Returns:
            List of digit/letter runs found in the concatenated result.
        """
        normalised = []
        for txt in txts:
            txt = txt.replace("号楼", "号")
            txt = txt.replace("号院", "号")
            # The replacement must be a raw string: a plain "\1" is the
            # control character U+0001, not a reference to the captured
            # digits, and would drop the number from the output.
            txt = re.sub(r"附(\d+)号", r"\1号", txt)
            normalised.append(txt)
        return re.findall(r"[\dA-Za-z]+", "".join(normalised))

    def handle_num(self, line):
        """Split ``line`` on runs of characters that are not digits or ASCII letters.

        Returns:
            The list produced by the split; it contains empty strings where
            ``line`` starts or ends with a separator run.
        """
        separator = re.compile("[^0-9a-zA-Z]+")
        return separator.split(line)

    def save_one(self, line, target, f):
        """Write one ``line,ROOT<target>`` record to ``f``, echo it and flush.

        Args:
            line: Query text written in the first column.
            target: Matched text; prefixed with "ROOT" in the second column.
            f: Writable file-like object.
        """
        record = "%s,%s\n" % (line, "ROOT" + target)
        f.write(record)
        print(record)
        f.flush()

    def save_one_txt(self, result, res, score, line, f):
        """Write the matches for ``line`` to ``f`` and flush.

        One ``line,ROOT<sentence>`` record is written per key in ``result``,
        with the sentence looked up in ``self.graph.sent``. When ``result``
        is empty a single ``line,None`` record is written instead.

        Args:
            result: Collection of keys into ``self.graph.sent``.
            res: Unused.
            score: Unused.
            line: Query text written in the first column.
            f: Writable file-like object.
        """
        if len(result) > 0:
            for sent_key in result:
                f.write("%s,%s\n" % (line, "ROOT" + self.graph.sent[sent_key]))
        else:
            f.write("%s,%s\n" % (line, "None"))
        f.flush()

    def filter(self, filter_name, line, output):
        """Post-filter the candidates in ``output`` with the named strategy.

        Args:
            filter_name: "edit_dis" or "num_filter"; any other name yields
                None.
            line: For "edit_dis" the query text; for "num_filter" a
                comma-separated list of tokens.
            output: Candidate keys. "num_filter" intersects it with other
                sets, so it must support ``&`` there.

        Returns:
            ``["None"]`` when ``output`` equals the empty list. For
            "edit_dis", a one-element list with the sentence of the key
            chosen by ``editDist``. For "num_filter", the candidates narrowed
            token by token; narrowing stops, returning the current
            candidates, at the first token that has no sentences or that
            would leave nothing.
        """
        if output == []:
            return ["None"]
        if filter_name == "edit_dis":
            best_key = self.editDist(line, output)
            return [self.graph.sent[best_key]]
        if filter_name != "num_filter":
            return None
        for token in line.split(","):
            token_hits = utils.get_sent_from_word(self.redis, token)
            if len(token_hits) == 0:
                break
            narrowed = output & token_hits
            if len(narrowed) == 0:
                break
            output = narrowed
        return output
Exemplo n.º 7
0
def test_trie_has_words(num):
    """Check that a trie built from ``num`` sampled words has size ``num``."""
    sample = random.sample(FULL_LIST, num)
    assert Trie(sample)._size == num
Exemplo n.º 8
0
def trie():
    """Return a new, empty Trie."""
    empty_trie = Trie()
    return empty_trie
Exemplo n.º 9
0
def test_trie():
    """Return a small Trie built from TEST_WORDS."""
    filled_trie = Trie(TEST_WORDS)
    return filled_trie
Exemplo n.º 10
0
class Address_Acti(object):

    def __init__(self):
        """Set the batch sizes, then build and/or load the model per the flags.

        With FLAG_INIT_MODEL set, the two tries and the graph are created,
        ``init_model`` is run and the three objects are saved through
        ``pickle_helper``. With FLAG_LOAD_MODEL set, the three objects are
        loaded from the same file.
        """
        self.batch, self.test_batch = 6000000, 300
        if FLAG_INIT_MODEL:
            self.addr_tree = Trie()
            self.dict_tree = Trie()
            self.graph = my_graph.My_Graph()
            self.init_model()
            print("init_model ok")
            save_target = os.path.join(SAVE_PATH, SAVE_FILE)
            pickle_helper.save(
                save_target, [self.addr_tree, self.dict_tree, self.graph])
            print("pickle save ok")
        if FLAG_LOAD_MODEL:
            load_source = os.path.join(SAVE_PATH, SAVE_FILE)
            loaded = pickle_helper.load(load_source, [1, 2, 3])
            self.addr_tree, self.dict_tree, self.graph = loaded
            print("pickle load ok")

    def init_model(self):
        """Fill the address and dictionary tries from the standard address file.

        ``self.batch`` lines are drawn at random, with replacement, from the
        file at ``os.path.join(STD_PATH, STD_FILE)``. Each drawn line is
        cleaned with ``utils.clr`` and segmented with ``jieba.cut``; the word
        list is inserted into ``self.addr_tree`` and every word into
        ``self.dict_tree``.
        """
        # Context manager so the file handle is closed once the lines are read.
        with open(os.path.join(STD_PATH, STD_FILE)) as std_file:
            stand_lines = std_file.readlines()
        line_count = len(stand_lines)
        sampled = [
            stand_lines[np.random.randint(line_count)] for _ in range(self.batch)
        ]
        for line in sampled:
            line = utils.clr(line)
            words = list(jieba.cut(line))
            # The whole word sequence goes into the address trie ...
            self.addr_tree.insert_wd_lst(words)
            # ... and each individual word into the dictionary trie.
            for word in words:
                self.dict_tree.insert(word)

    def check_num_lst(self, nums1, nums2):
        """Report whether two token lists start with the same token.

        The first empty string, if any, is removed from each list IN PLACE
        before comparing; callers can observe that change. The lists are
        then compared position by position until the first difference.

        Args:
            nums1: List of tokens; mutated as described above.
            nums2: List of tokens; mutated as described above.

        Returns:
            True when at least the first remaining tokens are equal, False
            otherwise (including when either list is empty).
        """
        for tokens in (nums1, nums2):
            # list.remove raises ValueError when the value is absent, so only
            # remove the empty string when there is one.
            if "" in tokens:
                tokens.remove("")
        cnt = 0
        for i, j in zip(nums1, nums2):
            print(i, j)
            if i != j:
                break
            cnt += 1
        if cnt > 0:
            print(nums1, " equal ", nums2)
            return True
        print(nums1, " not equal ", nums2)
        return False

    def check_num(self, line1, line2):
        """Use the digit runs of two lines to decide whether they match.

        Args:
            line1: First line; supplies the anchor (the last three characters
                of the text before its first digit run, followed by that
                digit run).
            line2: Second line.

        Returns:
            False when both lines contain digit runs but the anchor does not
            occur in ``line2``. Otherwise True when the first digit runs of
            the two lines are equal, False when they differ or when either
            line has no digits.
        """
        print(line1, line2)
        cont = re.split(r"\d+", line1)[0]
        # Slicing an empty string yields "", so no length guard is needed.
        base = cont[-3:]
        nums1 = re.findall(r"\d+", line1)
        nums2 = re.findall(r"\d+", line2)
        print(line1, line2)
        print(nums1, nums2)
        lmin = min(len(nums1), len(nums2))
        cnt = 0
        for i, j in zip(nums1, nums2):
            print(i, j)
            if i != j:
                break
            cnt += 1
        if lmin > 0 and base + nums1[0] not in line2:
            print("False", line1, line2)
            return False
        if cnt > 0:
            return True
        print("False", line1, line2)
        return False

    def _check_num(self, line1, line2):
        """ use num to check weather same or not """
        base = ""
        cont = re.split("\d+",line1)[0]
        if len(cont)>0:
            #base = list(jieba.cut(cont))[-1]
            base = cont[-3:]
        nums1 = re.findall("\d+",line1)
        nums2 = re.findall("\d+",line2)
        print(line1, line2)
        print(nums1, nums2)
        lmin = min(len(nums1),len(nums2))
        #lmax = max(len(nums1),len(nums2))
        cnt = 0
        for i,j in zip(nums1[:lmin], nums2[:lmin]):
            if i == j:
                cnt+=1
                continue
            break
        if lmin>0:
            if not base+nums1[0] in line2:
                print("False",base+nums1[0])
                return False
        if cnt == lmin and cnt>0:
            return True
        elif cnt>1:
            return True
        else:
            print("False",nums1, nums2)
            return False

    """
    def check(self,line1,line2):
        txts = re.split("\d+",line)
        if len(txts[0]) < 2:
            return False
        nums = re.findall("\d+",line)
        if len(nums)<1:
            return False
        baseline = ""
        for i in range(2):
            if i > len(txts)-1:
                break
            elif i == 0:
                baseline+=txts[i][-3:]
            else:
                baseline+="\D+"
            if i > len(nums)-1:
                break
            else:
                baseline+=nums[i]
        if len(baseline) < 1:
            return False
        print(baseline , " weather in ", line2)
        if len(re.findall(baseline,line2))>0:
            print(baseline , " bingo in ", line2)
            print(re.findall(baseline,line2))
            return True
        else:
            return False
        return False

    def _check(self,line1,line2):
        txts = re.split("\d+",line)
        if len(txts[0]) < 2:
            return False
        nums = re.findall("\d+",line)
        if len(nums)<1:
            return False
        baseline = ""
        for i in range(2):
            if i > len(txts)-1:
                return baseline
            elif i == 0:
                baseline+=txts[i][-3:]
            else:
                baseline+=".+?"
            if i > len(nums)-1:
                return baseline
            else:
                baseline+=nums[i]
        if len(baseline) < 1:
            return False
        if len(re.findall(baseline,line2))>0:
            print(baseline , " is in ", line2)
            return True
        else:
            print(baseline , " not in ", line2)
            return False
    """

    def word_filter(self, line_pre):
        """Segment ``line_pre`` with jieba and keep the words found in ``self.dict_tree``.

        Returns:
            The words, in segmentation order, for which
            ``self.dict_tree.search`` is truthy.
        """
        return [
            word for word in jieba.cut(line_pre) if self.dict_tree.search(word)
        ]

    def common_nbs(self, comm_nbs):
        """Intersect neighbour collections until the intersection would be empty.

        Args:
            comm_nbs: List of collections of nodes.

        Returns:
            An empty set for an empty list; the single collection itself
            (unconverted) for a one-element list; otherwise the running
            intersection, taken left to right and frozen at the last
            non-empty value.
        """
        if len(comm_nbs) == 0:
            return set()
        if len(comm_nbs) == 1:
            return comm_nbs[0]
        shared = set(comm_nbs[0])
        for nbs in comm_nbs[1:]:
            overlap = shared & set(nbs)
            if len(overlap) == 0:
                break
            shared = overlap
        return shared
    def _check_num(self, line1, line2):
        """ use num to check weather same or not """
        base = ""
        cont = re.split("\d+",line1)[0]
        if len(cont)>0:
            #base = list(jieba.cut(cont))[-1]
            base = cont[-3:]
        nums1 = re.findall("\d+",line1)
        nums2 = re.findall("\d+",line2)
        print(line1, line2)
        print(nums1, nums2)
        lmin = min(len(nums1),len(nums2))
        #lmax = max(len(nums1),len(nums2))
        cnt = 0
        for i,j in zip(nums1[:lmin], nums2[:lmin]):
            if i == j:
                cnt+=1
                continue
            break
        if lmin>0:
            if not base+nums1[0] in line2:
                print("False",base+nums1[0])
                return False
        if cnt == lmin and cnt>0:
            return True
        elif cnt>1:
            return True
        else:
            print("False",nums1, nums2)
            return False

    """

    def check(self,line1,line2):
        txts = re.split("\d+",line)
        if len(txts[0]) < 2:
            return False
        nums = re.findall("\d+",line)
        if len(nums)<1:
            return False
        baseline = ""
        for i in range(2):
            if i > len(txts)-1:
                break
            elif i == 0:
                baseline+=txts[i][-3:]
            else:
                baseline+="\D+"
            if i > len(nums)-1:
                break
            else:
                baseline+=nums[i]
        if len(baseline) < 1:
            return False
        print(baseline , " weather in ", line2)
        if len(re.findall(baseline,line2))>0:
            print(baseline , " bingo in ", line2)
            print(re.findall(baseline,line2))
            return True
        else:
            return False
        return False

    def _check(self,line1,line2):
        txts = re.split("\d+",line)
        if len(txts[0]) < 2:
            return False
        nums = re.findall("\d+",line)
        if len(nums)<1:
            return False
        baseline = ""
        for i in range(2):
            if i > len(txts)-1:
                return baseline
            elif i == 0:
                baseline+=txts[i][-3:]
            else:
                baseline+=".+?"
            if i > len(nums)-1:
                return baseline
            else:
                baseline+=nums[i]
        if len(baseline) < 1:
            return False
        if len(re.findall(baseline,line2))>0:
            print(baseline , " is in ", line2)
            return True
        else:
            print(baseline , " not in ", line2)
            return False

    """
    def word_filter(self, line_pre):
        """Segment ``line_pre`` with jieba and keep the words found in ``self.dict_tree``.

        Returns:
            The words, in segmentation order, for which
            ``self.dict_tree.search`` is truthy.
        """
        known_words = []
        for candidate in jieba.cut(line_pre):
            if not self.dict_tree.search(candidate):
                continue
            known_words.append(candidate)
        return known_words

    def common_nbs(self, comm_nbs):
        """Intersect neighbour collections until the intersection would be empty.

        Args:
            comm_nbs: List of collections of nodes.

        Returns:
            An empty set for an empty list; the single collection itself
            (unconverted) for a one-element list; otherwise the running
            intersection, taken left to right and frozen at the last
            non-empty value.
        """
        count = len(comm_nbs)
        if count == 1:
            return comm_nbs[0]
        if count == 0:
            return set()
        first, *others = comm_nbs
        running = set(first)
        for other in others:
            candidate = running & set(other)
            if not candidate:
                return running
            running = candidate
        return running

    def _query_one(self, line):
        output,res = [],[]
        parts = re.split("[\da-zA-Z]+",line)
        if len(parts)>1:

            #result,res = tasks.handle_text.delay(parts[0])
            result,res = self.handle_text(parts[0])
            for hs in result:
                try:
                    self.graph.sent[hs]
                except:
                    continue
                _line = self.graph.sent[hs]
                _parts = re.split("[\da-zA-Z]+",line)
                __line = re.split("[^\da-zA-Z]+",_line)
                nums = re.split("[^\da-zA-Z]+",line)
                #nums = self.handle_num(line[len(parts[0]):])
                #nums = tasks.handle_num(line[len(parts[0]):])
                if True == self.check_num_lst(__line ,nums):
                    if "".join(res)[-2:] == __line[0][-2:]:
                        output.append(_line)
                    else:
                        print("".join(res)[-3:] ," not equal ", _parts[0][-3:])
        return output,res

    def handle_text(self, line):
        """Collect the nodes shared by adjacent keywords of the text part.

        ``line`` is passed through ``utils.clr`` and
        ``utils.before_first_num``, then reduced to keywords with
        ``word_filter``. For adjacent keyword pairs the common neighbours in
        ``self.graph.di`` are gathered and combined with ``common_nbs``.

        Args:
            line: Text to analyse.

        Returns:
            ``(result, res)``: the set of surviving nodes and the keyword
            list.
        """
        line = utils.clr(str(line))
        line_pre = utils.before_first_num(line)
        res = self.word_filter(line_pre)
        comm_nbs = []
        # NOTE(review): the range stops at len(res) - 2, so the last adjacent
        # pair is never examined — confirm this is intended.
        for i in range(len(res) - 2):
            print(res)
            try:
                nbs = list(nx.common_neighbors(self.graph.di, res[i], res[i + 1]))
            except (nx.NetworkXException, KeyError):
                print("networkx error")
                continue
            # Appended once; a repeated copy of the same list cannot change
            # the intersection.
            comm_nbs.append(nbs)
        # common_nbs hands back a plain list when given exactly one entry;
        # set() keeps the return type uniform for callers.
        result = set(self.common_nbs(comm_nbs))
        return result, res

    def format_txt(self, txts):
        """Normalise address suffixes and extract the alphanumeric tokens.

        In every text, "号楼" and "号院" are shortened to "号" and
        "附<digits>号" is rewritten to "<digits>号". The normalised texts are
        concatenated and all runs of digits/ASCII letters are returned.

        Args:
            txts: Iterable of text fragments.

        Returns:
            List of digit/letter runs found in the concatenated result.
        """
        normalised = []
        for txt in txts:
            txt = txt.replace("号楼", "号")
            txt = txt.replace("号院", "号")
            # The replacement must be a raw string: a plain "\1" is the
            # control character U+0001, not a reference to the captured
            # digits, and would drop the number from the output.
            txt = re.sub(r"附(\d+)号", r"\1号", txt)
            normalised.append(txt)
        return re.findall(r"[\dA-Za-z]+", "".join(normalised))

    def handle_num(self, line):
        """Split ``line`` on runs of characters that are not digits or ASCII letters.

        Returns:
            The list produced by the split; it contains empty strings where
            ``line`` starts or ends with a separator run.
        """
        splitter = re.compile("[^0-9a-zA-Z]+")
        pieces = splitter.split(line)
        return pieces

    def query_one(self, line):
        """Find the standard addresses that match one query line.

        Keywords are taken from the text returned by
        ``utils.before_first_num`` (via ``word_filter``) and extended with
        ``utils.numbers(line)``. The common neighbours of every adjacent
        keyword pair in ``self.graph.di`` are combined with ``common_nbs``,
        and each surviving node's sentence is kept when ``check_num``
        accepts it against the cleaned line.

        Args:
            line: Query text; cleaned with ``utils.clr`` before use.

        Returns:
            ``(_result, res)``: the accepted standard sentences and the
            keyword list.
        """
        line = utils.clr(str(line))
        line_pre = utils.before_first_num(line)
        res = self.word_filter(line_pre)
        res.extend(utils.numbers(line))
        comm_nbs = []
        for i in range(len(res) - 1):
            print(res)
            try:
                nbs = list(nx.common_neighbors(self.graph.di, res[i], res[i + 1]))
            except (nx.NetworkXException, KeyError):
                print("networkx error")
                continue
            # Appended once; a repeated copy of the same list cannot change
            # the intersection.
            comm_nbs.append(nbs)
        # common_nbs hands back a plain list when given exactly one entry;
        # set() keeps iteration over unique nodes in every case.
        result = set(self.common_nbs(comm_nbs))
        _result = []
        for node in result:
            sentence = self.graph.sent[node]
            if self.check_num(sentence, line):
                _result.append(sentence)
        return _result, res

    def query(self):
        df = pd.DataFrame()
        df['map'] = ""
        df['kw'] = ""
        df['target'] = ""
        input_file = []
        cnt=0
        for _,_,docs in os.walk(TEST_PATH):
            for doc in docs:
                lines = open(os.path.join(TEST_PATH, doc)).readlines()
                #lines = pd.read_csv(os.path.join(TEST_PATH, doc)).iloc[:,1]
                lines = [lines[np.random.randint(len(lines))] for i in range(self.test_batch)]
                for line in lines:
                    line = utils.clr(line)
                    print(line)
                    result,res = self._query_one(line)
                    #result = self.addr_tree.words_route(res)
                    if len(result) == 0:
                        df.loc[str(cnt),'map'] = line
                        df.loc[str(cnt),'target'] = "".join([])
                        df.loc[str(cnt),'kw'] = ",".join(res)
                        cnt+=1
                        continue
                    else:
                        for parent_res in result:
                            print(line, parent_res)
                            df.loc[str(cnt),'map'] = line
                            df.loc[str(cnt),'target'] = "ROOT"+parent_res
                            df.loc[str(cnt),'kw'] = ",".join(res)
                            cnt+=1
                    df.to_csv("./record.csv")
                    print(cnt, 'save')