def main(p_args):
    """Snapshot inventory records for a fixed time window into an Excel file.

    :param p_args: command-line arguments (unused).
    :return: 0 on completion.
    """
    # NOTE(review): the time window is hard-coded; presumably an ad-hoc tool run.
    # BUGFIX: the result list used to be named `list`, shadowing the builtin.
    success, record_list = fetch_last_one_record_by_time(
        p_alter_time_start=20190702080000,
        p_alter_time_end=20190702150000)
    excel_title = [
        'outbound_warehouse', 'order_number', 'can_send_weight',
        'can_send_number', 'can_send_date', 'momentum_number',
        'momentum_weight', 'commodity'
    ]
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=record_list,
                                    p_new_file='./resources/snapshot.xls')
    # Read the snapshot back; serves as a round-trip sanity check of the dump.
    EXCEL_TABLE1 = './resources/snapshot.xls'
    old_excel_list = XUtils.excel_to_list(p_read_excel_file_path=EXCEL_TABLE1,
                                          p_sheet_name='Sheet1',
                                          p_excel_title_list=excel_title)
    # if success:
    #     update_inventory_table_by(p_new_value_dict=record)
    return 0
def has_the_same_tel_no(self, p_address_dict_a=None, p_address_dict_b=None, p_key=None):
    """Return True when the two address records share at least one mobile number.

    :param p_address_dict_a: first address record (dict).
    :param p_address_dict_b: second address record (dict).
    :param p_key: key whose text is scanned for mobile numbers.
    :return: True if some mobile number extracted from A also appears in B.
    """
    mobile_1_list = XUtils.fetch_all_mobiles(p_text=p_address_dict_a[p_key])
    mobile_2_list = XUtils.fetch_all_mobiles(p_text=p_address_dict_b[p_key])
    # BUGFIX: the old implementation counted duplicates in the concatenated
    # list, so a number repeated inside a SINGLE address was wrongly treated
    # as "shared by both". A set intersection tests exactly what the
    # docstring promises.
    return bool(set(mobile_1_list) & set(mobile_2_list))
def main(p_args):
    """Split the validated address workbook into a random increment sample and the remaining stock.

    Reads the address rows, drops rows without a valid lat/lng, dumps the
    cleaned rows, randomly moves a user-chosen number of rows into an
    "increment" workbook, and writes the remainder as the "stock" workbook.

    :param p_args: command-line arguments (unused).
    :return: 0 on success, 1 on invalid user input.
    """
    # NOTE: the sample size used to be a hard-coded constant (100/50); it is
    # read interactively here.
    try:
        G_INCREMENT_SIZE = int(input('请输入样本数(必须是正整数):'))
    except ValueError:
        # BUGFIX: a non-numeric answer used to raise an uncaught ValueError.
        print('样本数必须是正整数')
        return 1
    if G_INCREMENT_SIZE <= 0:
        # BUGFIX: zero/negative input previously produced a nonsensical run.
        print('样本数必须是正整数')
        return 1
    print('\n\n样本数为========>>%d\n' % (G_INCREMENT_SIZE))
    excel_title = ['序号', '地址编号', '省份', '城市', '区/县', '乡', '详细地址(拼接省市区)',
                   '详细地址(PROD地址)', '经度', '纬度', '标准地址', '标准地址是否新地址']
    # 1. Load the address rows.
    old_excel_list = XUtils.excel_to_list(p_read_excel_file_path='./resources/receiving_address_input_1.xlsx',
                                          p_sheet_name='Sheet1',
                                          p_excel_title_list=excel_title)
    old_len = len(old_excel_list)
    print('\n最原始的数据总条数old_excel_list length=====>>%d' % (old_len))
    # 2. Keep only rows whose lat/lng is usable.
    err_num = 0
    tmp_old_excel_list = []
    for tmp_dict in old_excel_list:
        if XUtils.has_valid_lat_lng(tmp_dict):
            tmp_old_excel_list.append(tmp_dict)
        else:
            err_num += 1
    old_excel_list = tmp_old_excel_list
    print('\n经纬度数据有问题的数据条数 invalid data num======>>%d' % (err_num))
    print('\n真实参与处理的数据条数(即抛弃了非法数据后) old_excel_list length=====>> (%d - %d) = %d\n' % (
        old_len, err_num, len(old_excel_list)))
    if G_INCREMENT_SIZE > len(old_excel_list):
        # BUGFIX: asking for more samples than available rows used to crash
        # inside random.randint(0, -1) once the pool ran dry.
        print('样本数超过可用数据条数(%d), 请重新运行' % (len(old_excel_list)))
        return 1
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=old_excel_list,
                                    p_new_file='./resources/receiving_address_input_1_ok.xls')
    # Increment sample: pop random rows (without replacement) out of the pool.
    increment_list = []
    while len(increment_list) < G_INCREMENT_SIZE:
        random_index = random.randint(0, len(old_excel_list) - 1)
        increment_list.append(old_excel_list.pop(random_index))
    # Stock = whatever stayed in the pool (a.k.a. table 1).
    stock_list = old_excel_list
    print('存量 increment_list.length===>%d' % (len(increment_list)))
    print('删除了存量后的表1 stock_list.length===>%d' % (len(stock_list)))
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=stock_list,
                                    p_new_file='./resources/receiving_address_stock_1_ok.xls')
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=increment_list,
                                    p_new_file='./resources/receiving_address_increment_1_ok.xls')
    return 0
def main(p_args):
    """Repair missing/invalid lat-lng values via the Baidu geocoding helper and dump the result.

    Rows whose coordinates fail a validity probe are re-geocoded from the
    concatenated address; rows that still cannot be geocoded are dropped.

    :param p_args: command-line arguments (unused).
    :return: 0 on completion.
    """
    excel_title = ['序号', '地址编号', '省份', '城市', '区/县', '乡', '详细地址(拼接省市区)',
                   '详细地址(PROD地址)', '经度', '纬度', '标准地址', '标准地址是否新地址']
    # 1. Load the address rows.
    old_excel_list = XUtils.excel_to_list(p_read_excel_file_path='./resources/receiving_address_input_1.xlsx',
                                          p_sheet_name='Sheet1',
                                          p_excel_title_list=excel_title)
    old_len = len(old_excel_list)
    print('\n最原始的数据总条数old_excel_list length=====>>%d' % (old_len))
    err_num = 0
    final_err_num = 0
    repaired_rows = []
    for row in old_excel_list:
        keep = True
        try:
            # EAFP validity probe: raises unless both coordinates are
            # non-zero numbers.
            _ = 1.0 / row['经度']
            _ = 1.0 / row['纬度']
        except Exception:
            err_num += 1
            # Geocode the concatenated address; (0.0, 0.0) signals failure.
            lat, lng = XUtils.findlogandlat(row['详细地址(拼接省市区)'])
            if lat != 0.0 and lng != 0.0:
                row['纬度'] = lat
                row['经度'] = lng
            else:
                final_err_num += 1
                keep = False
                print('err_num=%d addr=%s, lat=%f lng=%f' % (err_num, row['详细地址(拼接省市区)'], lat, lng))
        if keep:
            repaired_rows.append(row)
    old_excel_list = repaired_rows
    print('\n经纬度数据有问题的数据条数 invalid data num======>>%d' % (err_num))
    print('\n利用百度地图API纠正的数据条数=======>>%d, 无法纠正的数据条数======>>%d' % ((err_num - final_err_num), final_err_num))
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=old_excel_list,
                                    p_new_file='./resources/receiving_address_input_1_correct_150_via_baidu.xls')
    return 0
def compare(self, p_address_dict_a=None, p_address_dict_b=None):
    """Cosine-similarity comparison of two address records.

    Normalises the concatenated-address field of both records IN PLACE
    (blanks/special characters stripped, Chinese numerals converted to
    Arabic), strips the province/city/district prefix in memory only, then
    compares word vectors. A score >= G_80 counts as "same address" and is
    further confirmed by the parent strategy.

    :param p_address_dict_a: first address record (mutated in place).
    :param p_address_dict_b: second address record (mutated in place).
    :return: True when the two records are considered the same address.
    """
    key = '详细地址(拼接省市区)'
    for record in (p_address_dict_a, p_address_dict_b):
        # Physically normalise: drop blanks / special characters.
        record[key] = XUtils.remove_noise_empty_punctuation(
            p_address_dict=record, p_key=key)
        # Unify e.g. 国储八三二 and 国储832 as 国储832 to raise the match rate.
        record[key] = XUtils.convert_chinese_numerals_2_arabic_numerals_for_dict(
            p_address_dict=record, p_key=key)
    # Strip province/city/district in memory only (records stay untouched).
    text_a = XUtils.remove_noise_province_city_district(
        p_address_dict=p_address_dict_a, p_key=key)
    text_b = XUtils.remove_noise_province_city_district(
        p_address_dict=p_address_dict_b, p_key=key)
    vec_a, vec_b = self.get_word_vector(text_a, text_b)
    score = self.cos_dist(vec_a, vec_b)
    # NOTE: 0.8 (G_80) is the tunable "same address" threshold.
    matched = score >= AddressCosineSimilarityStrategy.G_80
    if matched:
        matched = super(AddressCosineSimilarityStrategy, self).compare(
            p_address_dict_a=p_address_dict_a,
            p_address_dict_b=p_address_dict_b)
    return matched
def contains(p_new_excel_list=None, p_old_dict=None):
    """Look for a "brother" of p_old_dict inside p_new_excel_list.

    Two records are brothers when they are considered the same address. The
    combined similarity mixes the string similarity (a) and the distance
    similarity (b): sim = ALPHA*a + (1-ALPHA)*b.

    :param p_new_excel_list: already-grouped records to search in.
    :param p_old_dict: candidate record to match.
    :return: (rst, brother_dict, max_sim) — rst is True when a brother was
             found; brother_dict is the brother with the highest sim (None
             when rst is False); max_sim is that brother's sim.
    """
    rst = False
    max_sim = -3721.4728  # sentinel: "no brother selected yet"
    brother_dict = None
    for tmp_new_dict in p_new_excel_list:
        # Distance check (great-circle); real_distance is the true distance
        # between the two points.
        match_distance, real_distance = GEODistanceStrategy().compare(
            p_address_dict_a=tmp_new_dict, p_address_dict_b=p_old_dict)
        x = real_distance
        # String similarity of the detail-address fields.
        rst_str_diff, sim_string = AddressStringDiffStrategy().compare(
            p_address_dict_a=tmp_new_dict, p_address_dict_b=p_old_dict)
        a = sim_string
        # b is the distance-based similarity:
        #   b = (FIXED_DISTANCE - x) / FIXED_DISTANCE, floored at B_MIN so a
        #   string score >= A_THEORY always pushes sim past BETA regardless
        #   of distance. Without usable coordinates, b falls back to 0.
        if XUtils.has_valid_lat_lng(p_old_dict):
            b = (XConstants.FIXED_DISTANCE - x) / XConstants.FIXED_DISTANCE
            B_MIN = (XConstants.BETA - XConstants.A_THEORY * XConstants.ALPHA) / (1 - XConstants.ALPHA)
            if b < B_MIN:
                b = B_MIN
        else:
            b = 0
        # BUGFIX: this combined score was commented out in the original even
        # though the lines below read `sim`, which raised a NameError on the
        # first iteration. Restored per the documented formula.
        sim = XConstants.ALPHA * a + (1 - XConstants.ALPHA) * b
        if rst is False:
            # Once one brother has matched, rst stays True; afterwards we
            # only keep looking for a better-scoring brother.
            rst = sim >= 0.6  # NOTE: tunable judgement threshold.
        tmp_new_dict['sim'] = sim
        # Keep the brother with the highest sim seen so far.
        if rst is True:
            if brother_dict is None or tmp_new_dict['sim'] > brother_dict['sim']:
                brother_dict = tmp_new_dict
                max_sim = brother_dict['sim']
    return rst, brother_dict, max_sim
def main(p_args):
    """Group near-duplicate addresses, deduplicate them, and dump both tables.

    Pipeline:
      1. read the stock workbook (table 1);
      2. drop rows with unusable lat/lng;
      3. assign a group_id to every row — rows matching an existing row (via
         contains()) inherit that row's group_id, otherwise open a new group;
      4. dump the grouped rows (table 2) sorted by group_id;
      5. keep only the longest-address row of each group (table 3) and dump it.

    :param p_args: command-line arguments (unused).
    :return: 0 on completion.
    """
    global g_contains_execute_times
    global g_contains_cost_time
    g_contains_execute_times = 0
    g_contains_cost_time = 0
    start = datetime.datetime.now()
    excel_title = ['序号', '地址编号', '省份', '城市', '区/县', '乡', '详细地址(拼接省市区)',
                   '详细地址(PROD地址)', '经度', '纬度', '标准地址', '标准地址是否新地址']
    # 1. Load table 1. NOTE: switch the path here to process the raw input.
    # EXCEL_TABLE1 = './resources/receiving_address_input_1.xlsx'
    EXCEL_TABLE1 = './resources/receiving_address_stock_1_ok.xls'
    old_excel_list = XUtils.excel_to_list(p_read_excel_file_path=EXCEL_TABLE1,
                                          p_sheet_name='Sheet1',
                                          p_excel_title_list=excel_title)
    old_len = len(old_excel_list)
    print('\n表1数据总条数old_excel_list length=====>>%d' % (old_len))
    # 2. Keep only rows with usable coordinates (EAFP probe: the division
    #    raises on zero or non-numeric values).
    err_num = 0
    tmp_old_excel_list = []
    for tmp_dict in old_excel_list:
        try:
            1.0 / tmp_dict['经度']
            1.0 / tmp_dict['纬度']
            tmp_old_excel_list.append(tmp_dict)
        except Exception:
            err_num += 1
    old_excel_list = tmp_old_excel_list
    print('\n经纬度数据有问题的数据条数 invalid data num======>>%d' % (err_num))
    print('\n真实参与处理的数据条数(即抛弃了非法数据后) old_excel_list length=====>> (%d - %d) = %d\n' % (
        old_len, err_num, len(old_excel_list)))
    # 3. Group identical/similar addresses under one shared group_id, timing
    #    every contains() call for the stats printed at the end.
    new_excel_dict_grouped = {}
    new_excel_list_grouped = []
    group_id = 0
    for tmp_dict in old_excel_list:
        time_start = time.time()
        rst, brother_dict, sim = contains(p_new_excel_list=new_excel_list_grouped,
                                          p_old_dict=tmp_dict)
        time_end = time.time()
        g_contains_execute_times += 1
        g_contains_cost_time += (time_end - time_start)
        if rst is False:
            group_id += 1
            new_excel_dict_grouped[str(group_id)] = []  # open a new group
            tmp_dict['group_id'] = group_id
        else:
            # Careful: reuse the brother's group_id, not the newest group_id.
            tmp_dict['group_id'] = brother_dict['group_id']
        new_excel_list_grouped.append(tmp_dict)
        # Rows of the same group share one group_id bucket.
        new_excel_dict_grouped[str(tmp_dict['group_id'])].append(tmp_dict)
    print('\n组数 group_id=====>>%d' % (group_id))
    print('\n分组后的新数据条数 new_excel_list_grouped length=====>>%d\n' % (len(new_excel_list_grouped)))
    # 4. Dump the grouped rows (table 2), sorted by group_id.
    excel_title.insert(0, 'group_id')
    sorted_list = sorted(new_excel_list_grouped, key=lambda x: x['group_id'], reverse=False)
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=sorted_list,
                                    p_new_file='./resources/receiving_address_group_by_1.xls')
    # 5. Deduplicate: per group keep the row with the longest detail address.
    new_excel_list_filtered = []
    new_excel_dict_filtered = {}
    for value in new_excel_dict_grouped.values():  # value: one group's row list
        rst = fetch_max_length_item(p_excel_sub_list=value)
        new_excel_list_filtered.append(rst)
        new_excel_dict_filtered[rst['group_id']] = rst
    print('\n去重后的新数据条数 new_excel_list_filtered length=====>>%d\n' % (len(new_excel_list_filtered)))
    # 6. Dump the deduplicated rows (table 3).
    sorted_list = sorted(new_excel_list_filtered, key=lambda x: x['group_id'], reverse=False)
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=sorted_list,
                                    p_new_file='./resources/receiving_address_filtered_1.xls')
    print('\n\n----------------->g_contains_cost_time<-----------------')
    print(g_contains_execute_times)
    print(g_contains_cost_time)
    # BUGFIX: guard the average against ZeroDivisionError on empty input.
    if g_contains_execute_times > 0:
        print(g_contains_cost_time / g_contains_execute_times)
    end = datetime.datetime.now()
    print('\n\n----------------->增量耗时 cost time<-----------------')
    print(end - start)
    print('\n程序执行完毕 !!! DONE DONE DONE DONE DONE DONE DONE DONE DONE DONE DONE')
    return 0
def main(p_args):
    """Stock grouping + increment matching pipeline.

    Phase A (stock): read table 1, drop bad-coordinate rows, group similar
    addresses (table 2), keep the longest address per group (table 3).
    Phase B (increment): read the increment workbook, match each row against
    the grouped stock, dump matched/unmatched results, and re-dump tables 2
    and 3 when the increment created new groups.

    :param p_args: command-line arguments (unused).
    :return: 0 on completion.
    """
    start = datetime.datetime.now()
    excel_title = ['序号', '地址编号', '省份', '城市', '区/县', '乡', '详细地址(拼接省市区)',
                   '详细地址(PROD地址)', '经度', '纬度', '标准地址', '标准地址是否新地址']
    # 1. Load table 1. NOTE: switch the path to process the raw input instead.
    # EXCEL_TABLE1 = './resources/receiving_address_input_1.xlsx'
    EXCEL_TABLE1 = './resources/receiving_address_stock_1_ok.xls'
    old_excel_list = XUtils.excel_to_list(p_read_excel_file_path=EXCEL_TABLE1,
                                          p_sheet_name='Sheet1',
                                          p_excel_title_list=excel_title)
    old_len = len(old_excel_list)
    print('\n表1数据总条数old_excel_list length=====>>%d' % (old_len))
    # 2. Drop rows whose lat/lng is unusable; keep only valid coordinates.
    #    (EAFP probe: the division raises on zero or non-numeric values.)
    err_num = 0
    tmp_old_excel_list = []
    for tmp_dict in old_excel_list:
        try:
            a = 1.0 / tmp_dict['经度']
            a = 1.0 / tmp_dict['纬度']
            tmp_old_excel_list.append(tmp_dict)
        except Exception as e:
            err_num += 1
    old_excel_list = tmp_old_excel_list
    print('\n经纬度数据有问题的数据条数 invalid data num======>>%d' % (err_num))
    print('\n真实参与处理的数据条数(即抛弃了非法数据后) old_excel_list length=====>> (%d - %d) = %d\n' % (
        old_len, err_num, len(old_excel_list)))
    # 3. Group identical/similar addresses: the first row opens group 1; each
    #    later row either joins the group of its matching "brother" or opens
    #    a new group. Sorting by group_id then clusters duplicates together.
    new_excel_dict_grouped = {}
    new_excel_list_grouped = []
    group_id = 0
    for tmp_dict in old_excel_list:
        # NOTE(review): this call unpacks 2 values while the contains() seen
        # elsewhere in this review chunk returns 3 (rst, brother_dict,
        # max_sim). Presumably this main() lives in a module with a 2-value
        # contains() — confirm against the actual module.
        rst, brother_dict = contains(p_new_excel_list=new_excel_list_grouped,
                                     p_old_dict=tmp_dict)
        if rst is False:
            group_id += 1
            # Open a new group.
            new_excel_dict_grouped[str(group_id)] = []
            # Add the group_id column to the row.
            tmp_dict['group_id'] = group_id
        else:
            # Careful: reuse the brother's group_id, not the newest group_id.
            tmp_dict['group_id'] = brother_dict['group_id']
        new_excel_list_grouped.append(tmp_dict)
        # Rows of the same group share one group_id bucket.
        new_excel_dict_grouped[str(tmp_dict['group_id'])].append(tmp_dict)
    print('\n组数 group_id=====>>%d' % (group_id))
    print('\n分组后的新数据条数 new_excel_list_grouped length=====>>%d\n' % (len(new_excel_list_grouped)))
    # 4. Dump the grouped rows (table 2), sorted by group_id.
    excel_title.insert(0, 'group_id')
    sorted_list = sorted(new_excel_list_grouped, key=lambda x: x['group_id'], reverse=False)
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=sorted_list,
                                    p_new_file='./resources/receiving_address_group_by_1.xls')
    # 5. Deduplicate (the key step): per group keep the row with the longest
    #    detail address; that row represents the group in table 3.
    new_excel_list_filtered = []
    new_excel_dict_filtered = {}
    # Note: value is the list of rows belonging to one group.
    for (key, value) in new_excel_dict_grouped.items():
        rst = fetch_max_length_item(p_excel_sub_list=value)
        new_excel_list_filtered.append(rst)
        new_excel_dict_filtered[rst['group_id']] = rst
    print('\n去重后的新数据条数 new_excel_list_filtered length=====>>%d\n' % (len(new_excel_list_filtered)))
    # 6. Dump the deduplicated rows (table 3).
    sorted_list = sorted(new_excel_list_filtered, key=lambda x: x['group_id'], reverse=False)
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=sorted_list,
                                    p_new_file='./resources/receiving_address_filtered_1.xls')
    end = datetime.datetime.now()
    print('\n\n----------------->存量耗时 cost time<-----------------')
    print(end - start)
    start = datetime.datetime.now()
    # 7. Read the increment workbook (may hold a single row) into old_excel_list.
    excel_title.remove('group_id')
    # NOTE: switch the path here for the raw increment input.
    # EXCEL_TABLE_INCREMENT = './resources/receiving_address_increment_1.xlsx'
    EXCEL_TABLE_INCREMENT = './resources/receiving_address_increment_1_ok.xls'
    old_excel_list = XUtils.excel_to_list(p_read_excel_file_path=EXCEL_TABLE_INCREMENT,
                                          p_sheet_name='Sheet1',
                                          p_excel_title_list=excel_title)
    print('\n增量数据条数 old_excel_list length=====>>%d\n' % (len(old_excel_list)))
    increment_list_match_success = []
    brother_in_table3_of_increment_list = []
    increment_list_match_failed = []
    should_create_new_group_4_increment = False
    excel_title.insert(0, 'group_id')
    for tmp_dict in old_excel_list:
        # NOTE(review): same 2-value unpack of contains() as above — confirm.
        rst, brother_dict = contains(p_new_excel_list=new_excel_list_grouped,
                                     p_old_dict=tmp_dict)
        if rst is False:
            tmp_dict['标准地址'] = '匹配失败'
            group_id += 1
            tmp_dict['group_id'] = group_id
            # Unmatched increment rows join table 2 as a brand-new group.
            new_excel_list_grouped.append(tmp_dict)
            # Open the new group.
            new_excel_dict_grouped[str(group_id)] = []
            new_excel_dict_grouped[str(tmp_dict['group_id'])].append(tmp_dict)
            # increment_list_match_failed.append(tmp_dict)
            should_create_new_group_4_increment = True
        else:
            tmp_dict['标准地址'] = '匹配成功'
            tmp_dict['group_id'] = brother_dict['group_id']
            increment_list_match_success.append(tmp_dict)
        pass
    # Rebuild table 3 from the (possibly extended) grouped dict.
    new_excel_list_filtered = []
    new_excel_dict_filtered = {}
    # Note: value is the list of rows belonging to one group.
    for (key, value) in new_excel_dict_grouped.items():
        rst = fetch_max_length_item(p_excel_sub_list=value)
        new_excel_list_filtered.append(rst)
        new_excel_dict_filtered[rst['group_id']] = rst
    # Collect each matched row's representative ("brother") from table 3.
    for tmp_dict in increment_list_match_success:
        brother_in_table3 = new_excel_dict_filtered[tmp_dict['group_id']]
        brother_in_table3['标准地址是否新地址'] = '我是存量'
        # print('表三中对应的地址信息如下=====>>:')
        # pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(brother_in_table3)
        brother_in_table3_of_increment_list.append(brother_in_table3)
    # TODO For the matched test rows, export an Excel pairing the increment
    # columns with the matched table-3 columns.
    print('\n增量匹配成功的数据条数 increment_list_match_success length=====>>%d\n' %
          (len(increment_list_match_success)))
    print('\n增量匹配成功的兄弟们 brother_in_table3_of_increment_list length=====>>%d\n' % (
        len(brother_in_table3_of_increment_list)))
    print('\n增量匹配失败的数据条数 increment_list_match_success length=====>>%d\n' % (len(increment_list_match_failed)))
    sorted_list = sorted(increment_list_match_success, key=lambda x: x['group_id'], reverse=False)
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=sorted_list,
                                    p_new_file='./resources/receiving_address_increment_match_success.xls')
    sorted_list = sorted(brother_in_table3_of_increment_list, key=lambda x: x['group_id'], reverse=False)
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=sorted_list,
                                    p_new_file='./resources/receiving_address_increment_brother_in_table3.xls')
    sorted_list = sorted(increment_list_match_failed, key=lambda x: x['group_id'], reverse=False)
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=sorted_list,
                                    p_new_file='./resources/receiving_address_increment_match_failed.xls')
    # increment_list_match_success.extend(brother_in_table3_of_increment_list)
    sorted_list = sorted(increment_list_match_success, key=lambda x: x['group_id'], reverse=False)
    XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                    p_new_excel_list=sorted_list,
                                    p_new_file='./resources/receiving_address_compare.xls')
    # 8. When new groups were created by the increment, re-dump tables 2 and 3.
    if should_create_new_group_4_increment:
        sorted_list = sorted(new_excel_list_grouped, key=lambda x: x['group_id'], reverse=False)
        XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                        p_new_excel_list=sorted_list,
                                        p_new_file='./resources/receiving_address_group_by_2.xls')
        sorted_list = sorted(new_excel_list_filtered, key=lambda x: x['group_id'], reverse=False)
        XUtils.process_and_dump_2_excel(p_excel_title=excel_title,
                                        p_new_excel_list=sorted_list,
                                        p_new_file='./resources/receiving_address_filtered_2.xls')
    end = datetime.datetime.now()
    print('\n\n----------------->增量耗时 cost time<-----------------')
    print(end - start)
    print('\n程序执行完毕 !!! DONE DONE DONE DONE DONE DONE DONE DONE DONE DONE DONE')
    return 0
#!/usr/bin/env python # -*- encoding: utf-8 -*- import pprint, sys, os, os.path, xlrd, xlwt, random, re from app.XUtils import XUtils if __name__ == '__main__': print(XUtils.convert_chinese_numerals_2_arabic_numerals_4_str(p_str='国储八三二')) print(XUtils.convert_chinese_numerals_2_arabic_numerals_4_str(p_str='国储八三三三三三三三三三三二')) print(XUtils.convert_chinese_numerals_2_arabic_numerals_4_str(p_str='国储832')) print(XUtils.convert_chinese_numerals_2_arabic_numerals_4_str(p_str='八三二国储')) print(XUtils.convert_chinese_numerals_2_arabic_numerals_4_str(p_str='八三二')) print(XUtils.convert_chinese_numerals_2_arabic_numerals_4_str(p_str='八三国储二')) print(XUtils.convert_chinese_numerals_2_arabic_numerals_4_str(p_str='')) sys.exit(0)
def compare(self, p_address_dict_a=None, p_address_dict_b=None):
    """Compare two address records by string similarity of their detail fields.

    Both the concatenated field ('详细地址(拼接省市区)') and the PROD field
    ('详细地址(PROD地址)') must score >= G_80 (0.8) for the records to count
    as the same address. A score in [G_60, G_80) gets a second chance: when
    both texts contain the same mobile number, the score is promoted to
    G_100. Both input dicts are normalised IN PLACE.

    :param p_address_dict_a: first address record (mutated in place).
    :param p_address_dict_b: second address record (mutated in place).
    :return: (rst, r1) — rst is the overall match verdict; r1 is the
             similarity of the concatenated-address field.
    """
    # Similarity of the concatenated-address field.
    P_KEY = '详细地址(拼接省市区)'
    # Physically normalise: drop blanks / special characters.
    p_address_dict_a[P_KEY] = XUtils.remove_noise_empty_punctuation(p_address_dict=p_address_dict_a,
                                                                    p_key=P_KEY)
    p_address_dict_b[P_KEY] = XUtils.remove_noise_empty_punctuation(p_address_dict=p_address_dict_b,
                                                                    p_key=P_KEY)
    # Unify e.g. 国储八三二 and 国储832 as 国储832 to raise the match rate.
    p_address_dict_a[P_KEY] = XUtils.convert_chinese_numerals_2_arabic_numerals_for_dict(p_address_dict=p_address_dict_a,
                                                                                         p_key=P_KEY)
    p_address_dict_b[P_KEY] = XUtils.convert_chinese_numerals_2_arabic_numerals_for_dict(p_address_dict=p_address_dict_b,
                                                                                         p_key=P_KEY)
    # Strip province/city/district in memory only (records stay untouched).
    query_str = XUtils.remove_noise_province_city_district(p_address_dict=p_address_dict_a,
                                                           p_key=P_KEY)
    s1 = XUtils.remove_noise_province_city_district(p_address_dict=p_address_dict_b,
                                                    p_key=P_KEY)
    r1 = difflib.SequenceMatcher(None, query_str, s1).quick_ratio()
    # Second chance for scores in [G_60, G_80): same mobile number => G_100.
    if r1 < AddressStringDiffStrategy.G_80 and r1 >= AddressStringDiffStrategy.G_60:
        with_same_tel_no = self.has_the_same_tel_no(p_address_dict_a=p_address_dict_a,
                                                    p_address_dict_b=p_address_dict_b,
                                                    p_key=P_KEY)
        r1 = AddressStringDiffStrategy.G_100 if with_same_tel_no else r1
    # Similarity of the PROD-address field; any failure (e.g. a missing key)
    # counts this field as a full match — deliberately lenient best-effort.
    try:
        P_KEY = '详细地址(PROD地址)'
        # Physically normalise: drop blanks / special characters.
        p_address_dict_a[P_KEY] = XUtils.remove_noise_empty_punctuation(p_address_dict=p_address_dict_a,
                                                                        p_key=P_KEY)
        p_address_dict_b[P_KEY] = XUtils.remove_noise_empty_punctuation(p_address_dict=p_address_dict_b,
                                                                        p_key=P_KEY)
        # Unify e.g. 国储八三二 and 国储832 as 国储832 to raise the match rate.
        p_address_dict_a[P_KEY] = XUtils.convert_chinese_numerals_2_arabic_numerals_for_dict(p_address_dict=p_address_dict_a,
                                                                                             p_key=P_KEY)
        p_address_dict_b[P_KEY] = XUtils.convert_chinese_numerals_2_arabic_numerals_for_dict(p_address_dict=p_address_dict_b,
                                                                                             p_key=P_KEY)
        # Strip province/city/district in memory only (records stay untouched).
        query_str = XUtils.remove_noise_province_city_district(p_address_dict=p_address_dict_a,
                                                               p_key=P_KEY)
        s1 = XUtils.remove_noise_province_city_district(p_address_dict=p_address_dict_b,
                                                        p_key=P_KEY)
        r2 = difflib.SequenceMatcher(None, query_str, s1).quick_ratio()
        # Second chance for scores in [G_60, G_80): same mobile number => G_100.
        if r2 < AddressStringDiffStrategy.G_80 and r2 >= AddressStringDiffStrategy.G_60:
            with_same_tel_no = self.has_the_same_tel_no(p_address_dict_a=p_address_dict_a,
                                                        p_address_dict_b=p_address_dict_b,
                                                        p_key=P_KEY)
            r2 = AddressStringDiffStrategy.G_100 if with_same_tel_no else r2
    except Exception as e:
        r2 = AddressStringDiffStrategy.G_100
    # NOTE: 0.8 (G_80) is the tunable "same address" threshold.
    rst = r1 >= AddressStringDiffStrategy.G_80 and r2 >= AddressStringDiffStrategy.G_80
    # NOTE(review): the super() confirmation below appears commented out here,
    # unlike the cosine strategy where the equivalent call is active — confirm
    # whether that is intentional.
    # if rst is True:
    #     rst = super(AddressStringDiffStrategy, self).compare(p_address_dict_a=p_address_dict_a,
    #                                                          p_address_dict_b=p_address_dict_b)
    return rst, r1