def match_list2comm(self, threshold=0.8, output_path='match.xlsx'):
    """Cross-match ``self.comm_arr`` names with listing names and export both views.

    The distinct ``community_name`` values of ``ods_hse_detail`` are loaded,
    cleaned with ``ToolsBox.clearStr`` and scored against every key of
    ``self.comm_arr`` with ``MyVote.cmntVol``.  Two sheets are then saved:

    * ``Sheet1`` - one row per ``self.comm_arr`` entry with its best-scoring
      listing name and every listing name scoring at least ``threshold``.
    * ``Sheet2`` - one row per listing record with its best-scoring
      ``self.comm_arr`` entry.

    Args:
        threshold: Minimum score for a listing name to be appended to the
            ``match_all`` column (default 0.8, the previously fixed value).
        output_path: Workbook passed to ``ToolsBox.saveExcel`` (default
            ``'match.xlsx'``, the previously fixed value).
    """
    # Fetch the listing records and prepare the per-listing result fields.
    listings = ML.get_list_fromMysql(
        "SELECT distinct community_name FROM `ods_hse_detail`")
    for listing in listings:
        listing['clear_name'] = ToolsBox.clearStr(listing['community_name'])
        listing['matchid'] = '0'
        listing['match_vol'] = 0
        # Initialised so every Sheet2 row carries the same keys, even when
        # no entry of self.comm_arr scores above 0 for this listing.
        listing['match_comm_name'] = ''

    result = []
    # Each (community, listing) pair is scored once and the score feeds both
    # directions of the match.
    # NOTE(review): assumes MyVote.cmntVol is a pure function of its two
    # arguments - confirm.
    for comm_name, comm_id in self.comm_arr.items():
        best_vol = 0
        best_listing_name = ''
        close_matches = []  # "name(score);" for every listing >= threshold
        for listing in listings:
            vol = MyVote.cmntVol(comm_name, listing['clear_name'])
            # Strict ">" keeps the first listing seen on equal scores.
            if vol > best_vol:
                best_vol = vol
                best_listing_name = listing['community_name']
            if vol >= threshold:
                close_matches.append(
                    listing['community_name'] + '(%f);' % vol)
            # Strict ">" keeps the first community seen on equal scores.
            if vol > listing['match_vol']:
                listing['match_vol'] = vol
                listing['matchid'] = comm_id
                listing['match_comm_name'] = comm_name
        result.append({
            'comm_name': comm_name,
            'comm_id': comm_id,
            'vol': best_vol,
            'match_list_comm_name': best_listing_name,
            'match_all': ''.join(close_matches),
        })

    ToolsBox.saveExcel(output_path, result, "Sheet1")
    ToolsBox.saveExcel(output_path, listings, "Sheet2")
def deduplicate(self, output_path="C:\\Users\\15007\\Desktop\\去重结果.xlsx"):
    """Merge similar community records and export the de-duplicated list.

    Rows of ``self.input_sheet`` (loaded through ``ToolsBox.read_excel`` when
    it is empty) are processed in order.  Each row's ``self.col_name`` value
    is compared, via ``self.get_Comprehensive_similar``, with every alias of
    every record kept so far.  If any score reaches ``self.valve``, the row's
    name is appended to the ``alias`` field (``";"``-separated) of the best
    kept record; otherwise a deep copy of the row is kept as a new record
    whose ``alias`` is its own name.

    Choosing the best kept record: a higher score wins; on an equal score the
    candidate wins only if the alias being compared is longer than the
    current best record's name.

    Args:
        output_path: Workbook passed to ``ToolsBox.saveExcel`` (defaults to
            the previously hard-coded location).
    """
    if not self.input_sheet:
        self.input_sheet = ToolsBox.read_excel(self.file_name, self.sheet_name)

    name_col = self.col_name
    reduce_list = []  # records kept after de-duplication
    for count, source_row in enumerate(self.input_sheet, start=1):
        # Work on a copy so the input rows are not modified.
        record = copy.deepcopy(source_row)
        record_name = record[name_col]
        print("第%d个小区:%s(已更新%d个小区)" % (count, record_name, len(reduce_list)))

        best = None  # kept record most similar to the current row, if any
        for item in reduce_list:
            for name in item['alias'].split(";"):
                similar = self.get_Comprehensive_similar(record_name, name)
                if similar >= self.valve:
                    # The score is stored on the kept record itself, so a
                    # 'similar' field is part of the exported rows.
                    if best is None:
                        best = item
                        best['similar'] = similar
                        print("@@@@@@@@@@@@@@@@@@@%s与%s相似度为%f" % (record_name, best[name_col], best['similar']))
                    elif best['similar'] < similar:
                        print(">>>>>>>>>>>>>%s与%s原有相似度为%f,现在与%s相似度为%f" % (record_name, best[name_col], best['similar'], name, similar))
                        best = item
                        best['similar'] = similar
                        print(">>>>>>>>>>>>>>>>>>>>>>>>%s现在最相似小区为%s,相似度为%f" % (record_name, best[name_col], best['similar']))
                    elif best['similar'] == similar:
                        if len(name) > len(best[name_col]):
                            print("===========%s与%s原有相似度为%f,现在与%s相似度为%f" % (record_name, best[name_col], best['similar'], name, similar))
                            best = item
                            best['similar'] = similar
                            print("==========================%s现在最相似小区为%s,相似度为%f" % (record_name, best[name_col], best['similar']))
                        elif len(name) == len(best[name_col]):
                            # Full tie: keep the current best, report it.
                            print("???????????????%s与%s原有相似度为%f,现在与%s相似度为%f" % (record_name, best[name_col], best['similar'], name, similar))
                            ToolsBox.printDic(best)

        if best is None:
            # Nothing similar enough among the kept records: new community.
            record['alias'] = record_name
            reduce_list.append(record)
        else:
            # `best` is the kept record itself, so it is updated directly
            # instead of being looked up again by name.
            best['alias'] = best['alias'] + ";" + record_name

    ToolsBox.saveExcel(output_path, reduce_list)