Пример #1
0
    def match_list2comm(self, output_file='match.xlsx'):
        """Cross-match the names in ``self.comm_arr`` with listed community names.

        Listing names are loaded through ``ML.get_list_fromMysql`` (distinct
        ``community_name`` values of ``ods_hse_detail``); each row gets a
        ``clear_name`` produced by ``ToolsBox.clearStr``.  Two tables are then
        built and passed to ``ToolsBox.saveExcel``:

        * "Sheet1": one row per entry of ``self.comm_arr`` with the best
          scoring listing name (``match_list_comm_name`` / ``vol``) and, in
          ``match_all``, every listing name scoring at least 0.8.
        * "Sheet2": one row per listing name with the best scoring entry of
          ``self.comm_arr`` (``matchid`` / ``match_comm_name`` /
          ``match_vol``).

        Scores come from ``MyVote.cmntVol``.  Ties keep the first candidate
        encountered because only a strictly greater score replaces the
        current best.

        Args:
            output_file: Workbook path handed to ``ToolsBox.saveExcel`` for
                both sheets.  Defaults to the previously hard-coded
                ``'match.xlsx'``.
        """
        match_all_threshold = 0.8

        # Load the listing records and attach the normalised name to each row.
        list_records = ML.get_list_fromMysql(
            "SELECT distinct community_name FROM `ods_hse_detail`")
        for listing in list_records:
            listing['clear_name'] = ToolsBox.clearStr(listing['community_name'])

        # Direction 1: for every community in comm_arr, find the listing names
        # that resemble it.
        result = []
        for comm_name, comm_id in self.comm_arr.items():
            best_vol = 0
            best_listing_name = ''
            close_matches = []  # "name(score);" for every score >= threshold
            for listing in list_records:
                vol = MyVote.cmntVol(comm_name, listing['clear_name'])
                if vol > best_vol:
                    best_vol = vol
                    best_listing_name = listing['community_name']
                if vol >= match_all_threshold:
                    close_matches.append(
                        listing['community_name'] + '(' + '%f' % vol + ');')
            result.append({
                'comm_name': comm_name,
                'comm_id': comm_id,
                'vol': best_vol,
                'match_list_comm_name': best_listing_name,
                'match_all': ''.join(close_matches),
            })

        # Direction 2: for every listing name, find the best community in
        # comm_arr.
        for listing in list_records:
            listing['matchid'] = '0'
            listing['match_vol'] = 0
            # Always present, so rows without any positive score still carry
            # the same keys as matched rows.
            listing['match_comm_name'] = ''
            for comm_name, comm_id in self.comm_arr.items():
                vol = MyVote.cmntVol(comm_name, listing['clear_name'])
                if vol > listing['match_vol']:
                    listing['match_vol'] = vol
                    listing['matchid'] = comm_id
                    listing['match_comm_name'] = comm_name

        # NOTE(review): both calls target the same workbook; this relies on
        # ToolsBox.saveExcel keeping "Sheet1" when "Sheet2" is written --
        # confirm against its implementation.
        ToolsBox.saveExcel(output_file, result, "Sheet1")
        ToolsBox.saveExcel(output_file, list_records, "Sheet2")
Пример #2
0
    def deduplicate(self, output_path="C:\\Users\\15007\\Desktop\\去重结果.xlsx"):
        """Collapse similar community records into one record with aliases.

        Records are taken from ``self.input_sheet`` (loaded with
        ``ToolsBox.read_excel(self.file_name, self.sheet_name)`` when it is
        empty).  Each record's ``self.col_name`` value is compared, via
        ``self.get_Comprehensive_similar``, with every ';'-separated alias of
        the records already kept.  A score of at least ``self.valve`` makes a
        kept record a candidate; the best candidate is chosen by:

        1. higher score wins;
        2. on an equal score, the alias longer than the current best
           record's ``self.col_name`` value wins;
        3. otherwise the current best is kept.

        If a best candidate exists, the record's name is appended to that
        candidate's ``alias``; otherwise a deep copy of the record is kept as
        a new entry whose ``alias`` is its own name.  The kept records are
        finally written with ``ToolsBox.saveExcel``.

        Args:
            output_path: Destination workbook handed to
                ``ToolsBox.saveExcel``.  Defaults to the previously
                hard-coded desktop path.
        """
        if not self.input_sheet:
            self.input_sheet = ToolsBox.read_excel(self.file_name,
                                                   self.sheet_name)

        col = self.col_name
        reduce_list = []  # records kept after de-duplication
        for count, source_record in enumerate(self.input_sheet, start=1):
            # Work on a copy so the input sheet is never modified.
            record = copy.deepcopy(source_record)
            current_name = record[col]
            print("第%d个小区:%s(已更新%d个小区)" %
                  (count, current_name, len(reduce_list)))

            best = None  # most similar kept record found so far
            for item in reduce_list:
                for name in item['alias'].split(";"):
                    similar = self.get_Comprehensive_similar(
                        current_name, name)
                    if similar >= self.valve:
                        # NOTE: the score is stored on the kept record itself
                        # (key 'similar'), so it also appears in the saved
                        # output, exactly as before.
                        if best is None:
                            best = item
                            best['similar'] = similar
                            print("@@@@@@@@@@@@@@@@@@@%s与%s相似度为%f" %
                                  (current_name, best[col], best['similar']))
                        elif best['similar'] < similar:
                            print(
                                ">>>>>>>>>>>>>%s与%s原有相似度为%f,现在与%s相似度为%f" %
                                (current_name, best[col], best['similar'],
                                 name, similar))
                            best = item
                            best['similar'] = similar
                            print(
                                ">>>>>>>>>>>>>>>>>>>>>>>>%s现在最相似小区为%s,相似度为%f"
                                % (current_name, best[col], best['similar']))
                        elif best['similar'] == similar:
                            best_name_len = len(best[col])
                            if len(name) > best_name_len:
                                print(
                                    "===========%s与%s原有相似度为%f,现在与%s相似度为%f"
                                    % (current_name, best[col],
                                       best['similar'], name, similar))
                                best = item
                                best['similar'] = similar
                                print(
                                    "==========================%s现在最相似小区为%s,相似度为%f"
                                    % (current_name, best[col],
                                       best['similar']))
                            elif len(name) == best_name_len:
                                # Full tie: keep the current best, just report.
                                print(
                                    "???????????????%s与%s原有相似度为%f,现在与%s相似度为%f"
                                    % (current_name, best[col],
                                       best['similar'], name, similar))
                                ToolsBox.printDic(best)

            if best is not None:
                # `best` is the kept record itself, so extend its alias list
                # directly instead of searching reduce_list again by name.
                best['alias'] = best['alias'] + ";" + current_name
            else:
                # Nothing similar was kept yet: this is a new community.
                record['alias'] = current_name
                reduce_list.append(record)

        ToolsBox.saveExcel(output_path, reduce_list)