def insert_to_mysql(self, connection, data):
    """
    Insert one or more rows into ``self.mysql_table``.

    The INSERT statement is assembled as text and handed to
    ``self.cs_commit``. Errors raised by the commit are logged and
    swallowed, so the method always returns ``None``.

    :param connection: connection object forwarded to ``self.cs_commit``
    :param data: a dict mapping column name to value (one row), or a
        list/tuple of such dicts (multi-row insert)
    :raises Exception: if ``data`` is neither a dict nor a list/tuple
    :return: None
    """
    mysql_logger = Logger().logger

    def render_row(values):
        # Produces the same text as str(tuple(values)) without the trailing
        # comma of a one-element tuple, except that None is written as SQL
        # NULL instead of the bare word ``None`` (which MySQL rejects).
        # NOTE(review): values are inlined as Python reprs rather than bound
        # parameters, so this is only safe for trusted input.
        rendered = ("NULL" if value is None else repr(value) for value in values)
        return "(" + ", ".join(rendered) + ")"

    if isinstance(data, dict):
        rows = [data]
    elif isinstance(data, (list, tuple)):
        rows = list(data)
    else:
        raise Exception("not format type of data")

    if not rows:
        # An empty sequence used to fail with IndexError on data[0];
        # there is nothing to insert, so skip the database round trip.
        mysql_logger.warning("MySQL 插入数据为空, 跳过插入")
        return

    # Column names come from the first row only.
    # NOTE(review): every row is assumed to list its keys in the same order
    # as the first row; values are emitted in each row's own order.
    columns = ",".join(rows[0].keys())
    values_sql = ",".join(render_row(row.values()) for row in rows)
    sql = f"INSERT INTO {self.mysql_table} ({columns}) VALUES{values_sql}"

    try:
        mysql_logger.info(f"网络声量sql==>{sql}")
        count = self.cs_commit(connection=connection, sql=sql)
        mysql_logger.info(f"MySQL 插入成功 {count} 条")
    except Exception as e:
        mysql_logger.exception(f"网络声量 插入失败,ERROR: {e}")
def delete_from_mysql(self, connection, where_condition):
    """
    Delete rows from ``self.mysql_table``.

    Errors raised by ``self.cs_commit`` are logged and swallowed, so the
    method always returns ``None``.

    :param connection: connection object forwarded to ``self.cs_commit``
    :param where_condition: the filter, with or without a leading
        ``WHERE`` keyword (any letter case)
    :return: None
    """
    mysql_logger = Logger().logger

    # Only a *leading* WHERE keyword means the caller already supplied it.
    # A plain substring test misfires on conditions such as
    # "name = 'somewhere'" or on subqueries that contain their own WHERE.
    if re.match(r"\s*where\b", where_condition, re.IGNORECASE):
        sql = f"DELETE FROM {self.mysql_table} {where_condition}"
    else:
        sql = f"DELETE FROM {self.mysql_table} WHERE {where_condition}"

    try:
        count = self.cs_commit(connection=connection, sql=sql)
        mysql_logger.info(f"MySQL 删除成功 {count} 条")
    except Exception as e:
        mysql_logger.exception(f"MySQL 删除失败,ERROR: {e}")
def update_to_mysql(self, connection, data, where_condition):
    """
    Update rows of ``self.mysql_table``.

    Errors raised by ``self.cs_commit`` are logged and swallowed, so the
    method always returns ``None``.

    :param connection: connection object forwarded to ``self.cs_commit``
    :param data: dict mapping column name to the new value
    :param where_condition: the filter, with or without a leading
        ``WHERE`` keyword (any letter case)
    :return: None
    """
    mysql_logger = Logger().logger

    # NOTE(review): every value is inlined between single quotes without
    # escaping, so a value containing a quote breaks the statement and None
    # is written as the text 'None'; only safe for trusted input.
    set_clause = ",".join(f"{key} = '{value}'" for key, value in data.items())

    # Only a *leading* WHERE keyword means the caller already supplied it.
    # A plain substring test misfires on conditions such as
    # "name = 'somewhere'" or on subqueries that contain their own WHERE.
    if re.match(r"\s*where\b", where_condition, re.IGNORECASE):
        sql = f"UPDATE {self.mysql_table} SET {set_clause} {where_condition}"
    else:
        sql = f"UPDATE {self.mysql_table} SET {set_clause} WHERE {where_condition}"

    try:
        count = self.cs_commit(connection=connection, sql=sql)
        mysql_logger.info(f"MySQL 更新成功 {count} 条")
    except Exception as e:
        mysql_logger.exception(f"MySQL 更新失败,ERROR: {e}")
class AllToPhoenix(object):
    """
    Copy bank-branch documents from MongoDB into the Phoenix/HBase table
    ``ORGANIZE_FINASSIST``.

    For every entity code in ``code_list`` the module of the same name is
    imported and its ``data_shuffle`` function cleans each document. Each
    cleaned record is then geocoded by :meth:`shuffle_for_area` and upserted
    through ``PhoenixHbase``.

    ``run`` neither creates nor drops the target table, and it does not
    modify, delete or archive the MongoDB source documents.
    """

    def __init__(self):
        # Entity codes that are currently switched off:
        # "ABCORGANIZE", "BOCOMORGANIZE", "BOCORGANIZE", "CBHBORGANIZE", "CCBORGANIZE", "CEBORGANIZE",
        # "CGBORGANIZE", "CIBORGANIZE", "CMBCORGANIZE", "CMBORGANIZE", "CZBORGANIZE", "EBCLORGANIZE",
        self.code_list = [
            "ECITICORGANIZE", "HXBORGANIZE", "ICBCORGANIZE", "PABORGANIZE",
            "PSBCORGANIZE", "SPDBORGANIZE"
        ]
        self.logger = Logger().logger
        # Run-wide counters reported by run(). remove_count and old_count
        # are never incremented by this class; they are only reported.
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        # Per-entity scratch lists; run() resets them for every entity code.
        self.copy_mongo_data_list = list()
        self.remove_id_list = list()
        self.branch_code_list = list()
        # Field list handed to the Phoenix client as its ``verify_list``.
        self.verify_list = [
            "ID_", "BANK_CODE_", "BANK_NAME_", "CREATE_TIME_", "AREA_CODE_",
            "UNIT_CODE_", "ADDR_", "PROVINCE_NAME_", "PROVINCE_CODE_",
            "CITY_", "CITY_CODE_", "DISTRICT_NAME_", "DISTRICT_CODE_", "LAT_",
            "LNG_", "NAME_", "ENTITY_CODE_", "DEALTIME_", "URL_", "TEL_",
            "CODE_", "BUSINESS_HOURS_", "STATUS_1"
        ]

    def get_data_from_mongo(self, m_client, collection, entity_code, data_id):
        """
        Query the ``spider_data`` database for the documents of one entity.

        :param m_client: Mongo wrapper; its ``mongo_db`` and
            ``mongo_entity_code`` attributes are overwritten here
        :param collection: collection passed to ``search_from_mongodb``
        :param entity_code: entity code to search for
        :param data_id: optional document id (string) forwarded as an
            ``ObjectId``; falsy means no id is forwarded
        :return: whatever ``search_from_mongodb`` returns, or ``None`` when
            the query fails with anything but a server-selection timeout
        :raises: the error of the single retry made after a
            server-selection timeout, if that retry fails too
        """
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code
        data_id_obj = ObjectId(data_id) if data_id else None
        try:
            return m_client.search_from_mongodb(collection,
                                                data_id=data_id_obj)
        except pymongo.errors.ServerSelectionTimeoutError:
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            return m_client.search_from_mongodb(collection,
                                                data_id=data_id_obj)
        except Exception as e:
            # Covers KeyError as well: it is a subclass of Exception.
            self.logger.exception(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        """
        Remove the given documents of one entity from MongoDB.

        :param m_client: Mongo wrapper; its ``mongo_entity_code`` attribute
            is overwritten here
        :param collection: collection passed to ``remove_from_mongo``
        :param entity_code: entity code the ids belong to
        :param remove_id_list: ids of the documents to remove
        :return: whatever ``remove_from_mongo`` returns, or ``None`` when the
            call fails with anything but a server-selection timeout
        :raises: the error of the single retry made after a
            server-selection timeout, if that retry fails too
        """
        m_client.mongo_entity_code = entity_code
        try:
            return m_client.remove_from_mongo(collection=collection,
                                              remove_id_list=remove_id_list)
        except pymongo.errors.ServerSelectionTimeoutError:
            return m_client.remove_from_mongo(collection=collection,
                                              remove_id_list=remove_id_list)
        except Exception as e:
            # Covers KeyError as well: it is a subclass of Exception.
            self.logger.exception(e)
            return None

    def shuffle_for_area(self, re_data):
        """
        Add coordinates, administrative-area fields and the branch ``CODE_``
        to one cleaned record. The record is modified in place.

        :param re_data: cleaned record; must contain "ADDR_" and
            "BANK_CODE_", and "AREA_CODE_" whenever geocoding does not
            supply it
        :return: the same ``re_data`` object
        :raises KeyError: if a required key is missing from ``re_data`` or
            from the geocoding responses
        """
        # A "STATUS_1" marker key is replaced by STATUS_ = "1".
        if "STATUS_1" in re_data:
            del re_data["STATUS_1"]
            re_data["STATUS_"] = "1"

        location_result = get_lat_lng(re_data["ADDR_"])
        if location_result["status"] == 0:
            location = location_result["result"]["location"]
            re_data["LNG_"] = str(location["lng"])
            re_data["LAT_"] = str(location["lat"])

            address_result = get_area(lat_lng=re_data["LAT_"] + "," +
                                      re_data["LNG_"])
            component = address_result["result"]["addressComponent"]
            adcode = component["adcode"]
            # TODO: decide whether formatted_address should be used instead.
            re_data["DISTRICT_NAME_"] = component["district"]
            re_data["DISTRICT_CODE_"] = adcode
            re_data["AREA_CODE_"] = adcode
            re_data["CITY_"] = component["city"]
            re_data["CITY_CODE_"] = adcode[:4] + "00"
            re_data["PROVINCE_NAME_"] = component["province"]
            re_data["PROVINCE_CODE_"] = adcode[:2] + "00"
            # TODO: decide whether the formatted location should replace
            # LNG_/LAT_.
        else:
            re_data["LNG_"] = ""
            re_data["LAT_"] = ""

        # Branch code: <bank code>_<area code>_<md5 of the address>.
        addr_digest = hashlib.md5(re_data["ADDR_"].encode("utf-8")).hexdigest()
        re_data["CODE_"] = (re_data["BANK_CODE_"] + "_" +
                            re_data["AREA_CODE_"] + "_" + addr_digest)
        return re_data

    def _geocode_and_upsert(self, p_client, connection, data_id, record):
        """Geocode one cleaned record and upsert it; return the number of rows inserted."""
        try:
            area_data = self.shuffle_for_area(record)
        except Exception as e:
            self.logger.exception("_id: {}获取经纬度失败, {}".format(data_id, e))
            return 0
        if not area_data:
            return 0

        try:
            success_count = p_client.upsert_to_phoenix_by_one(
                connection=connection, data=area_data)
            self.success_count += success_count
        except Exception as e:
            self.logger.exception("HBase 插入 _id 为 {} 的数据失败, {}".format(
                data_id, e))
            return 0
        return success_count

    def _transfer_document(self, data, shuffle_module, p_client, connection,
                           province_list, city_list, area_list):
        """Clean one MongoDB document and store the resulting record(s); return rows inserted."""
        data_id = data["_id"]
        try:
            del data["_id"]
            re_data = shuffle_module.data_shuffle(data, province_list,
                                                  city_list, area_list)
        except Exception as e:
            self.logger.exception("清洗错误,错误 _id 为{}, {}".format(data_id, e))
            return 0
        if not re_data:
            self.bad_count += 1
            return 0
        print(data_id)

        # data_shuffle may return a single record or a list of records.
        if isinstance(re_data, dict):
            records = [re_data]
        elif isinstance(re_data, list):
            records = re_data
        else:
            return 0

        inserted = 0
        for record in records:
            inserted += self._geocode_and_upsert(p_client, connection,
                                                 data_id, record)
        return inserted

    def _transfer_entity(self, entity_code, m_client, collection, p_client,
                         connection, province_list, city_list, area_list):
        """Transfer every MongoDB document of one entity code into HBase."""
        # The cleaning module carries the same name as the entity code.
        shuffle_module = __import__(entity_code)
        self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))
        self.remove_id_list = []
        self.copy_mongo_data_list = []
        self.branch_code_list = []

        # This id is forwarded to the Mongo search for ECITICORGANIZE only.
        if entity_code == "ECITICORGANIZE":
            find_id = "5c3f48479bb3df1d97d762e1"
        else:
            find_id = None

        try:
            mongo_data_list = self.get_data_from_mongo(
                m_client=m_client, collection=collection,
                entity_code=entity_code, data_id=find_id)
        except pymongo.errors.ServerSelectionTimeoutError:
            sleep(1)
            mongo_data_list = self.get_data_from_mongo(
                m_client=m_client, collection=collection,
                entity_code=entity_code, data_id=find_id)
        if not mongo_data_list:
            return

        once_count = 0
        # Accumulate, so the summary reports the total over all entity codes.
        self.find_count += mongo_data_list.count()
        try:
            for data in mongo_data_list:
                before = self.success_count
                once_count += self._transfer_document(
                    data, shuffle_module, p_client, connection,
                    province_list, city_list, area_list)
                # Report progress each time another multiple of 100 is passed.
                if self.success_count // 100 > before // 100:
                    self.logger.info("HBase 插入成功, 成功条数 {} 条".format(
                        self.success_count))
        finally:
            mongo_data_list.close()

        if once_count > 0:
            self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

    def run(self):
        """
        Transfer the documents of every entity code in ``code_list`` and log
        a summary of the counters.

        Both connections are closed even when the transfer aborts with an
        exception; the summary is only logged after a complete run.

        :return: None
        """
        p_client = PhoenixHbase(table_name="ORGANIZE_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        try:
            m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
            db, collection_list = m_client.client_to_mongodb()
            try:
                collection = m_client.get_check_collection(
                    db=db, collection_list=collection_list)
                # Area code tables; the fourth one is not used here.
                province_list, city_list, area_list, _ = (GenericScript(
                    entity_code=None,
                    entity_type="ORGANIZE_FINASSIST").area_from_mysql())

                for entity_code in self.code_list:
                    self._transfer_entity(entity_code, m_client, collection,
                                          p_client, connection,
                                          province_list, city_list, area_list)
            finally:
                m_client.client_close()
        finally:
            p_client.close_client_phoenix(connection=connection)

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class MapBarTransfer(object):
    """
    Clean "mapbar" documents from MongoDB, geocode them and upsert them into
    a Phoenix/HBase table (``CHA_BRANCH_MAPBAR`` by default).
    """

    # Bounding box (degrees) a geocoded point must fall into: Beijing.
    # Shanghai alternative: latitude 30.7083860773 - 31.8739003864,
    # longitude 120.8778122800 - 122.1248433443.
    _LAT_BOUNDS = (39.4498800000, 41.1684980000)
    _LNG_BOUNDS = (115.4534230000, 117.5461160000)

    def __init__(self,
                 table_name="CHA_BRANCH_MAPBAR",
                 collection_name="mapbar"):
        """
        Open the Phoenix and MongoDB connections.

        :param table_name: Phoenix table the records are upserted into
        :param collection_name: MongoDB collection the documents come from
        """
        # Phoenix connection
        self.p_client = PhoenixHbase(table_name=table_name)
        self.connection = self.p_client.connect_to_phoenix()
        # MongoDB connection
        # NOTE(review): pymongo timeouts are in milliseconds, so 60 looks
        # very short -- confirm it is intended.
        self.m_client = MongoClient(mongo_collection=collection_name,
                                    entity_code="MAPBAR_DEATAIL_BJ")
        self.m_client.mongo_host = "172.22.69.35"
        self.m_client.mongo_port = 20000
        self.m_client.client = pymongo.MongoClient(
            host="172.22.69.35", port=20000, serverSelectionTimeoutMS=60,
            connectTimeoutMS=60, connect=False)
        self.db, self.collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=self.db, collection_list=self.collection_list)
        # Log
        self.logger = Logger().logger
        # Number of rows re-written by check_lat()
        self.count = 0

    @staticmethod
    def _normalize_update_time(text):
        """Return the first "YYYY年M月D日" date in ``text`` as "YYYY-MM-DD", or ``text`` unchanged if there is none."""
        found = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日", text)
        if not found:
            return text
        year, month, day = found.groups()
        return "-".join((year, month.zfill(2), day.zfill(2)))

    def main(self):
        """
        Read the selected documents from MongoDB, clean and geocode each one
        and upsert it into HBase.

        A document that fails in any step is logged and skipped, and
        documents with an empty ``ADDRESS_`` are skipped silently. The
        MongoDB cursor is closed even if the loop aborts.

        :return: None
        """
        mongo_data_list = self.m_client.search_from_mongodb(
            collection=self.collection, field_name="DEALTIME_",
            field_value={"$gt": "1555136656.0579224"},
            data_id="5cb65fac9bb3df61a09c6625")
        count = 0
        try:
            while True:
                # Fetch one document; retry once after a connection timeout.
                try:
                    data = next(mongo_data_list)
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(3)
                    try:
                        data = next(mongo_data_list)
                    except StopIteration:
                        # The retry may find the cursor exhausted.
                        break

                # Clean the phone number and normalise the update date.
                try:
                    data["PHONE_"] = data["PHONE_"].replace("无,", "")
                    data["UPDATETIME_"] = self._normalize_update_time(
                        data["UPDATETIME_"])
                except Exception as e:
                    self.logger.exception(
                        f"数据清洗出错, _id: {data['_id']}, error {e}")
                    continue

                # Geocode the address.
                try:
                    if not data["ADDRESS_"]:
                        continue
                    # Keep only what follows the first "|" of the address.
                    data["ADDRESS_"] = "".join(data["ADDRESS_"].split("|")[1:])
                    location_result = get_lat_lng(address=data["ADDRESS_"])
                    if location_result["status"] == 0:
                        location = location_result["result"]["location"]
                        data["LNG_"] = str(location["lng"])
                        data["LAT_"] = str(location["lat"])
                    else:
                        self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
                except Exception as e:
                    self.logger.exception(
                        f"_id: {data['_id']} 获取经纬度失败, error: {e}")
                    continue

                # Validate the coordinates and upsert one row into HBase.
                try:
                    re_data = self.__check_lat(data=data)
                    self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                    count += 1
                    if count % 100 == 0:
                        self.logger.info(
                            f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}")
                except Exception as e:
                    self.logger.exception(
                        f"HBase 插入失败, _id: {data['_id']}, error: {e}")
                    continue
        finally:
            # Close the MongoDB cursor.
            mongo_data_list.close()
        self.logger.info(
            f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条"
        )

    def check_lat(self):
        """
        Copy every row of the Phoenix table ``FANSILE`` into
        ``CHA_BRANCH_MAPBAR`` unchanged.

        ``self.count`` is incremented once per copied row.

        :return: None
        """
        # Read from FANSILE, then switch the client back so that the upserts
        # go to CHA_BRANCH_MAPBAR even when the search fails.
        self.p_client.table_name = "FANSILE"
        try:
            data_cursor = self.p_client.search_all_from_phoenix(
                connection=self.connection, dict_status=True)
        finally:
            self.p_client.table_name = "CHA_BRANCH_MAPBAR"

        while True:
            try:
                data = next(data_cursor)
                self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=data)
                self.count += 1
                if self.count % 100 == 0:
                    # Prefer "ID_" and fall back to "_id" for the log line.
                    # NOTE(review): Phoenix rows are assumed to be keyed by
                    # "ID_" -- confirm.
                    row_id = data["ID_"] if "ID_" in data else data["_id"]
                    self.logger.info(
                        f"HBase 插入成功, _id: {row_id}, 成功条数 {self.count} 条")
            except StopIteration:
                break

    def __check_lat(self, data):
        """
        Move coordinates that fall outside the bounding box into
        ``CHECK_LAT_`` / ``CHECK_LNG_`` and blank ``LAT_`` / ``LNG_``.

        :param data: record; returned untouched when it has no "LAT_" key
        :return: the same ``data`` object
        :raises ValueError: if a coordinate cannot be parsed as a float
        """
        if "LAT_" not in data:
            return data

        lat_low, lat_high = self._LAT_BOUNDS
        lng_low, lng_high = self._LNG_BOUNDS
        # The longitude is only parsed once the latitude is within range.
        if (lat_low < float(data["LAT_"]) < lat_high
                and lng_low < float(data["LNG_"]) < lng_high):
            return data

        self.logger.warning(
            f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}")
        data["CHECK_LAT_"] = data["LAT_"]
        data["CHECK_LNG_"] = data["LNG_"]
        data["LAT_"] = ""
        data["LNG_"] = ""
        return data