class WeiboBasicInfoUpdate(object):
    """Refresh the ``FANS_`` column of a Phoenix table from a MongoDB collection.

    ``main`` reads ``WEIBO_CODE_``/``FANS_`` pairs from MongoDB, walks every
    row returned by Phoenix, overwrites the row's ``FANS_`` when a document
    with the same ``WEIBO_CODE_`` exists, and upserts the row back.
    """

    def __init__(self, table_name="CHA_BRANCH_WEIBO_BASIC",
                 collection_name="WEIBOBASICINFO"):
        """Open the Phoenix and MongoDB connections and create the logger.

        Args:
            table_name: Phoenix table handed to ``PhoenixHbase``.
            collection_name: MongoDB collection handed to the Mongo wrapper.
        """
        # Phoenix connection
        self.p_client = PhoenixHbase(table_name=table_name)
        self.connection = self.p_client.connect_to_phoenix()

        # Mongo connection
        self.m_client = MongoClient(entity_code="CMBCMICROBLOG",
                                    mongo_collection=collection_name)
        # NOTE(review): host/port are stored on this object, not on
        # ``self.m_client`` -- confirm the wrapper does not need them too.
        self.mongo_host = "172.22.69.35"
        self.mongo_port = 20000
        # NOTE(review): both timeouts are given in milliseconds; 60 ms looks
        # very short -- confirm it is intended.
        self.m_client.client = pymongo.MongoClient(
            host=self.mongo_host,
            port=self.mongo_port,
            serverSelectionTimeoutMS=60,
            connectTimeoutMS=60,
            connect=False)
        self.db, self.collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=self.db, collection_list=self.collection_list)

        # Log
        self.logger = Logger().logger

    def get_mongo_column_dict(self, collection, column1, column2):
        """Fetch two columns of every document in ``collection``.

        The documents are materialized into a list *before* the Mongo client
        is closed; a lazy cursor would be unusable once the ``finally`` clause
        has closed the client it belongs to.

        Args:
            collection: Object exposing ``aggregate(pipeline)``.
            column1: First field to project.
            column2: Second field to project.

        Returns:
            A list of the projected documents (``_id`` excluded).

        Raises:
            Exception: When the query raises ``TypeError``.
        """
        try:
            self.logger.info("开始查取数据")
            cursor = collection.aggregate([{
                "$project": {
                    "_id": 0,
                    column1: 1,
                    column2: 1
                }
            }])
            return list(cursor)
        except TypeError as e:
            self.logger.error(
                "WEIBO_CODE_ 数据查取失败,错误信息为{}, 请检查匹配规则是否正确".format(e))
            raise Exception("WEIBO_CODE_ 查取失败, 错误信息为{}".format(e)) from e
        finally:
            self.m_client.client.close()

    def main(self):
        """Copy ``FANS_`` from MongoDB onto every matching Phoenix row.

        Every Phoenix row is upserted back, whether or not a matching Mongo
        document was found. The Phoenix connection is closed on exit, also
        when an error propagates.
        """
        mongo_docs = self.get_mongo_column_dict(
            collection=self.collection,
            column1="WEIBO_CODE_",
            column2="FANS_")

        # Index the Mongo documents once so each Phoenix row is matched in
        # O(1). ``setdefault`` keeps the first document per code, matching the
        # "first hit wins" rule of a linear scan. Documents lacking either
        # projected field cannot contribute a value and are ignored.
        fans_by_code = {}
        for doc in mongo_docs:
            if "WEIBO_CODE_" in doc and "FANS_" in doc:
                fans_by_code.setdefault(doc["WEIBO_CODE_"], doc["FANS_"])

        try:
            # update to hbase
            rows = self.p_client.search_all_from_phoenix(
                connection=self.connection, dict_status=True)
            for row in rows:
                code = row.get("WEIBO_CODE_")
                if code in fans_by_code:
                    row["FANS_"] = fans_by_code[code]
                self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=row)
        finally:
            self.connection.close()
insert_data["TITLE_"] = title insert_data["BANK_CODE_"] = bank_code insert_data["TYPE_"] = type_ insert_data["COUNT_"] = count insert_list.append(insert_data) if insert_list: # mysql_client.mysql_table = "cha_network_volume" mysql_client.insert_to_mysql(connection=mysql_connection, data=insert_list) mysql_client, mysql_connection = mysql_connect() p_client = PhoenixHbase(table_name="CHA_BRANCH_NEWS") connection = p_client.connect_to_phoenix() # 返回生成器对象 result_generator = p_client.search_all_from_phoenix(connection=connection, dict_status=True) while True: try: result = result_generator.__next__() count_network_volume(data=result) # p_client.upsert_to_phoenix_by_one(connection=connection, data=result) except StopIteration: break except Exception as e: print(e) continue connection.close()
class MapBarTransfer(object):
    """Clean MapBar documents from MongoDB, geocode them and upsert to Phoenix.

    The target Phoenix table is not created here. The DDL that used to be
    kept (commented out) in this class declared varchar columns ``ID_``
    (primary key) and, in family ``C``: ``BTYPE_``, ``TYPE_``, ``NAME_``,
    ``UPDATETIME_``, ``ADDRESS_``, ``POINAME_``, ``PHONE_``, ``BUSSTOP_``,
    ``BUS_``, ``URL_``, ``DEALTIME_``, ``DATETIME_``, ``ENTITY_NAME_``,
    ``ENTITY_CODE_``, ``LAT_``, ``LNG_``, ``CHECK_LAT_``, ``CHECK_LNG_``.
    """

    # Accepted coordinate window (both bounds exclusive); the original code
    # labels these values "北京" (Beijing). The window used before, labelled
    # "上海" (Shanghai), was lat 30.7083860773..31.8739003864 and
    # lng 120.8778122800..122.1248433443.
    LAT_RANGE = (39.4498800000, 41.1684980000)
    LNG_RANGE = (115.4534230000, 117.5461160000)

    # "YYYY年M月D日" date embedded in the UPDATETIME_ field.
    _UPDATETIME_RE = re.compile(r"(\d{4})年(\d{1,2})月(\d{1,2})日")

    def __init__(self, table_name="CHA_BRANCH_MAPBAR", collection_name="mapbar"):
        """Open the Phoenix and MongoDB connections and create the logger.

        Args:
            table_name: Phoenix table handed to ``PhoenixHbase``.
            collection_name: MongoDB collection handed to the Mongo wrapper.
        """
        # Phoenix connection
        self.p_client = PhoenixHbase(table_name=table_name)
        self.connection = self.p_client.connect_to_phoenix()

        # MongoDB connection
        mongo_host = "172.22.69.35"
        mongo_port = 20000
        self.m_client = MongoClient(mongo_collection=collection_name,
                                    entity_code="MAPBAR_DEATAIL_BJ")
        self.m_client.mongo_host = mongo_host
        self.m_client.mongo_port = mongo_port
        # NOTE(review): both timeouts are given in milliseconds; 60 ms looks
        # very short -- confirm it is intended.
        self.m_client.client = pymongo.MongoClient(
            host=mongo_host,
            port=mongo_port,
            serverSelectionTimeoutMS=60,
            connectTimeoutMS=60,
            connect=False)
        self.db, self.collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=self.db, collection_list=self.collection_list)

        # Log
        self.logger = Logger().logger
        # Number of rows upserted by ``check_lat``
        self.count = 0

    def main(self, dealtime_after="1555136656.0579224",
             data_id="5cb65fac9bb3df61a09c6625"):
        """Clean, geocode and upsert the MongoDB documents one at a time.

        A document that fails any stage is logged and skipped; documents with
        an empty ``ADDRESS_`` are skipped silently. The Mongo cursor is closed
        on exit, also when an error propagates.

        Args:
            dealtime_after: Lower bound used in the ``DEALTIME_`` ``$gt``
                filter. Defaults to the previously hard-coded value.
            data_id: Passed through to ``search_from_mongodb``. Defaults to
                the previously hard-coded value.
        """
        # Fetch data
        cursor = self.m_client.search_from_mongodb(
            collection=self.collection,
            field_name="DEALTIME_",
            field_value={"$gt": dealtime_after},
            data_id=data_id)
        count = 0
        try:
            while True:
                # Take one document
                try:
                    data = next(cursor)
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError:
                    # Retry once after a short pause. The retry may hit the
                    # end of the cursor, which must end the loop rather than
                    # leak StopIteration to the caller.
                    time.sleep(3)
                    try:
                        data = next(cursor)
                    except StopIteration:
                        break

                # Clean
                try:
                    self._clean_record(data)
                except Exception as e:
                    self.logger.exception(
                        f"数据清洗出错, _id: {data['_id']}, error {e}")
                    continue

                # Look up latitude / longitude
                try:
                    if not self._attach_location(data):
                        continue
                except Exception as e:
                    self.logger.exception(
                        f"_id: {data['_id']} 获取经纬度失败, error: {e}")
                    continue

                # Upsert one row into HBase
                try:
                    re_data = self.__check_lat(data=data)
                    self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                    count += 1
                    if count % 100 == 0:
                        self.logger.info(
                            f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}")
                except Exception as e:
                    self.logger.exception(
                        f"HBase 插入失败, _id: {data['_id']}, error: {e}")
                    continue
        finally:
            # Close the MongoDB cursor
            cursor.close()

        self.logger.info(
            f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条"
        )

    def _clean_record(self, data):
        """Normalize ``PHONE_`` and ``UPDATETIME_`` of ``data`` in place.

        ``UPDATETIME_`` is rewritten to zero-padded ``YYYY-MM-DD`` when it
        contains a ``YYYY年M月D日`` date, and left untouched otherwise.
        """
        data["PHONE_"] = data["PHONE_"].replace("无,", "")
        found = self._UPDATETIME_RE.search(data["UPDATETIME_"])
        if found:
            year, month, day = found.groups()
            data["UPDATETIME_"] = "-".join(
                (year, month.zfill(2), day.zfill(2)))

    def _attach_location(self, data):
        """Geocode ``data["ADDRESS_"]`` and store ``LNG_``/``LAT_`` in place.

        Returns:
            ``False`` when the address is empty (the caller skips the
            document), ``True`` otherwise -- including when the geocoder
            reports a non-zero status, which is only logged as a warning.
        """
        if not data["ADDRESS_"]:
            return False
        # Keep only what follows the first "|" separator.
        data["ADDRESS_"] = "".join(data["ADDRESS_"].split("|")[1:])
        location_result = get_lat_lng(address=data["ADDRESS_"])
        if location_result["status"] == 0:
            location = location_result["result"]["location"]
            data["LNG_"] = str(location["lng"])
            data["LAT_"] = str(location["lat"])
        else:
            self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
        return True

    def check_lat(self):
        """Copy every row found under table name ``FANSILE`` into ``CHA_BRANCH_MAPBAR``.

        ``self.count`` is incremented per upserted row and progress is logged
        every 100 rows.
        """
        # NOTE(review): the table name is switched only while the search
        # object is created. If ``search_all_from_phoenix`` reads
        # ``table_name`` lazily (e.g. it is a generator function), the rows
        # would come from "CHA_BRANCH_MAPBAR" instead -- confirm.
        self.p_client.table_name = "FANSILE"
        data_cursor = self.p_client.search_all_from_phoenix(
            connection=self.connection, dict_status=True)
        self.p_client.table_name = "CHA_BRANCH_MAPBAR"

        for data in data_cursor:
            self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=data)
            self.count += 1
            if self.count % 100 == 0:
                # Phoenix rows are keyed by "ID_"; "_id" is the Mongo key and
                # is only used as a fallback, so a missing key cannot abort
                # the copy on the 100th row.
                row_id = data.get("ID_", data.get("_id"))
                self.logger.info(
                    f"HBase 插入成功, _id: {row_id}, 成功条数 {self.count} 条")

    def __check_lat(self, data):
        """Blank out coordinates that fall outside the accepted window.

        When ``LAT_`` or ``LNG_`` lies outside ``LAT_RANGE`` / ``LNG_RANGE``
        the original values are moved to ``CHECK_LAT_`` / ``CHECK_LNG_`` and
        ``LAT_`` / ``LNG_`` are set to ``""``. ``data`` is modified in place.

        Args:
            data: Record dict; returned unchanged when it has no ``LAT_`` key.

        Returns:
            The same ``data`` object.

        Raises:
            KeyError: ``LAT_`` is present but ``LNG_`` (or ``_id``, needed
                for the warning) is missing.
            ValueError: A coordinate cannot be parsed as a float.
        """
        if "LAT_" not in data:
            return data

        lat_min, lat_max = self.LAT_RANGE
        lng_min, lng_max = self.LNG_RANGE
        # ``and`` short-circuits: longitude is only parsed once latitude
        # has been accepted.
        inside = (lat_min < float(data["LAT_"]) < lat_max
                  and lng_min < float(data["LNG_"]) < lng_max)
        if inside:
            return data

        self.logger.warning(
            f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}")
        data["CHECK_LAT_"] = data["LAT_"]
        data["CHECK_LNG_"] = data["LNG_"]
        data["LAT_"] = ""
        data["LNG_"] = ""
        return data