def run(self):
    """Clean FINPRODUCT_FINASSIST source records and upsert them into HBase.

    For each entity code the module of the same name is imported, the
    entity's documents are read from MongoDB, passed through that module's
    ``data_shuffle`` and written to Phoenix one row at a time.

    Side effects on ``self``:
        * ``find_count``, ``success_count`` and ``bad_count`` are incremented.
        * ``remove_id_list`` / ``copy_mongo_data_list`` are reset per entity
          and hold the ids and pristine copies of the records that were not
          rejected by a cleaning or upsert error. The steps that consume them
          (MongoDB delete, archive into ``spider_data_old``) are currently
          disabled and kept below as commented-out code.
        * A summary is logged and the logger's handlers are cleared.

    Raises:
        pymongo.errors.ServerSelectionTimeoutError: when reading an entity
            from MongoDB times out on the first attempt and again on the
            single retry.
    """

    def forget_record(record_id, snapshot):
        """Take a failed record back out of the delete/archive bookkeeping.

        The record being processed is always the newest entry of both lists,
        so only the tail is inspected. Calling this more than once for the
        same record is harmless, which matters when several rows produced
        from one record fail to upsert.
        """
        ids = self.remove_id_list
        if ids and ids[-1] is record_id:
            ids.pop()
        copies = self.copy_mongo_data_list
        if snapshot is not None and copies and copies[-1] is snapshot:
            copies.pop()

    # Create the Phoenix client.
    p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
    p_client.verify_list = self.verify_list
    # Connect to Phoenix.
    connection = p_client.connect_to_phoenix()
    # Create the MongoDB client used for reading the source data.
    m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
    db, collection_list = m_client.client_to_mongodb()

    try:
        collection = m_client.get_check_collection(db=db, collection_list=collection_list)

        # Disabled: MongoDB client for the spider_data_old archive database.
        # old_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        # Disabled: local testing.
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017,
        #                                         serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["FINPRODUCT_FINASSIST"]

        # Disabled: drop the table.
        # p_client.drop_table_phoenix(connection=connection)

        # Disabled: table creation statement.
        # sql = ('create table "FINPRODUCT_FINASSIST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, '
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "C"."STATUS_" varchar,'
        #        '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."TIME_LIMIT_" varchar,'
        #        '"C"."YIELD_RATE_" varchar, "C"."BREAKEVEN_" varchar, "C"."START_FUNDS_" varchar,'
        #        '"C"."INVEST_PERIOD_" varchar, "C"."SALE_DISTRICT_" varchar, "C"."SALE_START_" varchar,'
        #        '"C"."SALE_END_" varchar, "C"."RISK_LEVEL_" varchar, "C"."REDEMING_MODE_" varchar,'
        #        '"C"."PRIVATE_BANK_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #        '"C"."ENTITY_NAME_" varchar, "C"."CURRENCY_TYPE_" varchar, "C"."INCREASE_UNIT_" varchar,'
        #        '"C"."YIELD_START_DATE_" varchar, "C"."YIELD_END_DATE_" varchar, "C"."YIELD_TYPE_" varchar,'
        #        '"C"."TARGET_" varchar, "C"."PRODUCT_TYPE_" varchar, "C"."YIELD_STATMENT_" varchar,'
        #        '"C"."INVEST_RANGE_" varchar, "C"."PRE_STOP_" varchar, "C"."RASE_PLAN_" varchar,'
        #        '"C"."PURCHASE_" varchar, "T"."CONTENT_" varchar, "C"."IMAGE_" varchar) IMMUTABLE_ROWS = true')
        #
        # Disabled: create the table.
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # Disabled: add a column.
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]:
            # for entity in self.entity_list:
            # `status` is only read by the disabled delete/archive steps below.
            status = False
            module_name = __import__(entity)
            self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            # find_id = "5c3f118f8d7fee068da6ef53"
            find_id = None

            # Read the entity's documents. A server-selection timeout is
            # retried once after a one-second pause; the collection is
            # re-resolved on every attempt so a retry never queries a stale
            # collection left over from a previous entity.
            for attempt in (1, 2):
                try:
                    if entity == "JSFIN_CCBDATA":
                        m_client.mongo_collection = "JSFIN_CCBDATA"
                        collection = m_client.get_check_collection(db=db, collection_list=collection_list)
                        mongo_data_list = module_name.ScriptCCB.get_data_from_mongo(
                            self=self, m_client=m_client, collection=collection, data_id=None)
                    else:
                        m_client.mongo_collection = "FINPRODUCT_FINASSIST"
                        collection = m_client.get_check_collection(db=db, collection_list=collection_list)
                        mongo_data_list = self.get_data_from_mongo(
                            m_client=m_client, collection=collection, entity_code=entity, data_id=find_id)
                except pymongo.errors.ServerSelectionTimeoutError:
                    if attempt == 2:
                        raise
                    sleep(1)
                else:
                    break

            # Nothing to process for this entity: move on to the next one.
            if not mongo_data_list:
                continue

            # Clean the data and insert it into HBase.
            once_count = 0
            # NOTE(review): presumably a PyMongo cursor; Cursor.count() was
            # removed in PyMongo 4 - confirm the pinned driver version.
            self.find_count += mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                copy_data = None
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    # Snapshot taken before cleaning, because data_shuffle
                    # receives (and may modify) `data` itself.
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    if entity == "CHINANETFINANCIAL":
                        re_data = module_name.data_shuffle(data=data,
                                                           sales_status=self.sales_status,
                                                           produc_category=self.produc_category,
                                                           revenue_type=self.revenue_type,
                                                           operaton_pattern=self.operaton_pattern,
                                                           purchase_amount=self.purchase_amount,
                                                           duration_type=self.duration_type)
                    elif entity == "JSFIN_CCBDATA":
                        re_data = module_name.ScriptCCB.data_shuffle(self=self, data=data)
                    else:
                        re_data = module_name.data_shuffle(data)

                    if not re_data:
                        # Cleaning produced nothing; the record stays in the
                        # bookkeeping lists but is counted as bad.
                        self.bad_count += 1
                        continue
                except Exception as e:
                    forget_record(data_id, copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(data_id, e))
                    continue
                print(data_id)

                # Insert into HBase through Phoenix. A cleaned record is either
                # a single row (dict) or several rows (list); any other type
                # is skipped.
                if isinstance(re_data, dict):
                    rows = [re_data]
                elif isinstance(re_data, list):
                    rows = re_data
                else:
                    rows = []
                for row in rows:
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(connection=connection, data=row)
                        once_count += success_count
                        self.success_count += success_count
                        # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                    except Exception as e:
                        # Keep going with the remaining rows, but make sure the
                        # source record is no longer marked as safely handled.
                        forget_record(data_id, copy_data)
                        self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))

            if once_count > 0:
                status = True
                self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

            # NOTE(review): the loop stops after the first entity that
            # returned data, so later entities are only reached when earlier
            # ones were empty - confirm this is intended and not debugging
            # residue.
            break

            # Disabled: delete the processed data from MongoDB.
            # if status:
            #     delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection,
            #                                                entity_code=entity,
            #                                                remove_id_list=self.remove_id_list)
            #     self.remove_count += delete_count
            #     # self.logger.info("MongoDB 删除成功")
            # else:
            #     self.logger.info("HBase 插入成功条数0条, 不执行删除")

            # Disabled: archive the processed data into spider_data_old.
            # if status:
            #     try:
            #         old_client.mongo_db = "spider_data_old"
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except pymongo.errors.ServerSelectionTimeoutError as e:
            #         sleep(1)
            #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except Exception as e:
            #         self.logger.info(e)
    finally:
        # Close the MongoDB connection even when processing aborts early.
        m_client.client_close()

    # p_client.close_client_phoenix(connection=connection)
    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.handlers.clear()