def client_to_mongodb(self):
    """List the collection names of ``self.db``, retrying on failure.

    The first attempt only tolerates
    ``pymongo.errors.ServerSelectionTimeoutError``; after it, attempts
    2 through 5 tolerate any exception.

    :return: the collection name list on success. When every attempt
        fails the client is closed via ``self.client_close()`` and the
        method returns ``None`` implicitly.
    """
    log = Logger().logger
    log.info("开始连接MongoDB({}:{}),database={}".format(
        self.mongo_host, self.mongo_port, self.mongo_database))

    try:
        names = self.db.collection_names()
        log.info("MongoDB({}:{})连接成功".format(
            self.mongo_host, self.mongo_port))
        return names
    except pymongo.errors.ServerSelectionTimeoutError as e:
        # Keep the first error: it is the one reported once all retries fail.
        first_error = e

    log.warning("MongoDB({}:{})连接失败".format(
        self.mongo_host, self.mongo_port))

    final_attempt = 5
    for attempt in range(2, final_attempt + 1):
        try:
            names = self.db.collection_names()
            log.info("MongoDB({}:{})连接成功".format(
                self.mongo_host, self.mongo_port))
            return names
        except Exception:
            log.warning("MongoDB({}:{})第{}次连接失败".format(
                self.mongo_host, self.mongo_port, attempt))
            if attempt == final_attempt:
                log.error(
                    "MongoDB连接失败,错误信息为: {}, 请检查各项参数是否正确host={}, port={},database={}"
                    .format(first_error, self.mongo_host, self.mongo_port,
                            self.mongo_database))
                self.client_close()
def client_to_mysql(self):
    """Open a MySQL connection from ``self.mysql_config``, retrying on failure.

    The first attempt only tolerates ``pymysql.err.OperationalError``;
    retries numbered 2 through 6 tolerate any exception.

    :return: the pymysql connection, or ``None`` when every attempt failed
    """
    log = Logger().logger
    try:
        log.info("正在连接MySQL({}@{}:{})".format(
            self.mysql_user, self.mysql_host, self.mysql_port))
        conn = pymysql.connect(**self.mysql_config)
        log.info("Mysql连接成功({}@{}:{})".format(
            self.mysql_user, self.mysql_host, self.mysql_port))
        return conn
    except pymysql.err.OperationalError:
        # Not swallowed: the retry loop below takes over from here.
        pass

    final_retry = 6
    for retry_count in range(2, final_retry + 1):
        try:
            log.warning("MySQL连接失败,正在重试第{}次连接".format(retry_count))
            conn = pymysql.connect(**self.mysql_config)
            log.info("Mysql连接成功")
            return conn
        except Exception as err:
            log.warning("第{}次连接MySQL失败".format(retry_count))
            if retry_count == final_retry:
                log.error("MySQL连接失败,错误信息为{}".format(err))
    return None
class FundScript(object):
    """Clean fund records read from MongoDB and upsert them into Phoenix.

    Every name in ``code_list`` is imported as a module that must expose
    ``data_shuffle(data, list_SUBS_STATUS, list_TYPE)``; the cleaned rows it
    returns are written to the Phoenix table ``FUND``.
    """

    def __init__(self):
        # Module names (one per fund source) imported dynamically in run().
        self.code_list = [
            "STCNFUND", "ABCFUND", "CCBFUND", "CITICFUND", "ICBCFUND"
        ]
        self.logger = Logger().logger
        # Run-wide counters reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        # Per-entity bookkeeping, reset for every entity in run().
        self.copy_mongo_data_list = list()
        self.remove_id_list = list()
        self.branch_code_list = list()
        # Fund columns handed to the Phoenix client as its verify_list.
        self.verify_list = [
            "ENTITY_CODE_", "ENTITY_NAME_", "URL_", "PERIOD_CODE_", "STATUS_",
            "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_",
            "FUND_NEW_VALUE_", "TOTAL_NEW_VALUE_", "FUND_OLD_VALUE_",
            "TOTAL_OLD_VALUE_", "DAILY_RATE_", "YEAR_REWARD_", "SUBS_STATUS_",
            "ATONEM_STATUS_", "TYPE_", "ID_", "NEWEST_VALUE_", "TOTAL_VALUE_",
            "POPULARITY_", "RATING_", "OLD_VALUE_", "UNIT_VALUE_", "SCALE_",
            "ESTABLISH_DATE_", "RISK_LEVEL_", "BASE_INFO_", "YIELD_",
            "INVEST_", "MONTH_RATE_", "QUARTER_RATE_", "HALF_YEAR_RATE_",
            "HISTORY_RATE_", "FUND_STATUS_", "COMPANY_", "SUBS_STATUS_CODE_",
            "TYPE_CODE_"
        ]

    def get_data_from_mongo(self, m_client, collection, entity_code):
        """Fetch the records of one entity from MongoDB.

        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection to query
        :param entity_code: stored on the client as ``mongo_entity_code``
        :return: whatever ``m_client.search_from_mongodb`` returns, or
            ``None`` when the query failed with a non-timeout error
        """
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code
        try:
            return m_client.search_from_mongodb(collection)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One retry after a short pause; a second timeout propagates.
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            return m_client.search_from_mongodb(collection)
        except Exception as e:
            # KeyError is an Exception subclass, so it is handled here too.
            self.logger.info(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        """Delete the given ids from MongoDB.

        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection to delete from
        :param entity_code: stored on the client as ``mongo_entity_code``
        :param remove_id_list: ids to delete
        :return: the result of ``m_client.remove_from_mongo``, or ``None``
            when the call failed with a non-timeout error
        """
        m_client.mongo_entity_code = entity_code
        try:
            return m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
        except pymongo.errors.ServerSelectionTimeoutError:
            # Single immediate retry; a second timeout propagates.
            return m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
        except Exception as e:
            self.logger.info(e)
            return None

    @staticmethod
    def _branch_code_from_address(re_data):
        """Set ``re_data["CODE_"]`` from bank code, area code and md5(ADDR_).

        NOTE(review): in the original these statements sat, unreachable,
        after ``return None`` in ``delete_data_from_mongo`` and referenced
        an undefined ``re_data``. They look like the tail of a branch-office
        cleaning routine; kept here so the logic is not lost. Nothing in
        this class calls it.

        :param re_data: dict with ``ADDR_``, ``BANK_CODE_`` and ``AREA_CODE_``
        :return: the same dict, with ``CODE_`` filled in
        """
        hash_m = hashlib.md5()
        hash_m.update(re_data["ADDR_"].encode("utf-8"))
        hash_addr_ = hash_m.hexdigest()
        re_data["CODE_"] = (re_data["BANK_CODE_"] + "_" +
                            re_data["AREA_CODE_"] + "_" + hash_addr_)
        return re_data

    def gaode_get_lat_lng(self, address):
        """Geocode ``address`` through the endpoint ``URL_FOR_LAT_LNG``.

        :param address: free-text address
        :return: ``geocodes[0]["location"]`` of the JSON response
        :raises KeyError, IndexError: when the response has no geocode entry
        """
        # Passing params lets requests URL-encode the address; gluing it into
        # the URL by hand broke on characters such as '&', '#' or spaces.
        response = requests.get(
            URL_FOR_LAT_LNG, params={"key": AK, "address": address})
        try:
            temp = json.loads(response.content)
        finally:
            # Release the connection even when the body is not valid JSON.
            response.close()
        return temp['geocodes'][0]['location']

    def dict_from_mysql(self, dict_code):
        """Load the ``sys_dict_item`` rows of one dictionary code.

        :param dict_code: value matched against ``DICT_CODE_``. It is
            interpolated into the SQL text because ``search_area_code``
            takes a finished statement, so pass trusted constants only.
        :return: the result of ``MysqlClient.search_area_code``
        """
        mysql_config = {
            "host": "172.22.67.25",
            "port": 3306,
            "database": "chabei",
            "user": "******",
            "password": "******",
            "table": "sys_dict_item"
        }
        mysql_client = MysqlClient(**mysql_config)
        mysql_connection = mysql_client.client_to_mysql()
        try:
            return mysql_client.search_area_code(
                sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'{}\'"
                .format(dict_code),
                connection=mysql_connection)
        finally:
            # Close the connection even when the query raises.
            mysql_client.close_client(connection=mysql_connection)

    def _discard_pending(self, data_id, copy_data):
        """Forget a failed record; safe to call repeatedly for one record.

        A plain ``list.remove`` raised ValueError on the second failing row
        of the same document, which aborted the whole run.
        """
        if data_id in self.remove_id_list:
            self.remove_id_list.remove(data_id)
        # Match the snapshot by identity so an equal-looking snapshot of a
        # different record is never dropped by mistake.
        for index, snapshot in enumerate(self.copy_mongo_data_list):
            if snapshot is copy_data:
                del self.copy_mongo_data_list[index]
                break

    def _flag_processed(self, m_client, collection):
        """Mark every id in ``remove_id_list`` with ``{"d": 1}`` in MongoDB."""
        update_count = m_client.update_to_mongodb(
            collection=collection,
            data_id=self.remove_id_list,
            data_dict={"d": 1})
        self.remove_count += update_count
        self.logger.info("MongoDB 更新成功")

    def _process_records(self, module_name, mongo_data_list, p_client,
                         connection, m_client, collection, list_SUBS_STATUS,
                         list_TYPE, count):
        """Clean and upsert every record of one entity.

        :return: ``(once_count, count)`` - rows written for this entity and
            the updated running number printed for list-type rows
        """
        once_count = 0
        for data in mongo_data_list:
            data_id = data["_id"]
            copy_data = {}
            self.remove_id_list.append(data_id)
            try:
                del data["_id"]
                copy_data = deepcopy(data)
                self.copy_mongo_data_list.append(copy_data)
                re_data = module_name.data_shuffle(
                    data, list_SUBS_STATUS, list_TYPE)
                if not re_data:
                    self.bad_count += 1
                    continue
            except Exception as e:
                self._discard_pending(data_id, copy_data)
                self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                    data_id, e))
                continue

            # A cleaner may return one row (dict) or several (list).
            if isinstance(re_data, list):
                rows, from_list = re_data, True
            elif isinstance(re_data, dict):
                rows, from_list = [re_data], False
            else:
                rows, from_list = [], False

            for row in rows:
                if not row:
                    continue
                try:
                    if from_list:
                        count += 1
                        print(count)
                    success_count = p_client.upsert_to_phoenix_by_one(
                        connection=connection, data=row)
                    once_count += success_count
                    self.success_count += success_count
                    if from_list:
                        self.logger.info(
                            "HBase 插入成功, 成功条数 {} 条".format(
                                success_count))
                    elif self.success_count % 100 == 0:
                        self.logger.info(
                            "HBase 插入成功, 成功条数 {} 条".format(
                                self.success_count))
                    # Flag the source documents every 50 successful rows.
                    # NOTE(review): ids collected after the last multiple of
                    # 50 are never flagged - confirm whether a final flush
                    # is wanted.
                    if self.success_count % 50 == 0:
                        self._flag_processed(m_client, collection)
                except Exception as e:
                    self._discard_pending(data_id, copy_data)
                    self.logger.warning(
                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                            data_id, e))
                    continue
        return once_count, count

    def run(self):
        """Entry point: move every entity in ``code_list`` into Phoenix.

        The MongoDB client and the Phoenix connection are closed even when
        processing raises. The drop/create-table bootstrap that was
        commented out in the original is not reproduced here.
        """
        count = 0
        # Phoenix client; the table name must match the target table.
        p_client = PhoenixHbase(table_name="FUND")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client used to read the raw records.
        m_client = MongoClient(mongo_collection="JSFUND_CCBDATA")
        try:
            db, collection_list = m_client.client_to_mongodb()
            collection = m_client.get_check_collection(
                db=db, collection_list=collection_list)
            # Dictionary tables the cleaners use to translate labels.
            list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS")
            list_TYPE = self.dict_from_mysql("FUND_TYPE")

            for entity_code in self.code_list:
                module_name = __import__(entity_code)
                self.logger.info(
                    "开始进行 ENTITY_CODE_ {}".format(entity_code))
                self.remove_id_list = []
                self.copy_mongo_data_list = []
                self.branch_code_list = []
                try:
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code)
                except pymongo.errors.ServerSelectionTimeoutError:
                    sleep(1)
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code)

                if not mongo_data_list:
                    continue

                # Accumulate: find_count is reported as a run-wide total, so
                # assigning here would keep only the last entity's count.
                self.find_count += mongo_data_list.count()
                once_count, count = self._process_records(
                    module_name, mongo_data_list, p_client, connection,
                    m_client, collection, list_SUBS_STATUS, list_TYPE, count)
                if once_count > 0:
                    self.logger.info(
                        "HBase 插入成功, 成功条数 {}".format(once_count))
        finally:
            # Always release both connections.
            try:
                m_client.client_close()
            finally:
                p_client.close_client_phoenix(connection=connection)

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class FinProductScript(object):
    """Clean wealth-management product records and upsert them into Phoenix.

    Dictionary tables are loaded from MySQL once at construction time and
    passed to the per-entity cleaning modules imported by name in ``run``.
    """

    # (attribute name, DICT_CODE_ value) pairs loaded from sys_dict_item.
    # The attribute spellings are part of the interface used by the cleaning
    # modules, so they are kept as-is.
    _DICT_ATTRIBUTES = (
        ("sales_status", "SALES_STATUS"),
        ("produc_category", "PRODUC_CATEGORY"),
        ("revenue_type", "REVENUE_TYPE"),
        ("operaton_pattern", "OPERATION_PATTERN"),
        ("purchase_amount", "PURCHASE_AMOUNT"),
        ("duration_type", "DURATION_TYPE"),
    )

    def __init__(self):
        # MySQL client used only to load the dictionary tables.
        mysql_config = {
            "host": MYSQL_HOST_25,
            "port": MYSQL_PORT_25,
            "database": MYSQL_DATABASE_25,
            "user": MYSQL_USER_25,
            "password": MYSQL_PASSWORD_25,
            "table": MYSQL_TABLE_25
        }
        mysql_client = MysqlClient(**mysql_config)
        mysql_connection = mysql_client.client_to_mysql()
        try:
            for attribute, dict_code in self._DICT_ATTRIBUTES:
                rows = mysql_client.search_area_code(
                    sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_='{}'"
                    .format(dict_code),
                    connection=mysql_connection)
                setattr(self, attribute, rows)
        finally:
            # Close the connection even when a query raises.
            mysql_client.close_client(connection=mysql_connection)

        self.logger = Logger().logger
        # Per-entity bookkeeping, reset for every entity in run().
        self.remove_id_list = list()
        self.copy_mongo_data_list = list()
        # Entity modules listed (commented out) in the original, for
        # reference: CZBFinancial, PABFinancial, PSBCFinancial, ABCFinancial,
        # BOCFinancial, BOCOMFinancial, CBHBFinancial, CCBFinancial,
        # CEBFinancial, CGBFinancial, CIBFinancial, CMBCFinancial,
        # CMBFinancial, EBCLFinancial, ECITICFinancial, HXBFinancial,
        # ICBCFinancial, SPDBFinancial, CHINANETFINANCIAL, JSFIN_CCBDATA.
        # Run-wide counters reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        # Columns handed to the Phoenix client as its verify_list.
        self.verify_list = [
            "ID_", "ENTITY_CODE_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_",
            "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_", "REMARK_",
            "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_", "TIME_LIMIT_",
            "YIELD_RATE_", "BREAKEVEN_", "START_FUNDS_", "INVEST_PERIOD_",
            "SALE_START_", "SALE_END_", "RISK_LEVEL_", "REDEMING_MODE_",
            "PRIVATE_BANK_", "URL_", "DEALTIME_", "DATETIME_",
            "ENTITY_NAME_", "STATUS_", "SALE_DISTRICT_"
        ]

    def get_data_from_mongo(self, m_client, collection, entity_code, data_id):
        """Fetch the records of one entity from MongoDB.

        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection to query
        :param entity_code: stored on the client as ``mongo_entity_code``
        :param data_id: forwarded to ``search_from_mongodb`` as ``data_id``
        :return: whatever ``m_client.search_from_mongodb`` returns, or
            ``None`` when the query failed with a non-timeout error
        """
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code
        try:
            return m_client.search_from_mongodb(collection, data_id=data_id)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One retry after a short pause; a second timeout propagates.
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            return m_client.search_from_mongodb(collection, data_id=data_id)
        except Exception as e:
            # KeyError is an Exception subclass, so it is handled here too.
            self.logger.info(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        """Delete the given ids from MongoDB.

        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection to delete from
        :param entity_code: stored on the client as ``mongo_entity_code``
        :param remove_id_list: ids to delete
        :return: the result of ``m_client.remove_from_mongo``, or ``None``
            when the call failed with a non-timeout error
        """
        m_client.mongo_entity_code = entity_code
        try:
            return m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
        except pymongo.errors.ServerSelectionTimeoutError:
            # Single immediate retry; a second timeout propagates.
            return m_client.remove_from_mongo(
                collection=collection, remove_id_list=remove_id_list)
        except Exception as e:
            self.logger.info(e)
            return None

    def _discard_pending(self, data_id, copy_data):
        """Forget a failed record; safe to call repeatedly for one record.

        The original removed the (possibly mutated) source dict by equality,
        which raised ValueError inside the error handler whenever the
        cleaner had modified the record or a second row of it failed.
        """
        if data_id in self.remove_id_list:
            self.remove_id_list.remove(data_id)
        # Match the stored snapshot by identity.
        for index, snapshot in enumerate(self.copy_mongo_data_list):
            if snapshot is copy_data:
                del self.copy_mongo_data_list[index]
                break

    def _fetch_records(self, module_name, m_client, db, collection_list,
                       entity, find_id):
        """Select the collection for ``entity`` and query its records.

        :return: ``(collection, mongo_data_list)``
        """
        if entity == "JSFIN_CCBDATA":
            m_client.mongo_collection = "JSFIN_CCBDATA"
            collection = m_client.get_check_collection(
                db=db, collection_list=collection_list)
            # ScriptCCB.get_data_from_mongo is a staticmethod that takes
            # ``self`` explicitly; this instance supplies the logger.
            records = module_name.ScriptCCB.get_data_from_mongo(
                self=self, m_client=m_client, collection=collection,
                data_id=None)
        else:
            m_client.mongo_collection = "FINPRODUCT_FINASSIST"
            collection = m_client.get_check_collection(
                db=db, collection_list=collection_list)
            records = self.get_data_from_mongo(
                m_client=m_client, collection=collection,
                entity_code=entity, data_id=find_id)
        return collection, records

    def _clean(self, module_name, entity, data):
        """Run the entity-specific ``data_shuffle`` on one record."""
        if entity == "CHINANETFINANCIAL":
            return module_name.data_shuffle(
                data=data,
                sales_status=self.sales_status,
                produc_category=self.produc_category,
                revenue_type=self.revenue_type,
                operaton_pattern=self.operaton_pattern,
                purchase_amount=self.purchase_amount,
                duration_type=self.duration_type)
        if entity == "JSFIN_CCBDATA":
            return module_name.ScriptCCB.data_shuffle(self=self, data=data)
        return module_name.data_shuffle(data)

    def _process_records(self, module_name, entity, mongo_data_list,
                         p_client, connection):
        """Clean and upsert every record; return the rows written."""
        once_count = 0
        for data in mongo_data_list:
            data_id = data["_id"]
            copy_data = {}
            self.remove_id_list.append(data_id)
            try:
                del data["_id"]
                copy_data = deepcopy(data)
                self.copy_mongo_data_list.append(copy_data)
                re_data = self._clean(module_name, entity, data)
                if not re_data:
                    self.bad_count += 1
                    continue
            except Exception as e:
                self._discard_pending(data_id, copy_data)
                self.logger.warning(
                    "清洗错误,错误 _id 为{}, {}".format(data_id, e))
                continue
            print(data_id)

            # A cleaner may return one row (dict) or several (list).
            if isinstance(re_data, dict):
                rows = [re_data]
            elif isinstance(re_data, list):
                rows = re_data
            else:
                rows = []
            for row in rows:
                try:
                    success_count = p_client.upsert_to_phoenix_by_one(
                        connection=connection, data=row)
                    once_count += success_count
                    self.success_count += success_count
                except Exception as e:
                    self._discard_pending(data_id, copy_data)
                    self.logger.warning(
                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                            data_id, e))
                    continue
        return once_count

    def run(self):
        """Entry point: clean the configured entities into Phoenix.

        The MongoDB client is closed even when processing raises. The
        Phoenix connection is left open, as in the original, where the
        close call was commented out. The disabled table bootstrap and
        delete/archive steps of the original are not reproduced.
        """
        p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client used to read the raw records.
        m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        try:
            db, collection_list = m_client.client_to_mongodb()
            collection = m_client.get_check_collection(
                db=db, collection_list=collection_list)

            for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]:
                module_name = __import__(entity)
                self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity))
                self.remove_id_list = []
                self.copy_mongo_data_list = []
                find_id = None
                try:
                    collection, mongo_data_list = self._fetch_records(
                        module_name, m_client, db, collection_list, entity,
                        find_id)
                except pymongo.errors.ServerSelectionTimeoutError:
                    sleep(1)
                    # The retry goes through the same path, so the collection
                    # is re-resolved for JSFIN_CCBDATA as well; the original
                    # retry could reuse a stale collection.
                    collection, mongo_data_list = self._fetch_records(
                        module_name, m_client, db, collection_list, entity,
                        find_id)

                if mongo_data_list:
                    self.find_count += mongo_data_list.count()
                    once_count = self._process_records(
                        module_name, entity, mongo_data_list, p_client,
                        connection)
                    if once_count > 0:
                        self.logger.info(
                            "HBase 插入成功, 成功条数 {}".format(once_count))
                    else:
                        continue
                # NOTE(review): preserved from the original - the loop stops
                # after the first entity unless that entity wrote zero rows.
                # This looks like a debugging leftover; confirm before
                # relying on later entities being processed.
                break
        finally:
            m_client.client_close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
class ScriptCCB(object):
    """Clean ``JSFIN_CCBDATA`` records and upsert them into Phoenix.

    ``get_data_from_mongo`` and ``data_shuffle`` are static methods that
    take ``self`` as an explicit argument, so other scripts can call them
    with their own instance (anything exposing ``logger`` and the
    dictionary attributes).
    """

    # (attribute name, DICT_CODE_ value) pairs loaded from sys_dict_item.
    # The attribute spellings are part of the interface and kept as-is.
    _DICT_ATTRIBUTES = (
        ("sales_status", "SALES_STATUS"),
        ("produc_category", "PRODUC_CATEGORY"),
        ("revenue_type", "REVENUE_TYPE"),
        ("operaton_pattern", "OPERATION_PATTERN"),
        ("purchase_amount", "PURCHASE_AMOUNT"),
        ("duration_type", "DURATION_TYPE"),
    )

    # Substring of BANK_NAME -> bank code, checked in this order.
    _BANK_CODES = (
        ("上海银行", "BankOfShanghai"),
        ("天津银行", "TJBANK"),
        ("北京银行", "BOB"),
        ("中国光大银行股份有限公司", "CEB"),
    )

    def __init__(self):
        self.logger = Logger().logger
        # Per-run bookkeeping, reset at the start of run().
        self.remove_id_list = list()
        self.copy_mongo_data_list = list()
        # MySQL client used only to load the dictionary tables.
        mysql_config = {
            "host": MYSQL_HOST_25,
            "port": MYSQL_PORT_25,
            "database": MYSQL_DATABASE_25,
            "user": MYSQL_USER_25,
            "password": MYSQL_PASSWORD_25,
            "table": MYSQL_TABLE_25
        }
        mysql_client = MysqlClient(**mysql_config)
        mysql_connection = mysql_client.client_to_mysql()
        try:
            for attribute, dict_code in self._DICT_ATTRIBUTES:
                rows = mysql_client.search_area_code(
                    sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_='{}'"
                    .format(dict_code),
                    connection=mysql_connection)
                setattr(self, attribute, rows)
        finally:
            # Close the connection even when a query raises.
            mysql_client.close_client(connection=mysql_connection)

        # Run-wide counters reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        # Columns handed to the Phoenix client as its verify_list.
        self.verify_list = [
            "ID_", "ENTITY_CODE_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_",
            "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_", "REMARK_",
            "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_", "TIME_LIMIT_",
            "YIELD_RATE_", "BREAKEVEN_", "START_FUNDS_", "INVEST_PERIOD_",
            "SALE_START_", "SALE_END_", "RISK_LEVEL_", "REDEMING_MODE_",
            "PRIVATE_BANK_", "URL_", "DEALTIME_", "DATETIME_",
            "ENTITY_NAME_", "STATUS_", "SALE_DISTRICT_", "CURRENCY_TYPE_",
            "INCREASE_UNIT_", "YIELD_START_DATE_", "YIELD_END_DATE_",
            "YIELD_TYPE_", "TARGET_", "PRODUCT_TYPE_", "YIELD_STATMENT_",
            "INVEST_RANGE_", "PRE_STOP_", "RASE_PLAN_", "PURCHASE_"
        ]

    @staticmethod
    def get_data_from_mongo(self, m_client, collection, data_id):
        """Fetch records from MongoDB via ``m_client.all_from_mongodb``.

        :param self: object exposing ``logger`` (passed explicitly)
        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection to query
        :param data_id: forwarded to ``all_from_mongodb`` as ``data_id``
        :return: the query result, or ``None`` when the query failed with a
            non-timeout error
        """
        m_client.mongo_db = "spider_data"
        try:
            return m_client.all_from_mongodb(collection, data_id=data_id)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One retry after a short pause; a second timeout propagates.
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            return m_client.all_from_mongodb(collection, data_id=data_id)
        except Exception as e:
            # KeyError is an Exception subclass, so it is handled here too.
            self.logger.info(e)
            return None

    @staticmethod
    def data_shuffle(self, data):
        """Map one raw record onto the Phoenix column layout.

        :param self: object exposing ``purchase_amount``, ``duration_type``
            and ``revenue_type`` (lists of dicts with ``ITEM_LABEL_`` and
            ``ITEM_VALUE_``); passed explicitly
        :param data: raw record; it is read but never modified
        :return: dict of cleaned columns
        :raises ValueError: when ``BANK_NAME`` matches no known bank (the
            original failed here with an accidental UnboundLocalError)
        :raises KeyError: when a required source field is missing
        """
        bank_name = data["BANK_NAME"]
        for marker, code in ScriptCCB._BANK_CODES:
            if marker in bank_name:
                bank_code = code
                break
        else:
            raise ValueError(
                "unsupported BANK_NAME: {!r}".format(bank_name))

        re_data = dict()
        hash_m = hashlib.md5()
        hash_m.update(data["NAME_"].encode("utf-8"))
        hash_id = hash_m.hexdigest()
        re_data["ID_"] = bank_code + "_" + hash_id + "_" + data["SALE_START_"]
        re_data["ENTITY_CODE_"] = "RONG360FINANCIAL"
        re_data["BANK_CODE_"] = bank_code
        re_data["BANK_NAME_"] = bank_name.replace("股份有限公司", "")
        re_data["PERIOD_CODE_"] = data["SALE_START_"].replace("-", "")
        re_data["STATUS_"] = ""
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        re_data["NAME_"] = data["NAME_"]
        # Sale time range.
        re_data["TIME_LIMIT_"] = ""
        # Yield rate: low and high bounds both come from the same field.
        re_data["LOW_YIELD_RATE_"] = data["YIELD_RATE_"].replace("%", "")
        re_data["HIGH_YIELD_RATE_"] = data["YIELD_RATE_"].replace("%", "")
        # Sale district.
        re_data["SALE_DISTRICT_"] = data["SALE_AREA_"]

        # Minimum purchase amount: expand the Chinese unit suffixes into
        # zeros. Work on a local copy so the caller's record is untouched.
        start_funds = (data["START_FUNDS_"]
                       .replace("亿", "00000000")
                       .replace("万", "0000")
                       .replace("千", "000"))
        if start_funds:
            amount = int(start_funds)
            if amount < 50000:
                match_funds = "5万以下"
            elif amount < 100000:
                match_funds = "5万-10万"
            elif amount < 200000:
                match_funds = "10万-20万"
            elif amount < 500000:
                match_funds = "20万-50万"
            else:
                match_funds = "50万以上"
        else:
            match_funds = "不限"
        # No break: when several labels contain the text, the last one wins.
        for item in self.purchase_amount:
            if match_funds in item["ITEM_LABEL_"]:
                re_data["START_FUNDS_"] = start_funds
                re_data["START_FUNDS_CODE_"] = item["ITEM_VALUE_"]

        # Investment period in days, bucketed into the dictionary labels.
        invest_period = data["INVEST_PERIOD_"].replace("天", "")
        if invest_period:
            days = int(invest_period)
            if days <= 30:
                match_str = "1个月内"
            elif days <= 90:
                match_str = "1-3个月(含)"
            elif days <= 180:
                match_str = "3-6个月(含)"
            elif days <= 365:
                match_str = "6-12个月(含)"
            else:
                match_str = "12个月以上"
        else:
            match_str = "不限"
        for item in self.duration_type:
            if match_str in item["ITEM_LABEL_"]:
                re_data["INVEST_PERIOD_"] = invest_period
                re_data["INVEST_PERIOD_CODE_"] = item["ITEM_VALUE_"]

        # Sale start / end.
        re_data["SALE_START_"] = data["SALE_START_"]
        re_data["SALE_END_"] = data["SALE_END_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["DATETIME_"] = data["DATETIME_"]
        re_data["ENTITY_NAME_"] = "融360理财产品"
        # Subscription currency.
        re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"]
        # Increment unit: drop everything from the currency word onward.
        re_data["INCREASE_UNIT_"] = re.sub(r"元.*", "", data["INCREASE_UNIT_"])
        # Yield start / end dates: the first and last 10 characters of the
        # same source field.
        # NOTE(review): presumably that field holds a "start...end" range -
        # confirm against the crawler output.
        re_data["YIELD_START_DATE_"] = data["YIELD_START_DATE_"][:10]
        re_data["YIELD_END_DATE_"] = data["YIELD_START_DATE_"][-10:]
        # Yield type, translated through the dictionary table.
        for item in self.revenue_type:
            if item["ITEM_LABEL_"] == data["YIELD_TYPE_"]:
                re_data["YIELD_TYPE_"] = data["YIELD_TYPE_"]
                re_data["YIELD_TYPE_CODE_"] = item["ITEM_VALUE_"]
                break
        # Target audience.
        re_data["TARGET_"] = data["TARGET_"]
        # Product type.
        re_data["PRODUCT_TYPE_"] = data["PRODUCT_TYPE_"]
        # Yield statement.
        re_data["YIELD_STATMENT_"] = data["YIELD_STATMENT_"]
        # Investment range.
        re_data["INVEST_RANGE_"] = data["INVEST_RANGE_"]
        # Early-termination conditions.
        re_data["PRE_STOP_"] = data["PRE_STOP_"]
        # Fund-raising plan.
        re_data["RASE_PLAN_"] = data["RASE_PLAN_"]
        # Purchase conditions.
        re_data["PURCHASE_"] = data["PURCHASE_"]
        return re_data

    def _discard_pending(self, data_id, copy_data):
        """Forget a failed record; safe to call repeatedly for one record.

        The original removed the source dict by equality, which raised
        ValueError inside the error handler when a second row of the same
        record failed.
        """
        if data_id in self.remove_id_list:
            self.remove_id_list.remove(data_id)
        # Match the stored snapshot by identity.
        for index, snapshot in enumerate(self.copy_mongo_data_list):
            if snapshot is copy_data:
                del self.copy_mongo_data_list[index]
                break

    def _process_records(self, mongo_data_list, p_client, connection):
        """Clean and upsert every record; return the rows written."""
        once_count = 0
        for data in mongo_data_list:
            data_id = data["_id"]
            copy_data = {}
            self.remove_id_list.append(data_id)
            try:
                del data["_id"]
                copy_data = deepcopy(data)
                self.copy_mongo_data_list.append(copy_data)
                re_data = self.data_shuffle(self=self, data=data)
                if not re_data:
                    self.bad_count += 1
                    continue
            except Exception as e:
                self._discard_pending(data_id, copy_data)
                self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                    data_id, e))
                continue
            print(data_id)

            # Accept one row (dict) or several (list).
            if isinstance(re_data, dict):
                rows = [re_data]
            elif isinstance(re_data, list):
                rows = re_data
            else:
                rows = []
            for row in rows:
                try:
                    success_count = p_client.upsert_to_phoenix_by_one(
                        connection=connection, data=row)
                    once_count += success_count
                    self.success_count += success_count
                except Exception as e:
                    self._discard_pending(data_id, copy_data)
                    self.logger.warning(
                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                            data_id, e))
                    continue
        return once_count

    def run(self):
        """Entry point: clean ``JSFIN_CCBDATA`` into Phoenix.

        The MongoDB client is closed even when processing raises or the
        process exits early. The Phoenix connection is left open, as in the
        original, where the close call was commented out. The disabled table
        bootstrap and delete/archive steps are not reproduced.
        """
        p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client used to read the raw records.
        m_client = MongoClient(mongo_collection="JSFIN_CCBDATA")
        try:
            db, collection_list = m_client.client_to_mongodb()
            collection = m_client.get_check_collection(
                db=db, collection_list=collection_list)

            self.logger.info("开始进行 ENTITY_CODE_: RONG360FINANCIAL")
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            find_id = None
            try:
                mongo_data_list = self.get_data_from_mongo(
                    self=self, m_client=m_client, collection=collection,
                    data_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    self=self, m_client=m_client, collection=collection,
                    data_id=find_id)

            if mongo_data_list:
                self.find_count += mongo_data_list.count()
                once_count = self._process_records(
                    mongo_data_list, p_client, connection)
                if once_count > 0:
                    self.logger.info(
                        "HBase 插入成功, 成功条数 {}".format(once_count))
                else:
                    # Same effect as the original quit(), without relying on
                    # the helper that the site module injects.
                    raise SystemExit
        finally:
            m_client.client_close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
class AllToPhoenix(object):
    """Transfer "CommonBidding" documents from MongoDB into Phoenix/HBase.

    For every module file found in ``curPath`` the matching documents are read
    from the ``spider_data`` database, cleaned by that module's
    ``data_shuffle`` plus :meth:`shuffle_data`, upserted into the Phoenix
    table, deleted from the source collection and archived into
    ``spider_data_old``.
    """

    def __init__(self):
        self.file_list = list()
        self.get_code_list()
        self.logger = Logger().logger
        # Run-wide counters, reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        # Per-entity bookkeeping: raw copies to archive and the _id values to
        # delete from the source collection.
        self.copy_mongo_data_list = list()
        self.remove_id_list = list()
        # Whitelist of fields that shuffle_data() lets through.
        self.verify_list = [
            "ID_", "CONTENT_", "NOTICE_TIME_", "TITLE_", "PROJECT_NAME_",
            "BID_CONTENT_", "SIGN_START_TIME_", "SIGN_END_TIME_",
            "OPEN_BID_TIME_", "OPEN_BID_PLACE_", "BID_AGENCY_",
            "APPLY_CONDITION_", "SIGN_QUALIFICATION_", "PROJECT_ID_",
            "WIN_CANDIDATE_", "CANDIDATE_RANK_", "BID_", "URL_", "DEALTIME_",
            "CREATE_TIME_", "ENTITY_NAME_", "ENTITY_CODE_", "ENTITY_STATUS_",
            "SIGN_MATERIAL_", "BID_TYPE_", "DATETIME_", "BUDGET_PRICE_",
            "PASS_REASON_", "PRESALE_CONTENT_", "PRESALE_WAY_",
            "PRESALE_START_TIME_", "PRESALE_END_TIME_", "PRESALE_ADDR_",
            "PRESALE_PREPARE_", "IMAGE_"
        ]

    def get_code_list(self):
        """Store the file names found directly inside ``curPath``.

        A file name without its "CommonBidding_" prefix (and ".py" suffix) is
        the ENTITY_CODE_ processed by run().

        :return: None; the result is stored in ``self.file_list``
        """
        for root, dirs, files in os.walk(curPath):
            # Only the top directory is wanted, hence the immediate break.
            self.file_list = files
            self.file_list.remove("__init_____.py")
            break

    def get_data_from_mongo(self, m_client, collection, entity_code):
        """Fetch the documents of one entity that carry a title field.

        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection
        :param entity_code: ENTITY_CODE_ to select
        :return: whatever ``m_client.get_data_from_mongodb`` returns, or None
            when the query fails with anything but a server-selection timeout
        """
        m_client.mongo_db = "spider_data"
        other_query = {
            "$or": [
                {"TITLE_": {"$exists": True}},
                {"title": {"$exists": True}},
            ]
        }
        query_kwargs = dict(collection=collection,
                            entity_code=entity_code,
                            exclude_code=None,
                            limit_number=None,
                            other_query=other_query)
        try:
            return m_client.get_data_from_mongodb(**query_kwargs)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One retry after a short pause; a second timeout propagates.
            self.logger.info("连接失败,正在重新连接")
            time.sleep(1)
            return m_client.get_data_from_mongodb(**query_kwargs)
        except Exception as e:
            self.logger.info(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        """Delete the given documents from the source collection.

        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection
        :param entity_code: ENTITY_CODE_ the ids belong to
        :param remove_id_list: list of ``_id`` values to delete
        :return: the value returned by ``m_client.remove_from_mongo`` or None
            when the deletion fails with anything but a timeout
        """
        m_client.mongo_entity_code = entity_code
        try:
            return m_client.remove_from_mongo(collection=collection,
                                              remove_id_list=remove_id_list)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One immediate retry; a second timeout propagates to the caller.
            return m_client.remove_from_mongo(collection=collection,
                                              remove_id_list=remove_id_list)
        except Exception as e:
            self.logger.info(e)
            return None

    def shuffle_data(self, data):
        """Apply the cleaning shared by all bidding sources.

        Builds the row key ``<entity code>_<md5 of title>``, keeps only the
        whitelisted fields, forces a default ENTITY_STATUS_ and strips the "|"
        character from every string field except CONTENT_ and URL_.

        :param data: dict produced by the source-specific ``data_shuffle``
        :return: the cleaned row as a new dict, or None when the record has no
            usable title
        """
        if "TITLE_" in data:
            if not data["TITLE_"]:
                return
            title_bytes = str(data["TITLE_"]).encode("utf-8")
            entity = data["ENTITY_CODE_"]
        elif "title" in data:
            if not data["title"]:
                return
            title_bytes = data["title"].encode("utf-8")
            entity = data["entity_code"]
        else:
            return
        row_key = str(entity) + "_" + hashlib.md5(title_bytes).hexdigest()

        re_data = {"ID_": row_key}
        for key, value in data.items():
            if key in self.verify_list:
                re_data[key] = value
            elif key == "entityStatus":
                # Legacy spelling of the status field; always reset to DRAFT.
                re_data["ENTITY_STATUS_"] = "DRAFT"
        if "ENTITY_STATUS_" not in re_data:
            re_data["ENTITY_STATUS_"] = "DRAFT"

        # The "|" stripping must be applied to the row that is returned; the
        # previous code cleaned the input dict after the values had already
        # been copied, so the result still contained "|". Non-string values
        # are left untouched.
        for key, value in re_data.items():
            if key == "CONTENT_" or key == "URL_":
                continue
            if isinstance(value, str):
                re_data[key] = value.replace("|", "")
        return re_data

    def run(self):
        """Run the transfer for every entity module found by get_code_list()."""
        # Phoenix side. The "CommonBidding" table is expected to exist
        # already; the one-off DDL used to live here as commented-out code.
        p_client = PhoenixHbase(table_name="CommonBidding")
        connection = p_client.connect_to_phoenix()

        # MongoDB source.
        m_client = MongoClient(mongo_collection="CommonBidding")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # MongoDB archive (spider_data_old).
        old_client = MongoClient(mongo_collection="CommonBidding")
        old_client.mongo_db = "spider_data_old"
        db_old, collection_list_old = old_client.client_to_mongodb()
        collection_old = db_old["CommonBidding"]

        for f in self.file_list:
            entity_code = f.replace(".py", "")
            module_name = __import__(entity_code)
            entity_code_mongo = entity_code.replace("CommonBidding_", "")
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code_mongo))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)

            # Clean every document and upsert it into HBase.
            once_count = 0
            if mongo_data_list:
                try:
                    self.find_count += mongo_data_list.count()
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(1)
                    self.find_count += mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    self.remove_id_list.append(data_id)
                    del data["_id"]
                    # Untouched copy of the source document for the archive.
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    try:
                        re_data = module_name.data_shuffle(data)
                        final_data = self.shuffle_data(re_data)
                    except Exception as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue
                    # NOTE(review): a record rejected by shuffle_data (None)
                    # stays queued for deletion/archiving - confirm intended.
                    if final_data:
                        try:
                            p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=final_data)
                            once_count += 1
                        except Exception as e:
                            self.remove_id_list.remove(data_id)
                            self.copy_mongo_data_list.remove(copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue
                if once_count > 0:
                    self.logger.info(
                        "HBase 插入成功, 成功条数 {}".format(once_count))

            if once_count > 0:
                # Remove the transferred documents from the source ...
                delete_count = self.delete_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo,
                    remove_id_list=self.remove_id_list)
                # delete_data_from_mongo returns None on failure; adding None
                # to the counter used to raise TypeError.
                self.remove_count += delete_count or 0
                # ... and archive the raw copies into spider_data_old.
                try:
                    old_client.mongo_db = "spider_data_old"
                    insert_count = old_client.all_to_mongodb(
                        collection=collection_old,
                        insert_list=self.copy_mongo_data_list)
                    self.old_count += insert_count
                except pymongo.errors.ServerSelectionTimeoutError as e:
                    time.sleep(1)
                    self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
                    insert_count = old_client.all_to_mongodb(
                        collection=collection_old,
                        insert_list=self.copy_mongo_data_list)
                    self.old_count += insert_count
                except Exception as e:
                    self.logger.info(e)
            else:
                self.logger.info("HBase 插入成功条数0条, 不执行删除")

        # Close the connections and report the totals.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(p_client.count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
class AllToPhoenix(object):
    """Transfer news documents from MongoDB into the NEWS_FINASSIST table.

    Each entity in ``code_list`` has a module of the same name whose
    ``data_shuffle`` cleans the raw documents. Cleaned rows get an
    AI-generated brief (except for CNINFONEWS), are upserted into
    Phoenix/HBase and the source document is flagged with ``{"d": 1}``.
    """

    def __init__(self):
        # "CNINFONEWS" content (pdf) is too long for the brief generator, so
        # run() skips the AI step for it.
        self.code_list = [
            "CAIJINGNEWS", "CNINFONEWS", "CSFINACIAL", "CSFINACIALNEWS",
            "CSNEWS", "CSNOTICE", "FINAQQNEWS", "XLCJYHMKNEWS", "XLCJNEWS",
            "XLCJGSNEWS", "WYCJNEWS", "WYCJGSNEWS", "NEWS163DOM",
            "NEWS10JQKA2", "NEWS10JQKA", "HOUSEQQNEWS"
        ]
        self.logger = Logger().logger
        # Run-wide counters, reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        # Handed to the Phoenix client in run().
        self.verify_list = [
            "ENTITY_CODE_", "ENTITY_NAME_", "URL_", "PERIOD_CODE_", "STATUS_",
            "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "BANK_NAME_",
            "BANK_CODE_", "CONTENT_", "DATA_SOURCE_", "KEYWORDS_",
            "ENTITY_NAME_", "ID_"
        ]

    def get_data_from_mongo(self, m_client, collection, entity_code, find_id):
        """Fetch the documents of one entity from the spider_data database.

        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection
        :param entity_code: ENTITY_CODE_ to select
        :param find_id: document id passed to ``search_from_mongodb`` as
            ``data_id`` ("" for none)
        :return: the result of ``m_client.search_from_mongodb`` or None when
            the query fails with anything but a server-selection timeout
        """
        m_client.mongo_db = "spider_data"
        m_client.mongo_entity_code = entity_code
        try:
            return m_client.search_from_mongodb(collection=collection,
                                                data_id=find_id)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One retry after a short pause; a second timeout propagates.
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            return m_client.search_from_mongodb(collection=collection,
                                                data_id=find_id)
        except Exception as e:
            self.logger.info(e)
            return None

    def delete_data_from_mongo(self, m_client, collection, entity_code,
                               remove_id_list):
        """Delete the given documents from the source collection.

        :return: the value returned by ``m_client.remove_from_mongo`` or None
            when the deletion fails with anything but a timeout
        """
        m_client.mongo_entity_code = entity_code
        try:
            return m_client.remove_from_mongo(collection=collection,
                                              remove_id_list=remove_id_list)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One immediate retry; a second timeout propagates.
            return m_client.remove_from_mongo(collection=collection,
                                              remove_id_list=remove_id_list)
        except Exception as e:
            self.logger.info(e)
            return None

    def get_brief_from_ai(self, data):
        """Normalise one row in place and attach the AI-generated brief.

        Strips "|" from CONTENT_, truncates PUBLISH_TIME_ to its first 10
        characters, sets ID_ to ``<ENTITY_CODE_>_<md5 of URL_>`` and stores
        the standard output of the script at ``AI_PATH`` in BRIEF_.

        :param data: cleaned row (dict); modified in place
        :return: the same dict
        :raises KeyError: when CONTENT_, PUBLISH_TIME_, URL_ or ENTITY_CODE_
            is missing
        """
        import subprocess

        data["CONTENT_"] = data["CONTENT_"].replace("|", "")
        if data["PUBLISH_TIME_"]:
            data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"][:10]
        url_hash = hashlib.md5(data["URL_"].encode("utf-8")).hexdigest()
        data["ID_"] = data["ENTITY_CODE_"] + "_" + str(url_hash)

        # The article text comes from scraped pages. It is passed as a single
        # argv entry without a shell, so quotes, "$" or backticks in the text
        # can neither break nor extend the command (the former os.popen call
        # built a shell string and never closed its pipe).
        completed = subprocess.run(
            ["python3", AI_PATH, data["CONTENT_"], "1"],
            stdout=subprocess.PIPE,
            universal_newlines=True)
        data["BRIEF_"] = completed.stdout
        return data

    def _with_brief(self, entity_code, list_data):
        """Return the row enriched with a brief, or the plain row on failure."""
        if entity_code == "CNINFONEWS":
            return list_data
        try:
            return self.get_brief_from_ai(data=list_data)
        except Exception as e:
            # The error used to be passed as a stray logging argument (and
            # was lost), and the fallback was the whole result list instead
            # of the current row.
            self.logger.info("AI 调取失败, 错误信息 {}".format(e))
            return list_data

    def run(self):
        """Process every entity in ``code_list`` and log the run totals."""
        # Phoenix side; the NEWS_FINASSIST table is expected to exist already.
        p_client = PhoenixHbase(table_name="NEWS_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()

        # MongoDB source.
        m_client = MongoClient(mongo_collection="NEWS_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        for entity_code in self.code_list:
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))
            # NOTE(review): presumably a resume point left over from an
            # earlier run - confirm it is still wanted.
            if entity_code == "CAIJINGNEWS":
                find_id = "5c6bfa508d7fee512a4ca68f"
            else:
                find_id = ""
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)

            if not mongo_data_list:
                continue

            once_count = 0
            # Accumulate: the total is reported once for the whole run.
            self.find_count += mongo_data_list.count()
            # Manual iteration so that a timeout while fetching the next
            # document can be retried; the range caps documents plus retries.
            for _ in range(1000000):
                try:
                    data = next(mongo_data_list)
                except pymongo.errors.ServerSelectionTimeoutError:
                    continue
                except StopIteration:
                    break
                data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info("running on data_id: {}".format(data_id))
                try:
                    del data["_id"]
                    re_data = module_name.data_shuffle([data])
                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue

                if isinstance(re_data, list):
                    for list_data in re_data:
                        if not list_data:
                            continue
                        ai_data = self._with_brief(entity_code, list_data)
                        # Upsert the row into HBase.
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=ai_data)
                            once_count += success_count
                            self.success_count += success_count
                            if self.success_count % 10 == 0:
                                self.logger.info(
                                    "HBase 插入成功, 成功条数{}条".format(
                                        once_count))
                        except Exception as e:
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue
                        # Flag the source document as transferred: {d: 1}.
                        try:
                            update_count = m_client.update_to_mongodb(
                                collection=collection,
                                data_id=data_id,
                                data_dict={"d": 1})
                            self.remove_count += update_count
                            if self.remove_count % 10 == 0:
                                self.logger.info(
                                    "MongoDB 更新成功, 成功条数 {} 条".format(
                                        "10"))
                        except Exception as e:
                            self.logger.warning(
                                "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue
                elif isinstance(re_data, dict):
                    # A single row is upserted without the AI brief.
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                        self.logger.info(
                            "HBase 插入成功, 成功条数 {} 条".format(
                                success_count))
                    except Exception as e:
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))
                        continue

            if once_count > 0:
                self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format(
                    entity_code, once_count))
            mongo_data_list.close()

        # Close the connections and report the totals.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class AllToPhoenix(object):
    """Count duplicated ORGANIZE_FINASSIST documents per entity.

    Two documents count as duplicates when the md5 digest of their ADDR_
    field (or, when ADDR_ is absent, their CONTENT_ field) is identical.
    """

    def __init__(self):
        self.code_list = [
            "ABCORGANIZE", "BOCOMORGANIZE", "BOCORGANIZE", "CBHBORGANIZE",
            "CCBORGANIZE", "CEBORGANIZE", "CGBORGANIZE", "CIBORGANIZE",
            "CMBCORGANIZE", "CMBORGANIZE", "CZBORGANIZE", "EBCLORGANIZE",
            "ECITICORGANIZE", "HXBORGANIZE", "ICBCORGANIZE", "PABORGANIZE",
            "PSBCORGANIZE", "SPDBORGANIZE"
        ]
        self.logger = Logger().logger
        # Number of duplicates found for the entity processed last.
        self.count = 0

    def get_data_from_mongo(self, m_client, collection, entity_code):
        """Fetch all documents of one entity.

        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection
        :param entity_code: ENTITY_CODE_ to select
        :return: the result of ``m_client.search_from_mongodb`` or None when
            the query fails with anything but a server-selection timeout
        """
        m_client.mongo_entity_code = entity_code
        try:
            return m_client.search_from_mongodb(collection)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One retry after a short pause; a second timeout propagates.
            self.logger.info("连接失败,正在重新连接")
            sleep(1)
            return m_client.search_from_mongodb(collection)
        except Exception as e:
            self.logger.info(e)
            return None

    @staticmethod
    def _fingerprint(data):
        """Return the md5 hex digest of ADDR_ (preferred) or CONTENT_, else None."""
        if "ADDR_" in data:
            text = data["ADDR_"]
        elif "CONTENT_" in data:
            text = data["CONTENT_"]
        else:
            return None
        return hashlib.md5(text.encode("utf-8")).hexdigest()

    @classmethod
    def _count_duplicates(cls, records):
        """Return how many records repeat the fingerprint of an earlier one."""
        # A set keeps the membership test O(1); the former list made the
        # scan quadratic in the number of documents.
        seen = set()
        duplicates = 0
        for record in records:
            digest = cls._fingerprint(record)
            if digest is None:
                continue
            if digest in seen:
                duplicates += 1
            else:
                seen.add(digest)
        return duplicates

    def run(self):
        """Log the number of duplicated documents for every entity."""
        m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        for entity_code in self.code_list:
            self.count = 0
            mongo_data_list = self.get_data_from_mongo(
                m_client=m_client,
                collection=collection,
                entity_code=entity_code)
            if not mongo_data_list:
                self.logger.warning("{} 无数据".format(entity_code))
                continue
            self.logger.warning("{} 查取成功".format(entity_code))
            self.logger.warning("当前共有{}条".format(mongo_data_list.count()))
            self.count = self._count_duplicates(mongo_data_list)
            self.logger.warning("重复数据{}条".format(self.count))

        # Close the connection.
        m_client.client_close()
        self.logger.handlers.clear()
def search_title_from_mongodb(self, collection, data_id=None):
    """Return a cursor over this entity's documents that carry a title.

    A document qualifies when its ENTITY_CODE_ equals
    ``self.mongo_entity_code`` and it has a ``TITLE_`` or ``title`` field.
    The first qualifying document (optionally restricted to ``_id`` values
    not below ``data_id``) is looked up first; the cursor then covers every
    qualifying document whose ``_id`` is not below that first one.

    :param collection: MongoDB collection to query
    :param data_id: optional lower bound for ``_id``, converted with ObjectId
    :return: a cursor opened with ``no_cursor_timeout=True``, or None when
        nothing matches or a TypeError is raised while querying
    """
    mon_logger = Logger().logger
    try:
        mon_logger.info("开始查取数据")
        lower_bound = ObjectId(data_id) if data_id else None

        # Conditions shared by the probe query and the cursor query.
        by_entity = {"ENTITY_CODE_": self.mongo_entity_code}
        has_title = {
            "$or": [
                {"TITLE_": {"$exists": True}},
                {"title": {"$exists": True}},
            ]
        }

        probe_conditions = [by_entity, has_title]
        if lower_bound is not None:
            probe_conditions.append({"_id": {"$gte": lower_bound}})
        first_doc = collection.find_one({"$and": probe_conditions})

        if first_doc is None:
            mon_logger.info(
                "MongoDB 查取数据为空,请检查 ENTITY_CODE_ 是否正确:{}".format(
                    self.mongo_entity_code))
            return None

        cursor = collection.find(
            {
                "$and": [
                    by_entity,
                    has_title,
                    {"_id": {"$gte": first_doc["_id"]}},
                ]
            },
            no_cursor_timeout=True)
        mon_logger.info("ENTITY: {} 数据查取成功共 {}条".format(
            self.mongo_entity_code, cursor.count()))
        return cursor
    except TypeError as e:
        mon_logger.warning(
            "MongoDB数据查取失败,错误信息为{}, 请检查 ENTITY_CODE_ 是否正确:{}".format(
                e, self.mongo_entity_code))
    finally:
        # Runs on every path, including the one that returns the cursor.
        self.client_close()
class WeiboBasicInfoScript(object):
    """Rebuild the WEIBOBASICINFO Phoenix table from the MongoDB collection.

    run() drops and recreates the table, then cleans every MongoDB document
    with :meth:`data_shuffle` and upserts the result.
    """

    def __init__(self, entity_type="WEIBOBASICINFO"):
        self.entity_type = entity_type
        self.logger = Logger().logger
        # Handed to the Phoenix client in run().
        # NOTE(review): "PERIOD_TIME_" and "STATUS_1" do not match the
        # "PERIOD_CODE_" / "STATUS_" keys produced by data_shuffle() and
        # declared in the DDL in run() - confirm which spelling is intended.
        self.verify_list = [
            "ID_", "BANK_CODE_", "BANK_NAME_", "PERIOD_TIME_", "AREA_CODE_",
            "CREATE_TIME_", "WEIBO_CODE_", "MAIN_URL_", "NAME_", "FOCUS_",
            "FANS_", "COMPANY_URL_", "COMPANY_", "DETAILED_URL_", "VIRIFIED_",
            "BIREF_", "ENTITY_NAME_", "ENTITY_CODE_", "DEALTIME_",
            "PROVINCE_NAME_", "PROVINCE_CODE_", "STATUS_1"
        ]
        # Bookkeeping of the documents transferred successfully: their _id
        # values and untouched copies of the source documents.
        self.remove_id_list = list()
        self.copy_mongo_data_list = list()
        self.branch_code_list = list()
        # Run-wide counters, reported at the end of run().
        self.find_count = 0
        self.bad_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0

    def match_weibo_code(self, match):
        """Look up WEIBO_CODE_ values in the WEIBOBASICINFO collection.

        :param match: match expression forwarded to ``match_from_mongo``
        :return: the result of ``match_from_mongo`` with output "WEIBO_CODE_"
        """
        mongo_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        db, collection_list = mongo_client.client_to_mongodb()
        collection = mongo_client.get_check_collection(db, collection_list)
        result = mongo_client.match_from_mongo(collection=collection,
                                               match=match,
                                               output="WEIBO_CODE_")
        return result

    def data_shuffle(self, data, province_list):
        """Build the Phoenix row for one raw Weibo account document.

        :param data: raw MongoDB document (without ``_id``)
        :param province_list: accepted for interface compatibility; it is
            replaced by the list returned from
            ``GenericScript.data_from_mysql()``
        :return: dict holding the row to upsert
        :raises KeyError: when a required field is missing from ``data``
        """
        re_data = dict()
        prov_n = ""
        prov_c = ""
        # The BANK_NAME_ dictionary below was disabled in the original code:
        # name_dict = {"ICBC": "中国工商银行", "ABC": "中国农业银行", "BOC": "中国银行", "CCB": "中国建设银行",
        #              "BOCOM": "交通银行", "PSBC": "中国邮政储蓄银行", "CZB": "浙商银行", "CBHB": "渤海银行",
        #              "ECITIC": "中信银行", "CEB": "中国光大银行", "HXB": "华夏银行", "CMBC": "中国民生银行",
        #              "CMB": "招商银行", "CIB": "兴业银行", "CGB": "广发银行", "PAB": "平安银行",
        #              "SPDB": "浦发银行", "EBCL": "恒丰银行"}
        # NOTE(review): this lookup runs once per document and overrides the
        # province_list argument - confirm that is intended.
        province_list, city_list, area_list, dir_area_list, bank_list = (
            GenericScript.data_from_mysql())
        # The bank code is BANK_CODE_ without its last nine characters.
        bank_code = data["BANK_CODE_"][:-9]
        # NOTE(review): name_dict is not defined in this method; it must be
        # provided at module level, otherwise this line raises NameError.
        bank_name = name_dict[bank_code]
        time_array = time.localtime(int(data["DEALTIME_"]))
        period_time = time.strftime("%Y%m%d", time_array)
        # A province matches when the first two characters of its name occur
        # in LOCATION_; the last matching province wins.
        for prov in province_list:
            if prov["NAME_"][:2] in data["LOCATION_"]:
                prov_n = prov["NAME_"]
                prov_c = prov["CODE_"]

        # Column family "C".
        # TODO: decide whether the row key should use a timestamp or
        # year-month-day.
        re_data["ID_"] = data["BANK_CODE_"] + "_" + period_time
        re_data["BANK_CODE_"] = bank_code
        re_data["BANK_NAME_"] = bank_name
        re_data["PERIOD_CODE_"] = period_time
        re_data["AREA_CODE_"] = prov_c
        re_data["CREATE_TIME_"] = period_time
        re_data["WEIBO_CODE_"] = data["WEIBO_CODE_"]
        re_data["MAIN_URL_"] = data["MAIN_URL_"]
        re_data["NAME_"] = data["NAME_"]
        re_data["FOCUS_"] = data["FOCUS_"]
        re_data["FANS_"] = data["FANS_"]
        re_data["COMPANY_URL_"] = data["COMPANY_URL_"]
        if "COMPANY_" not in data:
            re_data["COMPANY_"] = data["VIRIFIED_"]
        else:
            re_data["COMPANY_"] = data["COMPANY_"]
        re_data["DETAILED_URL_"] = data["DETAILED_URL_"]
        re_data["VIRIFIED_"] = bank_name + "股份有限公司"
        re_data["BIREF_"] = data["BIREF_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["ENTITY_CODE_"] = data["BANK_CODE_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["PROVINCE_NAME_"] = prov_n
        re_data["PROVINCE_CODE_"] = prov_c
        re_data["STATUS_"] = ""
        return re_data

    def delete_data_from_mongo(self, m_client, collection, remove_id_list):
        """Delete the given documents from the source collection.

        :param m_client: MongoDB client wrapper
        :param collection: MongoDB collection
        :param remove_id_list: list of ``_id`` values to delete
        :return: the value returned by ``m_client.remove_from_mongo`` or None
            when the deletion fails with anything but a timeout
        """
        m_client.mongo_entity_code = None
        try:
            return m_client.remove_from_mongo(collection=collection,
                                              remove_id_list=remove_id_list)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One immediate retry; a second timeout propagates.
            return m_client.remove_from_mongo(collection=collection,
                                              remove_id_list=remove_id_list)
        except Exception as e:
            self.logger.info(e)
            return None

    def _forget_record(self, data_id, copy_data):
        """Undo the bookkeeping entries of a document that was not stored."""
        self.remove_id_list.remove(data_id)
        # The copy is only queued once deepcopy succeeded.
        if copy_data is not None:
            self.copy_mongo_data_list.remove(copy_data)

    def _transfer_record(self, p_client, connection, data, province_list):
        """Clean one MongoDB document and upsert it; return True when stored."""
        data_id = data["_id"]
        self.remove_id_list.append(data_id)
        copy_data = None
        try:
            del data["_id"]
            copy_data = deepcopy(data)
            self.copy_mongo_data_list.append(copy_data)
            re_data = self.data_shuffle(data=data,
                                        province_list=province_list)
        except Exception as e:
            self._forget_record(data_id, copy_data)
            self.logger.warning("清洗错误,错误 _id 为{}, {}".format(data_id, e))
            # Skip the document. Previously execution fell through and tried
            # to upsert an empty placeholder, and a second removal of the
            # same id could abort the whole run with ValueError.
            return False
        if not re_data:
            self.bad_count += 1
            return False

        # Upsert the row into HBase.
        try:
            success_count = p_client.upsert_to_phoenix_by_one(
                connection=connection, data=re_data)
            self.success_count += success_count
        except Exception as e:
            self._forget_record(data_id, copy_data)
            self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(
                data_id, e))
            return False
        return True

    def run(self):
        """Recreate the Phoenix table and load every MongoDB document into it."""
        # Phoenix side.
        p_client = PhoenixHbase(table_name="WEIBOBASICINFO")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()

        # MongoDB source.
        m_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Area codes from MySQL.
        province_list, city_list, area_list, dir_area_list = (GenericScript(
            entity_code=None, entity_type=None).area_from_mysql())

        # The table is dropped and recreated on every run.
        p_client.drop_table_phoenix(connection=connection)
        sql = (
            'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,'
            '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,'
            '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,'
            '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,'
            '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,'
            '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
            '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,'
            '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        p_client.create_new_table_phoenix(connection=connection, sql=sql)

        self.logger.info("开始进行 WEIBOBASICINFO")
        try:
            mongo_data_list = m_client.all_from_mongodb(collection=collection)
        except pymongo.errors.ServerSelectionTimeoutError:
            time.sleep(1)
            mongo_data_list = m_client.all_from_mongodb(collection=collection)

        # Clean every document and upsert it into HBase.
        if mongo_data_list:
            self.find_count = mongo_data_list.count()
            for data in mongo_data_list:
                self._transfer_record(p_client, connection, data,
                                      province_list)
            if self.success_count > 0:
                self.logger.info("HBase 插入成功, 成功条数 {}".format(
                    self.success_count))
        else:
            # Nothing to transfer: stop here (connections are not closed).
            quit()

        # Deleting the transferred documents and archiving them into
        # spider_data_old is currently disabled for this script.

        # Close the connections and report the totals.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class MapBarTransfer(object):
    """Move MapBar branch records from MongoDB into a Phoenix/HBase table.

    ``main`` reads documents from the MongoDB collection, normalizes a few
    fields, geocodes the address, validates the coordinates against a
    bounding box and upserts each record into Phoenix.  ``check_lat`` copies
    rows from one Phoenix table to another.
    """

    # Bounding box (exclusive bounds) used to validate geocoded coordinates.
    # The original code labels these values "Beijing"; the previously used
    # "Shanghai" box was lat 30.7083860773..31.8739003864 and
    # lng 120.8778122800..122.1248433443.
    _LAT_BOUNDS = (39.4498800000, 41.1684980000)
    _LNG_BOUNDS = (115.4534230000, 117.5461160000)

    def __init__(self, table_name="CHA_BRANCH_MAPBAR", collection_name="mapbar"):
        """Open the Phoenix and MongoDB connections used by the transfer.

        :param table_name: Phoenix table the records are upserted into.
        :param collection_name: MongoDB collection the records are read from.
        """
        # Phoenix connection
        self.p_client = PhoenixHbase(table_name=table_name)
        self.connection = self.p_client.connect_to_phoenix()

        # MongoDB connection: the default client is replaced with one that
        # points at a fixed host/port.
        mongo_host = "172.22.69.35"
        mongo_port = 20000
        self.m_client = MongoClient(mongo_collection=collection_name,
                                    entity_code="MAPBAR_DEATAIL_BJ")
        self.m_client.mongo_host = mongo_host
        self.m_client.mongo_port = mongo_port
        # NOTE(review): both timeouts are given in milliseconds, so 60 means
        # 60 ms -- confirm this is not meant to be 60 seconds.
        self.m_client.client = pymongo.MongoClient(host=mongo_host,
                                                   port=mongo_port,
                                                   serverSelectionTimeoutMS=60,
                                                   connectTimeoutMS=60,
                                                   connect=False)
        self.db, self.collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=self.db, collection_list=self.collection_list)

        # Logger
        self.logger = Logger().logger

        # Number of rows copied by ``check_lat``
        self.count = 0

    def main(self):
        """Clean, geocode and upsert every matching MongoDB document.

        The target Phoenix table is expected to exist already (the DDL that
        created it was commented out in the original code).  Records that
        fail cleaning, geocoding or the upsert are logged and skipped; the
        cursor is always closed, even if an unexpected error propagates.
        """
        # Fetch data.  The filter values are hard-coded; presumably a resume
        # point from an earlier partial run -- TODO confirm.  (The original
        # also kept a commented-out ``all_from_mongodb`` full scan here.)
        mongo_data_list = self.m_client.search_from_mongodb(
            collection=self.collection,
            field_name="DEALTIME_",
            field_value={"$gt": "1555136656.0579224"},
            data_id="5cb65fac9bb3df61a09c6625")

        count = 0
        try:
            while True:
                data = self._next_document(mongo_data_list)
                if data is None:
                    break
                if not self._process_document(data):
                    continue
                count += 1
                if count % 100 == 0:
                    self.logger.info(
                        f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}")
        finally:
            # Close the MongoDB cursor on every exit path.
            mongo_data_list.close()

        self.logger.info(
            f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条"
        )

    @staticmethod
    def _next_document(cursor):
        """Return the next document from *cursor*, or ``None`` when exhausted.

        A server-selection timeout is retried once after a 3 second pause.
        Exhaustion during the retry also yields ``None`` instead of letting
        ``StopIteration`` escape; a second timeout propagates to the caller.
        """
        try:
            return next(cursor, None)
        except pymongo.errors.ServerSelectionTimeoutError:
            time.sleep(3)
            return next(cursor, None)

    def _process_document(self, data):
        """Clean, geocode and upsert one document.

        :return: ``True`` if the record was upserted, ``False`` if it was
            skipped (empty address) or failed at any stage (already logged).
        """
        # Clean
        try:
            self._clean_record(data)
        except Exception as e:
            self.logger.exception(f"数据清洗出错, _id: {data['_id']}, error {e}")
            return False

        # Look up latitude / longitude
        try:
            if not self._attach_location(data):
                return False
        except Exception as e:
            self.logger.exception(
                f"_id: {data['_id']} 获取经纬度失败, error: {e}")
            return False

        # Upsert one row into HBase
        try:
            re_data = self.__check_lat(data=data)
            self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=re_data)
        except Exception as e:
            self.logger.exception(
                f"HBase 插入失败, _id: {data['_id']}, error: {e}")
            return False
        return True

    @staticmethod
    def _clean_record(data):
        """Normalize ``PHONE_`` and ``UPDATETIME_`` of *data* in place.

        ``UPDATETIME_`` values containing ``YYYY年M月D日`` are rewritten as
        zero-padded ``YYYY-MM-DD``; other values are left untouched.
        """
        data["PHONE_"] = data["PHONE_"].replace("无,", "")
        matches = re.findall(r"(\d{4}年\d{1,2}月\d{1,2})日",
                             data["UPDATETIME_"])
        if matches:
            parts = matches[0].replace("年", "-").replace("月", "-").split("-")
            parts[1] = parts[1].zfill(2)
            parts[2] = parts[2].zfill(2)
            data["UPDATETIME_"] = "-".join(parts)

    def _attach_location(self, data):
        """Geocode ``ADDRESS_`` and store ``LNG_`` / ``LAT_`` on *data*.

        Everything before the first ``|`` of the address is dropped before
        geocoding.  A non-zero geocoder status is only logged; the record is
        still kept (without coordinates).

        :return: ``False`` when the address is empty (record must be
            skipped), ``True`` otherwise.
        """
        if not data["ADDRESS_"]:
            return False
        data["ADDRESS_"] = "".join(data["ADDRESS_"].split("|")[1:])
        location_result = get_lat_lng(address=data["ADDRESS_"])
        if location_result["status"] == 0:
            location = location_result["result"]["location"]
            data["LNG_"] = str(location["lng"])
            data["LAT_"] = str(location["lat"])
        else:
            self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
        return True

    def check_lat(self):
        """Copy every row of the ``FANSILE`` table into ``CHA_BRANCH_MAPBAR``.

        The per-row bounding-box re-validation that used to run here was
        commented out in the original code, so rows are copied verbatim.
        ``self.count`` is incremented for each row written.
        """
        # Read from the source table, then point the client at the target.
        # NOTE(review): both table names are hard-coded and ignore the
        # ``table_name`` given to the constructor -- confirm this is intended.
        self.p_client.table_name = "FANSILE"
        data_cursor = self.p_client.search_all_from_phoenix(
            connection=self.connection, dict_status=True)
        self.p_client.table_name = "CHA_BRANCH_MAPBAR"

        while True:
            try:
                data = next(data_cursor)
            except StopIteration:
                break
            self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=data)
            self.count += 1
            if self.count % 100 == 0:
                # Phoenix rows are keyed by "ID_"; fall back to "_id" so the
                # progress log can never abort the copy with a KeyError.
                row_id = data["ID_"] if "ID_" in data else data.get("_id")
                self.logger.info(
                    f"HBase 插入成功, _id: {row_id}, 成功条数 {self.count} 条")

    def __check_lat(self, data):
        """Validate the coordinates of *data* against the bounding box.

        Records without ``LAT_`` are returned untouched.  When either
        coordinate falls outside the box, the original values are moved to
        ``CHECK_LAT_`` / ``CHECK_LNG_`` and ``LAT_`` / ``LNG_`` are blanked.

        :param data: record dict, modified in place.
        :return: the same *data* dict.
        :raises ValueError: if a coordinate cannot be parsed as a float.
        """
        if "LAT_" not in data:
            return data

        lat_low, lat_high = self._LAT_BOUNDS
        lng_low, lng_high = self._LNG_BOUNDS
        # Longitude is only parsed when the latitude is already in range.
        if (lat_low < float(data["LAT_"]) < lat_high
                and lng_low < float(data["LNG_"]) < lng_high):
            return data

        self.logger.warning(
            f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}")
        data["CHECK_LAT_"] = data["LAT_"]
        data["CHECK_LNG_"] = data["LNG_"]
        data["LAT_"] = ""
        data["LNG_"] = ""
        return data