def insert_to_mysql(self, connection, data): """ 插入新数据 :param connection: :param data: type => tuple List or dict :return: """ mysql_logger = Logger().logger if isinstance(data, dict): k_list = [key for key in data.keys()] v_list = tuple([value for value in data.values()]) v_sql = str(v_list) if v_sql[-2] == ",": v_sql = v_sql[:-2] + ")" sql = f"INSERT INTO {self.mysql_table} ({','.join(k_list)}) VALUES{v_sql}" elif isinstance(data, (list, tuple)): k_list = [key for key in data[0].keys()] value_list = list() for each in data: v_list = str(tuple([value for value in each.values()])) if v_list[-2] == ",": v_list = v_list[:-2] + ")" value_list.append(v_list) sql = f"INSERT INTO {self.mysql_table} ({','.join(k_list)}) VALUES" sql = sql + ",".join(value_list) else: raise Exception("not format type of data") try: mysql_logger.info(f"网络声量sql==>{sql}") count = self.cs_commit(connection=connection, sql=sql) mysql_logger.info(f"MySQL 插入成功 {count} 条") except Exception as e: mysql_logger.exception(f"网络声量 插入失败,ERROR: {e}")
def all_to_mongodb(self, collection, insert_list): """ 插入数据 :param collection: :param insert_list: list :return: """ mon_logger = Logger().logger try: result = collection.insert_many(insert_list) mon_logger.info("MongoDB 数据插入成功, 成功条数 {} 条".format( len(result.inserted_ids))) return len(result.inserted_ids) except TypeError as e: mon_logger.info("MongoDB 数据插入失败,错误信息为 {}".format(e)) except pymongo.errors.ServerSelectionTimeoutError as e: mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e)) result = collection.insert_many(insert_list) mon_logger.info("数据插入成功, 成功条数 {} 条".format(len( result.inserted_ids))) return len(result.inserted_ids) except Exception as e: mon_logger.info("MongoDB 数据插入失败,错误信息为 {}".format(e)) return 0 finally: self.client_close()
def match_from_mongo(self, collection, match, output): """ 查询所有数据, 返回游标对象(聚合) :param collection: :param match: match condition like dict {"ENTITY_CODE_": "XXXXXXXXX"} :param output: output field like list or dict :return: """ mon_logger = Logger().logger if isinstance(output, str): output = [output] try: mon_logger.info("MongoDB 开始查取数据") output_condition = dict() for o in output: output_condition[o] = 1 result = collection.aggregate([{ "$match": match }, { "$project": output_condition }]) mon_logger.info("MongoDB 数据查取成功") return result except TypeError as e: mon_logger.error( "WEIBO_CODE_ 数据查取失败,错误信息为{}, 请检查匹配规则是否正确:{}".format(e, match)) # raise Exception("WEIBO_CODE_ 查取失败, 错误信息为{}".format(e)) finally: self.client_close()
def match_from_mongo(self, collection, match, output): mon_logger = Logger().logger try: mon_logger.info("开始查取数据") result = collection.aggregate([{ "$match": match }, { "$project": { "budgetPrice": 1, "_id": 0, output: 1 } }]) for i in result: if i is not None: mon_logger.info("数据查取成功") return i[output] else: mon_logger.error("WEIBO_CODE_ 查取数据为空") # raise Exception("WEIBO_CODE_ 查取失败") except TypeError as e: mon_logger.error( "WEIBO_CODE_ 数据查取失败,错误信息为{}, 请检查匹配规则是否正确:{}".format(e, match)) raise Exception("WEIBO_CODE_ 查取失败, 错误信息为{}".format(e)) finally: self.client_close()
def search_from_mysql(self, connection, output=None, where_condition=None, limit_num=None, offset_num=None): """ 查询 :param connection: :param output: 输出字段 :param where_condition: where 条件 :param limit_num: 输出数量 :param offset_num: 跳过数量 :return: """ mysql_logger = Logger().logger if output: if isinstance(output, str): sql = f"SELECT {output} FROM {self.mysql_table}" elif isinstance(output, (tuple, list)): sql = f"SELECT {','.join(output)} FROM {self.mysql_table}" else: raise Exception("not format type of \"output\"") else: sql = f"SELECT * FROM {self.mysql_table}" if where_condition: if "where" in where_condition or "WHERE" in where_condition: sql = sql + " " + where_condition else: sql = sql + f" WHERE {where_condition}" sql = sql + f" LIMIT {limit_num}" if limit_num else sql sql = sql + f" OFFSET {offset_num}" if offset_num else sql try: cs = connection.cursor(pymysql.cursors.DictCursor) count = cs.execute(sql) result = cs.fetchall() if count: mysql_logger.info(f"Mysql 查取成功 {count} 条") return result else: mysql_logger.info("数据库查取数为0") except TypeError: mysql_logger.error("MySQL查取失败,请检查") finally: cs.close()
def get_mongo_column_dict(self, collection, column1, column2): mon_logger = Logger().logger try: mon_logger.info("开始查取数据") result = collection.aggregate([{ "$project": { "_id": 0, column1: 1, column2: 1 } }]) return result except TypeError as e: mon_logger.error( "WEIBO_CODE_ 数据查取失败,错误信息为{}, 请检查匹配规则是否正确".format(e)) raise Exception("WEIBO_CODE_ 查取失败, 错误信息为{}".format(e)) finally: self.m_client.client.close()
def delete_from_mysql(self, connection, where_condition): """ 删除 :param connection: :param where_condition: where 条件 :return: """ mysql_logger = Logger().logger if "where" in where_condition or "WHERE" in where_condition: sql = f"DELETE FROM {self.mysql_table} {where_condition}" else: sql = f"DELETE FROM {self.mysql_table} WHERE {where_condition}" try: count = self.cs_commit(connection=connection, sql=sql) mysql_logger.info(f"MySQL 删除成功 {count} 条") except Exception as e: mysql_logger.exception(f"MySQL 删除失败,ERROR: {e}")
def client_to_mysql(self): ''' :return: 返回mysql连接 ''' mysql_logger = Logger().logger try: mysql_logger.info("正在连接MySQL({}@{}:{})".format( self.mysql_user, self.mysql_host, self.mysql_port)) connection = pymysql.connect(**self.mysql_config) mysql_logger.info("Mysql连接成功({}@{}:{})".format( self.mysql_user, self.mysql_host, self.mysql_port)) return connection except pymysql.err.OperationalError as e: for retry_count in range(2, 7): try: mysql_logger.warning( "MySQL连接失败,正在重试第{}次连接".format(retry_count)) connection = pymysql.connect(**self.mysql_config) mysql_logger.info("Mysql连接成功") return connection except Exception as e: mysql_logger.warning("第{}次连接MySQL失败".format(retry_count)) # print(retry_count) if retry_count == 6: mysql_logger.error("MySQL连接失败,错误信息为{}".format(e))
def client_to_mongodb(self): mon_logger = Logger().logger mon_logger.info("开始连接MongoDB({}:{}),database={}".format( self.mongo_host, self.mongo_port, self.mongo_database)) try: collection_list = self.db.collection_names() mon_logger.info("MongoDB({}:{})连接成功".format( self.mongo_host, self.mongo_port)) return collection_list except pymongo.errors.ServerSelectionTimeoutError as e: mon_logger.warning("MongoDB({}:{})连接失败".format( self.mongo_host, self.mongo_port)) for i in range(2, 6): try: collection_list = self.db.collection_names() mon_logger.info("MongoDB({}:{})连接成功".format( self.mongo_host, self.mongo_port)) return collection_list except Exception: mon_logger.warning("MongoDB({}:{})第{}次连接失败".format( self.mongo_host, self.mongo_port, i)) if i == 5: mon_logger.error( "MongoDB连接失败,错误信息为: {}, 请检查各项参数是否正确host={}, port={},database={}" .format(e, self.mongo_host, self.mongo_port, self.mongo_database)) self.client_close()
def search_by_status(self, collection, data_id=None): mon_logger = Logger().logger try: mon_logger.info("开始查取数据") if data_id: find_id = ObjectId(data_id) result_one = collection.find_one({ "$and": [{ "ENTITY_CODE_": self.mongo_entity_code }, { "_id": { "$gte": find_id } }, { "d": { "$exists": False } }] }) else: result_one = collection.find_one({ "$and": [{ "ENTITY_CODE_": self.mongo_entity_code }, { "d": { "$exists": False } }] }) if result_one is not None: result = collection.find( { "$and": [{ "ENTITY_CODE_": self.mongo_entity_code }, { "_id": { "$gte": result_one["_id"] } }, { "d": { "$exists": False } }] }, no_cursor_timeout=True) mon_logger.info("ENTITY: {} 数据查取成功共 {}条".format( result.count())) return result else: mon_logger.info("ENTITY: {} 数据查取为空".format( self.mongo_entity_code)) return None except TypeError as e: mon_logger.error( "MongoDB数据查取失败,错误信息为{}, 请检查 ENTITY_CODE_ 是否正确:{}".format( e, self.mongo_entity_code)) finally: self.client_close()
def update_to_mysql(self, connection, data, where_condition): """ 更新数据 :param connection: :param data: :param where_condition: where 条件 :return: """ mysql_logger = Logger().logger set_list = list() for key, value in data.items(): set_list.append(f"{key} = \'{value}\'") if "where" in where_condition or "WHERE" in where_condition: sql = f"UPDATE {self.mysql_table} SET {','.join(set_list)} {where_condition}" else: sql = f"UPDATE {self.mysql_table} SET {','.join(set_list)} WHERE {where_condition}" try: count = self.cs_commit(connection=connection, sql=sql) mysql_logger.info(f"MySQL 更新成功 {count} 条") except Exception as e: mysql_logger.exception(f"MySQL 更新失败,ERROR: {e}")
class FundScript(object): def __init__(self): self.code_list = [ "STCNFUND", "ABCFUND", "CCBFUND", "CITICFUND", "ICBCFUND" ] self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.copy_mongo_data_list = list() self.remove_id_list = list() self.branch_code_list = list() # 基金 self.verify_list = [ "ENTITY_CODE_", "ENTITY_NAME_", "URL_", "PERIOD_CODE_", "STATUS_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_", "FUND_NEW_VALUE_", "TOTAL_NEW_VALUE_", "FUND_OLD_VALUE_", "TOTAL_OLD_VALUE_", "DAILY_RATE_", "YEAR_REWARD_", "SUBS_STATUS_", "ATONEM_STATUS_", "TYPE_", "ID_", "NEWEST_VALUE_", "TOTAL_VALUE_", "POPULARITY_", "RATING_", "OLD_VALUE_", "UNIT_VALUE_", "SCALE_", "ESTABLISH_DATE_", "RISK_LEVEL_", "BASE_INFO_", "YIELD_", "INVEST_", "MONTH_RATE_", "QUARTER_RATE_", "HALF_YEAR_RATE_", "HISTORY_RATE_", "FUND_STATUS_", "COMPANY_", "SUBS_STATUS_CODE_", "TYPE_CODE_" ] # 从 MongoDB 获取数据 def get_data_from_mongo(self, m_client, collection, entity_code): m_client.mongo_db = "spider_data" m_client.mongo_entity_code = entity_code try: mongo_data_list = m_client.search_from_mongodb(collection) return mongo_data_list except pymongo.errors.ServerSelectionTimeoutError: self.logger.info("连接失败,正在重新连接") sleep(1) mongo_data_list = m_client.search_from_mongodb(collection) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None # 从 MongoDB 删除数据 def delete_data_from_mongo(self, m_client, collection, entity_code, remove_id_list): m_client.mongo_entity_code = entity_code try: remove_count = m_client.remove_from_mongo( collection=collection, remove_id_list=remove_id_list) return remove_count except pymongo.errors.ServerSelectionTimeoutError: mongo_data_list = m_client.remove_from_mongo( collection=collection, remove_id_list=remove_id_list) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None # 网点 CODE_ hash_m = hashlib.md5() hash_m.update(re_data["ADDR_"].encode("utf-8")) hash_addr_ = hash_m.hexdigest() re_data["CODE_"] = re_data["BANK_CODE_"] + "_" + re_data[ "AREA_CODE_"] + "_" + hash_addr_ # for i in range(1, 10000): # branch_code = "ABC" + "_" + re_data["AREA_CODE_"] + "_" + "00000" # branch_code = branch_code[:len(branch_code)-len(str(i))] + "{}".format(i) # if branch_code in branch_code_list: # continue # else: # branch_code_list.append(branch_code) # break return re_data def gaode_get_lat_lng(self, address): url = URL_FOR_LAT_LNG + "?" + "key=" + AK + "&address=" + address # url = url + "?location={}&output=json&pois=1&ak={}".format(address, ak) response = requests.get(url) temp = json.loads(response.content) response.close() return temp['geocodes'][0]['location'] def dict_from_mysql(self, dict_code): # 创建 MySQL 对象 mysql_config = { "host": "172.22.67.25", "port": 3306, "database": "chabei", "user": "******", "password": "******", "table": "sys_dict_item" } mysql_client = MysqlClient(**mysql_config) mysql_connection = mysql_client.client_to_mysql() result = mysql_client.search_area_code( sql= "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'{}\'" .format(dict_code), connection=mysql_connection) mysql_client.close_client(connection=mysql_connection) return result # 主函数 def run(self): count = 0 # # 创建 Phoenix 对象-注意表格名字 p_client = PhoenixHbase(table_name="FUND") p_client.verify_list = self.verify_list # # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="JSFUND_CCBDATA") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) #查询省市区的编码列表 # script = GenericScript(entity_code="ICBCFUND", entity_type="JSFUND_CCBDATA") # province_list, city_list, area_list, dir_area_list = script.area_from_mysql() list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS") list_TYPE = self.dict_from_mysql("FUND_TYPE") # # 删除表 # p_client.drop_table_phoenix(connection=connection) # # 基金表创建语句 # sql = ('create table "FUND" ("ID_" varchar primary key,"C"."ENTITY_CODE_" varchar,"C"."AREA_CODE_" varchar,' # '"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,"C"."UNIT_CODE_" varchar,"C"."PERIOD_CODE_" varchar,"C"."REMARK_" varchar,' # '"C"."CREATE_TIME_" varchar,"C"."UPDATE_TIME_" varchar,"C"."STATUS_" varchar,"C"."CODE_" varchar,"C"."NAME_" varchar,' # '"C"."FUND_OLD_VALUE_" varchar,"C"."TOTAL_OLD_VALUE_" varchar,"C"."FUND_NEW_VALUE_" varchar,"C"."TOTAL_NEW_VALUE_" varchar,' # '"C"."INVEST_PERIOD_" varchar,"C"."DAILY_RATE_" varchar,"C"."YEAR_REWARD_" varchar,"C"."SUBS_STATUS_" varchar,' # '"C"."ATONEM_STATUS_" varchar,"C"."TYPE_" varchar,"C"."NEWEST_VALUE_" varchar,"C"."TOTAL_VALUE_" varchar,' # '"C"."POPULARITY_" varchar,"C"."RATING_" varchar,"C"."ENTITY_NAME_" varchar,"C"."OLD_VALUE_" varchar,' # '"C"."UNIT_VALUE_" varchar,"C"."SCALE_" varchar,"C"."ESTABLISH_DATE_" varchar,"C"."RISK_LEVEL_" varchar,' # '"C"."BASE_INFO_" varchar,"C"."YIELD_" varchar,"C"."INVEST_" varchar,"C"."MONTH_RATE_" varchar,' # '"C"."QUARTER_RATE_" varchar,"C"."HALF_YEAR_RATE_" varchar,"C"."URL_" varchar,"C"."HISTORY_RATE_" varchar,' # '"C"."FUND_STATUS_" varchar,"C"."COMPANY_" varchar,"C"."SUBS_STATUS_CODE_" varchar,"C"."TYPE_CODE_" varchar)IMMUTABLE_ROWS = true') # # # 创建表 # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 遍历 ENTITY_CODE_ 列表 # self.code_list = ["ABCORGANIZE"] for entity_code in self.code_list: status = False module_name = __import__(entity_code) self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code)) self.remove_id_list = [] self.copy_mongo_data_list = [] self.branch_code_list = [] try: mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code) except pymongo.errors.ServerSelectionTimeoutError: sleep(1) mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 self.find_count = mongo_data_list.count() for data in mongo_data_list: data_id = data["_id"] copy_data = {} self.remove_id_list.append(data_id) try: del data["_id"] copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) # re_data = module_name.data_shuffle(data_list,province_list, city_list, area_list,list_SUBS_STATUS,list_TYPE) re_data = module_name.data_shuffle( data, list_SUBS_STATUS, list_TYPE) # re_data = module_name.data_shuffle(data_list) if not re_data: self.bad_count += 1 continue except Exception as e: # except jpype._jexception.SQLExceptionPyRaisable as e: # except org.apache.phoenix.exception.BatchUpdateExecution as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) continue if isinstance(re_data, list): for list_data in re_data: # try: # area_data = self.shuffle_for_area(list_data) # except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) # self.logger.warning("_id:{} 获取经纬度失败, {}".format(data_id, e)) # continue # except ValueError: # pass # phoenix_HBase 插入数据 if list_data: try: count += 1 print(count) # print(list_data) success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=list_data) # 导出csv # pd.DataFrame(area_data).to_csv("E:\\NEWS_CLEAN_\\" + module_name+ ".csv") once_count += success_count self.success_count += success_count self.logger.info( "HBase 插入成功, 成功条数 {} 条".format( success_count)) if self.success_count % 50 == 0: update_count = m_client.update_to_mongodb( collection=collection, data_id=self.remove_id_list, data_dict={"d": 1}) self.remove_count += update_count self.logger.info("MongoDB 更新成功") except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue elif isinstance(re_data, dict): # try: # area_data = self.shuffle_for_area(re_data) # except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) # self.logger.warning("_id: {}获取经纬度失败, {}".format(data_id, e)) # continue # phoenix_HBase 插入数据 if re_data: try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=re_data) once_count += success_count self.success_count += success_count if self.success_count % 100 == 0: self.logger.info( "HBase 插入成功, 成功条数 {} 条".format( self.success_count)) # 添加 {d:1} if self.success_count % 50 == 0: update_count = m_client.update_to_mongodb( collection=collection, data_id=self.remove_id_list, data_dict={"d": 1}) self.remove_count += update_count self.logger.info("MongoDB 更新成功") except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if once_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count)) else: continue # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class WechatScript(object): def __init__(self, entity_type="WECHAT"): """ 初始化参数 :param entity_type: WECHAT """ self.entity_type = entity_type self.logger = Logger().logger # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name=self.entity_type) # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="WECHAT") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # # 创建 MongoDB spider_data_old 数据库对象 # self.old_client = MongoClient(mongo_collection="WECHAT") # # 本地测试 # self.old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60, # connectTimeoutMS=60, connect=False) # self.old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = self.old_client.client_to_mongodb() # self.collection_old = db_old["WECHAT"] # 创建 MySQL 对象 self.mysql_client = GenericScript(entity_code=None, entity_type=None) self.remove_id_list = list() self.copy_mongo_data_list = list() self.verify_list = [ "ID_", "ENTITY_CODE_", "URL_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_", "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_", "CONTENT_TYPE_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "TITLE_", "ENTITY_NAME_", "DEALTIME_", "DATETIME_", "STATUS_", "WECHAT_NAME_", "WECHAT_ID_" ] # BANK_NAME_ 字典 self.name_dict = { "ICBC": "中国工商银行", "ABC": "中国农业银行", "BOC": "中国银行", "CCB": "中国建设银行", "BOCOM": "交通银行", "PSBC": "中国邮政储蓄银行", "CZB": "浙商银行", "CBHB": "渤海银行", "ECITIC": "中信银行", "CEB": "中国光大银行", "HXB": "华夏银行", "CMBC": "中国民生银行", "CMB": "招商银行", "CIB": "兴业银行", "CGB": "广发银行", "PAB": "平安银行", "SPDB": "浦发银行", "EBCL": "恒丰银行" } self.headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", "Accept-Language": "zh-CN,zh;q=0.9", "Host": "weixin.sogou.com", "Referer": "http://weixin.sogou.com/" } self.url = "http://weixin.sogou.com/weixin?type=1&query={}&ie=utf8&s_from=input&_sug_=y&_sug_type_=" self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def check_name(self, wechat_id): check_dict = dict() with open("wechat_id_name.txt", "r", encoding="utf-8") as rf: read_data = rf.read() if read_data: read_data = read_data.replace("\'", "\"") read_data = read_data.replace(": None", ": \"None\"") # print(read_data) check_dict = json.loads(read_data) else: wechat_name = self.req_for_name(wechat_id) check_dict[wechat_id] = wechat_name with open("wechat_id_name.txt", "w", encoding="utf-8") as wf: wf.write(str(check_dict)) return check_dict[wechat_id] if wechat_id in check_dict: return check_dict[wechat_id] else: wechat_name = self.req_for_name(wechat_id) check_dict[wechat_id] = wechat_name with open("wechat_id_name.txt", "w", encoding="utf-8") as wf: wf.write(str(check_dict)) return check_dict[wechat_id] def req_for_name(self, wechat_id): url = self.url.format(wechat_id) # response = WanDou().http_client(url=url, param=self.headers) resp1 = requests.get( url= r"http://h.wandouip.com/get/ip-list?pack=853&num=1&xy=1&type=2&lb=\r\n&mr=1&" ) resp2 = resp1.json()["data"][0] # print(resp2) # resp1.close() time.sleep(2) try: response = requests.get( url=url, headers=self.headers, proxies={"http": "{}:{}".format(resp2["ip"], resp2["port"])}) except Exception as e: print(1, e) self.logger.info("error ip: {}".format(resp2)) time.sleep(5) return self.req_for_name(wechat_id) html = HTML(response.content.decode()) # response.close() name = html.xpath('//p[@class="tit"]/a/text()') if name: # print(name) self.error_count = 0 return name[0] else: self.error_count += 1 if self.error_count == 5: self.logger.info("wetchat id error: \"{}\"".format(wechat_id)) return "None" else: time.sleep(2) self.req_for_name(wechat_id) # if response is None: # self.logger.info("ip_prox error") # return self.req_for_name(wechat_id) # if isinstance(response, str): # html = HTML(response) # name = html.xpath('//p[@class="tit"]/a/text()') # if name: # print(name) # return name[0] # else: # self.logger.info("ip_prox error2") # return self.req_for_name(wechat_id) # else: # self.logger.info("ip_prox error change") # return self.req_for_name(wechat_id) def data_shuffle(self, data, province_list, city_list, area_list): """ 数据清洗 :param data: :param province_list: :param city_list: :param area_list: :return: re_data or None """ # BANK_CODE_正则匹配规则 pattern = re.compile( r'ICBC|ABC|BOCOM|CCB|BOC|PSBC|CZB|CBHB|ECITIC|CEB|HXB|CMBC|CMB|CIB|CGB|PAB|SPDB|EBCL' ) re_data = dict() if data["TITLE_"]: # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" 通用列族字段 re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] # re_data["URL_"] = "" prov_c = None prov_n = None city_c = None city_n = None area_c = None area_n = None bank_n = None bank_c = pattern.match(data["ENTITY_CODE_"]) if bank_c: re_data["BANK_CODE_"] = bank_c.group() else: return None # 正则去除银行名称,方便匹配地区编码 bank_n = re.sub( r"{}银?行?|微信|[总分支]行".format( self.name_dict[re_data["BANK_CODE_"]][:-2]), "", data["ENTITY_NAME_"]) re_data["BANK_NAME_"] = self.name_dict[re_data["BANK_CODE_"]] re_data["PERIOD_CODE_"] = data["PERIOD_CODE_"].replace("-", "") re_data["NOTICE_TIME_"] = data["PERIOD_CODE_"] re_data["STATUS_"] = "1" re_data["CONTENT_"] = data["CONTENT_"] re_data["REMARK_"] = "" for area in area_list: if area["NAME_"] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] if area_c: pass else: for prov in province_list: if prov["NAME_"] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n, "") break elif prov["NAME_"][:-1] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:-1], "") break elif prov["NAME_"][:4] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:4], "") break elif prov["NAME_"][:3] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:3], "") break elif prov["NAME_"][:2] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:2], "") break for city in city_list: if len(city["NAME_"]) == 1: continue if prov_c: if city["CODE_"][:2] == prov_c[:2]: if city["NAME_"] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n, "") break elif city["NAME_"][:-1] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:-1], "") break elif city["NAME_"][:4] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:4], "") break elif city["NAME_"][:3] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:3], "") break elif city["NAME_"][:2] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:2], "") break else: if city["NAME_"] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n, "") break elif city["NAME_"][:-1] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:-1], "") break elif city["NAME_"][:4] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:4], "") break elif city["NAME_"][:3] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:3], "") break elif city["NAME_"][:2] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:2], "") break for area in area_list: if city_c: if area["CODE_"][:2] == city_c[:2]: if area["NAME_"] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:-1] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:4] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:3] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:2] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif prov_c: if area["CODE_"][:2] == prov_c[:2]: if area["NAME_"] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:-1] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:4] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:3] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:2] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break else: if area["NAME_"][:-1] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:4] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:3] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:2] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break # 特殊情况 星子县现为庐山市 喻家山位于武汉洪山区 if "星子县" in data["ENTITY_NAME_"]: area_c = "360483" area_n = "庐山市" elif "喻家山" in data["ENTITY_NAME_"]: area_c = "420111" area_n = "洪山区" elif "江南西" in data["ENTITY_NAME_"]: area_c = "440105" area_n = "海珠区" elif "两路口" in data["ENTITY_NAME_"]: area_c = "500103" area_n = "渝中区" elif "大兴安岭" in data["ENTITY_NAME_"]: area_c = "232700" area_n = "大兴安岭地区" elif "张家港" in data["ENTITY_NAME_"]: area_c = "320582" area_n = "张家港市" elif "兴业银行新阳支行" in data["ENTITY_NAME_"]: area_c = "230102" area_n = "道里区" if area_c: pass elif (not area_c) and city_c: area_c = city_c area_n = city_n elif (not area_c) and (not city_c) and prov_c: area_c = prov_c area_n = prov_n # 总行地区处理 elif (not area_c) and (not city_c) and (not prov_c): if re_data["BANK_CODE_"] == "ICBC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "ABC": area_c = "110101" area_n = "东城区" elif re_data["BANK_CODE_"] == "BOCOM": area_c = "310115" area_n = "浦东新区" elif re_data["BANK_CODE_"] == "CCB": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "BOC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "PSBC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "CZB": area_c = "330103" area_n = "下城区" elif re_data["BANK_CODE_"] == "CBHB": area_c = "120103" area_n = "河西区" elif re_data["BANK_CODE_"] == "ECITIC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "CEB": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "HXB": area_c = "110101" area_n = "东城区" elif re_data["BANK_CODE_"] == "CMBC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "CMB": area_c = "440304" area_n = "福田区" elif re_data["BANK_CODE_"] == "CIB": area_c = "350102" area_n = "鼓楼区" elif re_data["BANK_CODE_"] == "CGB": area_c = "440104" area_n = "越秀区" elif re_data["BANK_CODE_"] == "PAB": area_c = "440303" area_n = "罗湖区" elif re_data["BANK_CODE_"] == "SPDB": area_c = "310101" area_n = "黄浦区" elif re_data["BANK_CODE_"] == "EBCL": area_c = "370602" area_n = "芝罘区" re_data["AREA_CODE_"] = area_c if area_c: re_data["UNIT_CODE_"] = re_data[ "BANK_CODE_"] + "_" + area_c[:4] + "00" if ("b" in data["BANK_NAME_"]) or ("B" in data["BANK_NAME_"]): return None if "DATETIME_" not in data: time_array = time.localtime(int(float(data["DEALTIME_"]))) value_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) re_data["CREATE_TIME_"] = value_time else: re_data["CREATE_TIME_"] = data["DATETIME_"] # data["UPDATE_TIME_"] = "" re_data["TITLE_"] = data["TITLE_"] re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"] re_data["WECHAT_ID_"] = data["WECHAT_"].replace(" ", "") re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["DEALTIME_"] = str(data["DEALTIME_"]) # print(area_c, area_n, data["ENTITY_NAME_"]) return re_data else: return None def delete_data_from_mongo(self): """ 从 MongoDB 删除数据 :return: delete_count """ try: remove_count = self.m_client.remove_from_mongo( collection=self.collection, remove_id_list=self.remove_id_list) return remove_count except pymongo.errors.ServerSelectionTimeoutError: mongo_data_list = self.m_client.remove_from_mongo( collection=self.collection, remove_id_list=self.remove_id_list) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None def upsert_and_delete(self, mongo_data_list, province_list, city_list, area_list): """ 插入和删除 :param mongo_data_list: :param province_list: :param city_list: :param area_list: :return: """ for i in range(1000000): status = False self.data_id = "" success_count = 0 try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) # print(data["_id"]) # self.remove_id_list.append(self.data_id) # del data["_id"] # copy_data = deepcopy(data) # self.copy_mongo_data_list.append(copy_data) # 清洗数据 try: re_data = self.data_shuffle(data=data, province_list=province_list, city_list=city_list, area_list=area_list) except Exception as e: # self.remove_id_list.remove(self.data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # 获取公众号名称 # try: # print(re_data["WECHAT_ID_"]) re_data["WECHAT_NAME_"] = self.check_name( re_data["WECHAT_ID_"]) # re_data["WECHAT_NAME_"] = self.req_for_name(re_data["WECHAT_ID_"]) # print(re_data["WECHAT_ID_"]) # print(re_data["WECHAT_NAME_"]) # except Exception as e: # 向 HBase 插入数据 try: count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) success_count += count except jaydebeapi.DatabaseError as e: # self.logger.info("error: {}".format(e)) # self.remove_id_list.remove(self.data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # # Phoenix 连接关闭 # p_client.close_client_phoenix(connection=connection) # time.sleep(10) # # 连接 Phoenix # connection = p_client.connect_to_phoenix() # # 向 HBase 插入数据 # count = p_client.upsert_to_phoenix_by_one(connection=connection, data=re_data) # success_count += count # try: # # 添加 {d:1} # update_count = self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, # data_dict={"d": 1}) # self.remove_count += update_count # # self.logger.info("MongoDB 更新成功") # if self.remove_count % 10 == 0: # self.logger.info("MongoDB 更新成功, 成功条数 {} 条".format("10")) # except Exception as e: # # self.remove_id_list.remove(data_id) # # self.copy_mongo_data_list.remove(copy_data) # self.logger.warning("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e)) # continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 # self.remove_id_list.remove(self.data_id) # self.copy_mongo_data_list.remove(copy_data) continue # # 删除数据 # if status: # delete_count = self.delete_data_from_mongo() # self.remove_count += delete_count # self.logger.info("MongoDB 删除成功") # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # # 将数据插入 spider_data_old 中 # if status: # try: # self.old_client.mongo_db = "spider_data_old" # insert_count = self.old_client.all_to_mongodb(collection=self.collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except pymongo.errors.ServerSelectionTimeoutError as e: # time.sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = self.old_client.all_to_mongodb(collection=self.collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except Exception as e: # self.logger.info(e) def main(self): """ :return: """ # # 删除表 # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # 建表语句 # table_sql = ('create table "WECHAT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar,' # '"C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar,' # '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, ' # '"T"."CONTENT_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."TITLE_" varchar,' # '"C"."WECHAT_ID_" varchar, "C"."WECHAT_NAME_" varchar, "C"."ENTITY_NAME_" varchar,' # '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar, "C"."PRAISES_" varchar,' # '"C"."READ_NUM_" varchar, "C"."REPLIES_" varchar, "C"."RELAYS_" varchar,' # '"C"."NOTICE_TIME_" varchar, "C"."IMPROTANCE_" varchar) IMMUTABLE_ROWS = true') # # # 创建表 # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) # f_id = "5c1267258d7fee59f7d089f8" # gte 10M # f_id = "5c1271a28d7fee66df0fdd83" # gte 10M # f_id = "5c127e7b9bb3df7412b53b04" # gte 10M # f_id = "5c1330d28d7fee4d9c87d6e1" # gte 10M # f_id = "5c1330ed9bb3df2de33bb746" # gte 10M # f_id = "5c13490a8d7fee79f1d9e87f" # gte 10M # f_id = "5c1350ee8d7fee2d29b601ef" # gte 10M # f_id = "5c1351c79bb3df0e23ee68c1" # gte 10M # f_id = "5c13547d9bb3df06d41997d5" # gte 10M # f_id = "5c1354849bb3df202508ee3e" # gte 10M # f_id = "5c1354bd8d7fee44b881b11a" # gte 10M # f_id = "5c1354e89bb3df1b2a6ef59c" # gte 10M # f_id = "5c1355139bb3df197beb11c0" # gte 10M # f_id = "5c1355328d7fee2f0997a3ac" # gte 10M # f_id = "5c13558e8d7fee50ea04bd0a" # gte 10M # f_id = "5c135a5f8d7fee5bf7db91b8" # gte 10M # f_id = "5c135b0c8d7fee697fa5bd80" # gte 10M # f_id = "5c135bd59bb3df4d7aa66cad" # gte 10M # f_id = "5c135bdb9bb3df454c0157a3" # gte 10M # f_id = "5c135bfc8d7fee73c8f84567" # gte 10M # f_id = "5c135c119bb3df48aeb8fe63" # gte 10M # f_id = "5c135dfe9bb3df4d7aa66cc2" # gte 10M # f_id = "5c13602d8d7fee7f7a48c485" # gte 10M # f_id = "5c1361858d7fee223825f805" # gte 10M # f_id = "5c1361d68d7fee561806fc4d" # gte 10M # f_id = "5c1362068d7fee223825f808" # gte 10M # f_id = "5c1362159bb3df26bba60a05" # gte 10M # f_id = "5c1366248d7fee6741adb5be" # gte 10M # f_id = "5c1366418d7fee673f6c95cb" # gte 10M # f_id = "5c1367099bb3df5a0e013c4d" # gte 10M # f_id = "5c13686d8d7fee76ac78735b" # gte 10M # f_id = "5c1368788d7fee6fcb24daa3" # gte 10M # f_id = "5c1369438d7fee63412b04ff" # gte 10M # f_id = "5c13697b9bb3df60429b5d31" # gte 10M # f_id = "5c1389468d7fee6a94c413c3" # gte 10M # f_id = "5c1389c29bb3df75adc8861a" # gte 10M # f_id = "5c138b039bb3df75adc88620" # gte 10M # f_id = "5c138e3d9bb3df074c4ec0b3" # gte 10M # f_id = "5c138e4d8d7fee06a4f8fd59" # gte 10M # f_id = "5c1391318d7fee168749a96e" # gte 10M # f_id = "5c25a4f19bb3df51eba386b8" # gte 10M # f_id = "5c2601ef9bb3df7d42fe2084" # gte 10M # f_id = "5c2608099bb3df24f5db4527" # gte 10M # f_id = "5c2608be9bb3df2d58d08e32" # gte 10M # f_id = "5c260d2b9bb3df3c084d2a83" # gte 10M # f_id = "5c2615868d7fee2771bb3914" # gte 10M # f_id = "5c261d528d7fee3c1383db85" # gte 10M # f_id = "5c26340e8d7fee66d784fe8a" # gte 10M # f_id = "5c263b818d7fee630f0d3ac4" # gte 10M # f_id = "5c263ee28d7fee04ddc62e31" # gte 10M # f_id = "5c263f269bb3df0d29d1e1e5" # gte 10M # f_id = "5c2766718d7fee2aa36fa166" # gte 10M # f_id = "5c2b79ef8d7fee3025e02575" # gte 10M # f_id = "5c2b854a9bb3df27dc669d5a" # gte 10M # f_id = "5c2e00078d7fee1b60443cf3" # gte 10M # f_id = "5c2f69028d7fee62d31a72db" # gte 10M # f_id = "5c36a7948d7fee18d9333327" # gte 10M # f_id = "5c36b9ff9bb3df332dfebe39" # gte 10M # f_id = "5c3754579bb3df02b680150b" # gte 10M # f_id = "5c375c969bb3df6afd18e22d" # gte 10M # f_id = "5c38a1e59bb3df6b2ff2f269" # gte 10M # f_id = "5c394e058d7fee6a2582d1d3" # gte 10M # f_id = "5c3c983e9bb3df21ddf94a92" # gte 10M # f_id = "5c3ca38a9bb3df60bca07833" # gte 10M f_id = "5c3c983e9bb3df21ddf94a92" # f_id = "" self.data_id = f_id province_list, city_list, area_list, dir_area_list = self.mysql_client.area_from_mysql( ) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection, data_id=self.data_id) self.find_count += mongo_data_list.count() try: self.upsert_and_delete(mongo_data_list=mongo_data_list, province_list=province_list, city_list=city_list, area_list=area_list) except jaydebeapi.DatabaseError: self.logger.info("error id is: {}".format(self.data_id)) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection, data_id=self.data_id) self.upsert_and_delete(mongo_data_list=mongo_data_list, province_list=province_list, city_list=city_list, area_list=area_list) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class FinProductScript(object): def __init__(self): # 创建 MySQL 对象 __mysql_config = { "host": MYSQL_HOST_25, "port": MYSQL_PORT_25, "database": MYSQL_DATABASE_25, "user": MYSQL_USER_25, "password": MYSQL_PASSWORD_25, "table": MYSQL_TABLE_25 } __mysql_client = MysqlClient(**__mysql_config) __mysql_connection = __mysql_client.client_to_mysql() self.sales_status = __mysql_client.search_area_code( sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'SALES_STATUS\'", connection=__mysql_connection) self.produc_category = __mysql_client.search_area_code( sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'PRODUC_CATEGORY\'", connection=__mysql_connection) self.revenue_type = __mysql_client.search_area_code( sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'REVENUE_TYPE\'", connection=__mysql_connection) self.operaton_pattern = __mysql_client.search_area_code( sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'OPERATION_PATTERN\'", connection=__mysql_connection) self.purchase_amount = __mysql_client.search_area_code( sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'PURCHASE_AMOUNT\'", connection=__mysql_connection) self.duration_type = __mysql_client.search_area_code( sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'DURATION_TYPE\'", connection=__mysql_connection) __mysql_client.close_client(connection=__mysql_connection) self.logger = Logger().logger self.remove_id_list = list() self.copy_mongo_data_list = list() # "CZBFinancial", "PABFinancial", "PSBCFinancial", # self.entity_list = ["ABCFinancial", "BOCFinancial", "BOCOMFinancial", "CBHBFinancial", "CCBFinancial", "CEBFinancial", "CGBFinancial", "CIBFinancial", "CMBCFinancial", "CMBFinancial", "EBCLFinancial", "ECITICFinancial", "HXBFinancial", "ICBCFinancial", "SPDBFinancial", "CHINANETFINANCIAL", "JSFIN_CCBDATA"] self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.verify_list = ["ID_", "ENTITY_CODE_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_", "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_", "TIME_LIMIT_", "YIELD_RATE_", "BREAKEVEN_", "START_FUNDS_", "INVEST_PERIOD_", "SALE_START_", "SALE_END_", "RISK_LEVEL_", "REDEMING_MODE_", "PRIVATE_BANK_", "URL_", "DEALTIME_", "DATETIME_", "ENTITY_NAME_", "STATUS_", "SALE_DISTRICT_"] def get_data_from_mongo(self, m_client, collection, entity_code, data_id): """ :param m_client: MongoDB client :param collection: MongoDB collection :param entity_code: :return: data from MongoDB """ m_client.mongo_db = "spider_data" m_client.mongo_entity_code = entity_code try: mongo_data_list = m_client.search_from_mongodb(collection, data_id=data_id) return mongo_data_list except pymongo.errors.ServerSelectionTimeoutError: self.logger.info("连接失败,正在重新连接") sleep(1) mongo_data_list = m_client.search_from_mongodb(collection, data_id=data_id) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None def delete_data_from_mongo(self, m_client, collection, entity_code, remove_id_list): """ :param m_client: MongoDB client :param collection: MongoDB collection :param entity_code: :param remove_id_list: id list to remove :return: delete count """ m_client.mongo_entity_code = entity_code try: remove_count = m_client.remove_from_mongo(collection=collection, remove_id_list=remove_id_list) return remove_count except pymongo.errors.ServerSelectionTimeoutError: mongo_data_list = m_client.remove_from_mongo(collection=collection, remove_id_list=remove_id_list) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None def run(self): # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST") p_client.verify_list = self.verify_list # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection(db=db, collection_list=collection_list) # 创建 MongoDB spider_data_old 数据库对象 # old_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST") # 本地测试 # old_client.client = pymongo.MongoClient(host="localhost", port=27017, # serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False) # old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = old_client.client_to_mongodb() # collection_old = db_old["FINPRODUCT_FINASSIST"] # # 删除表 # p_client.drop_table_phoenix(connection=connection) # # 表创建语句 # sql = ('create table "FINPRODUCT_FINASSIST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,' # '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, ' # '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "C"."STATUS_" varchar,' # '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."TIME_LIMIT_" varchar,' # '"C"."YIELD_RATE_" varchar, "C"."BREAKEVEN_" varchar, "C"."START_FUNDS_" varchar,' # '"C"."INVEST_PERIOD_" varchar, "C"."SALE_DISTRICT_" varchar, "C"."SALE_START_" varchar,' # '"C"."SALE_END_" varchar, "C"."RISK_LEVEL_" varchar, "C"."REDEMING_MODE_" varchar,' # '"C"."PRIVATE_BANK_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."CURRENCY_TYPE_" varchar, "C"."INCREASE_UNIT_" varchar,' # '"C"."YIELD_START_DATE_" varchar, "C"."YIELD_END_DATE_" varchar, "C"."YIELD_TYPE_" varchar,' # '"C"."TARGET_" varchar, "C"."PRODUCT_TYPE_" varchar, "C"."YIELD_STATMENT_" varchar,' # '"C"."INVEST_RANGE_" varchar, "C"."PRE_STOP_" varchar, "C"."RASE_PLAN_" varchar,' # '"C"."PURCHASE_" varchar, "T"."CONTENT_" varchar, "C"."IMAGE_" varchar) IMMUTABLE_ROWS = true') # # # 创建表 # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 增加列 # p_client.add_column_phoenix(connection=connection, column="IMAGE_") for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]: # for entity in self.entity_list: status = False module_name = __import__(entity) self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity)) self.remove_id_list = [] self.copy_mongo_data_list = [] # find_id = "5c3f118f8d7fee068da6ef53" find_id = None try: if entity == "JSFIN_CCBDATA": m_client.mongo_collection = "JSFIN_CCBDATA" collection = m_client.get_check_collection(db=db, collection_list=collection_list) mongo_data_list = module_name.ScriptCCB.get_data_from_mongo(self=self, m_client=m_client, collection=collection, data_id=None) else: m_client.mongo_collection = "FINPRODUCT_FINASSIST" collection = m_client.get_check_collection(db=db, collection_list=collection_list) mongo_data_list = self.get_data_from_mongo(m_client=m_client, collection=collection, entity_code=entity, data_id=find_id) except pymongo.errors.ServerSelectionTimeoutError: sleep(1) if entity == "JSFIN_CCBDATA": m_client.mongo_collection = "JSFIN_CCBDATA" mongo_data_list = module_name.ScriptCCB.get_data_from_mongo(self=self, m_client=m_client, collection=collection, data_id=None) else: m_client.mongo_collection = "FINPRODUCT_FINASSIST" collection = m_client.get_check_collection(db=db, collection_list=collection_list) mongo_data_list = self.get_data_from_mongo(m_client=m_client, collection=collection, entity_code=entity, data_id=find_id) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 self.find_count += mongo_data_list.count() for data in mongo_data_list: data_id = data["_id"] copy_data = {} self.remove_id_list.append(data_id) try: del data["_id"] copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) if entity == "CHINANETFINANCIAL": re_data = module_name.data_shuffle(data=data, sales_status=self.sales_status, produc_category=self.produc_category, revenue_type=self.revenue_type, operaton_pattern=self.operaton_pattern, purchase_amount=self.purchase_amount, duration_type=self.duration_type) elif entity == "JSFIN_CCBDATA": re_data = module_name.ScriptCCB.data_shuffle(self=self, data=data) else: re_data = module_name.data_shuffle(data) if not re_data: self.bad_count += 1 continue except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format(data_id, e)) continue print(data_id) # phoenix_HBase 插入数据 if isinstance(re_data, dict): try: success_count = p_client.upsert_to_phoenix_by_one(connection=connection, data=re_data) once_count += success_count self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(data) self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e)) continue elif isinstance(re_data, list): for r_data in re_data: try: success_count = p_client.upsert_to_phoenix_by_one(connection=connection, data=r_data) once_count += success_count self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(data) self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e)) continue if once_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count)) else: continue break # # 删除数据 # if status: # delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection, # entity_code=entity, # remove_id_list=self.remove_id_list) # self.remove_count += delete_count # # self.logger.info("MongoDB 删除成功") # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # 将数据插入 spider_data_old 中 # if status: # try: # old_client.mongo_db = "spider_data_old" # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except pymongo.errors.ServerSelectionTimeoutError as e: # sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except Exception as e: # self.logger.info(e) # 关闭连接 m_client.client_close() # p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.handlers.clear()
class ScriptCCB(object): def __init__(self): self.logger = Logger().logger self.remove_id_list = list() self.copy_mongo_data_list = list() # 创建 MySQL 对象 __mysql_config = { "host": MYSQL_HOST_25, "port": MYSQL_PORT_25, "database": MYSQL_DATABASE_25, "user": MYSQL_USER_25, "password": MYSQL_PASSWORD_25, "table": MYSQL_TABLE_25 } __mysql_client = MysqlClient(**__mysql_config) __mysql_connection = __mysql_client.client_to_mysql() self.sales_status = __mysql_client.search_area_code( sql= "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'SALES_STATUS\'", connection=__mysql_connection) self.produc_category = __mysql_client.search_area_code( sql= "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'PRODUC_CATEGORY\'", connection=__mysql_connection) self.revenue_type = __mysql_client.search_area_code( sql= "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'REVENUE_TYPE\'", connection=__mysql_connection) self.operaton_pattern = __mysql_client.search_area_code( sql= "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'OPERATION_PATTERN\'", connection=__mysql_connection) self.purchase_amount = __mysql_client.search_area_code( sql= "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'PURCHASE_AMOUNT\'", connection=__mysql_connection) self.duration_type = __mysql_client.search_area_code( sql= "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'DURATION_TYPE\'", connection=__mysql_connection) __mysql_client.close_client(connection=__mysql_connection) self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.verify_list = [ "ID_", "ENTITY_CODE_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_", "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "CODE_", "NAME_", "TIME_LIMIT_", "YIELD_RATE_", "BREAKEVEN_", "START_FUNDS_", "INVEST_PERIOD_", "SALE_START_", "SALE_END_", "RISK_LEVEL_", "REDEMING_MODE_", "PRIVATE_BANK_", "URL_", "DEALTIME_", "DATETIME_", "ENTITY_NAME_", "STATUS_", "SALE_DISTRICT_", "CURRENCY_TYPE_", "INCREASE_UNIT_", "YIELD_START_DATE_", "YIELD_END_DATE_", "YIELD_TYPE_", "TARGET_", "PRODUCT_TYPE_", "YIELD_STATMENT_", "INVEST_RANGE_", "PRE_STOP_", "RASE_PLAN_", "PURCHASE_" ] @staticmethod def get_data_from_mongo(self, m_client, collection, data_id): """ :param m_client: MongoDB client :param collection: MongoDB collection :return: data from MongoDB """ m_client.mongo_db = "spider_data" try: mongo_data_list = m_client.all_from_mongodb(collection, data_id=data_id) return mongo_data_list except pymongo.errors.ServerSelectionTimeoutError: self.logger.info("连接失败,正在重新连接") sleep(1) mongo_data_list = m_client.all_from_mongodb(collection, data_id=data_id) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None @staticmethod def data_shuffle(self, data): if "上海银行" in data["BANK_NAME"]: bank_code = "BankOfShanghai" elif "天津银行" in data["BANK_NAME"]: bank_code = "TJBANK" elif "北京银行" in data["BANK_NAME"]: bank_code = "BOB" elif "中国光大银行股份有限公司" in data["BANK_NAME"]: bank_code = "CEB" else: print(data) re_data = dict() hash_m = hashlib.md5() hash_m.update(data["NAME_"].encode("utf-8")) hash_id = hash_m.hexdigest() re_data["ID_"] = bank_code + "_" + hash_id + "_" + data["SALE_START_"] re_data["ENTITY_CODE_"] = "RONG360FINANCIAL" # re_data["AREA_CODE_"] re_data["BANK_CODE_"] = bank_code re_data["BANK_NAME_"] = data["BANK_NAME"].replace("股份有限公司", "") # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["SALE_START_"].replace("-", "") # re_data["CONTENT_"] re_data["STATUS_"] = "" # re_data["REMARK_"] re_data["CREATE_TIME_"] = data["DATETIME_"] # re_data["UPDATE_TIME_"] re_data["NAME_"] = data["NAME_"] # 售卖时间范围 re_data["TIME_LIMIT_"] = "" # 收益率 re_data["LOW_YIELD_RATE_"] = data["YIELD_RATE_"].replace("%", "") re_data["HIGH_YIELD_RATE_"] = data["YIELD_RATE_"].replace("%", "") # 售卖区域 re_data["SALE_DISTRICT_"] = data["SALE_AREA_"] # 起购金额 data["START_FUNDS_"] = data["START_FUNDS_"].replace("亿", "00000000") data["START_FUNDS_"] = data["START_FUNDS_"].replace("万", "0000") data["START_FUNDS_"] = data["START_FUNDS_"].replace("千", "000") if data["START_FUNDS_"]: if int(data["START_FUNDS_"]) < 50000: match_funds = "5万以下" elif 50000 <= int(data["START_FUNDS_"]) < 100000: match_funds = "5万-10万" elif 100000 <= int(data["START_FUNDS_"]) < 200000: match_funds = "10万-20万" elif 20000 <= int(data["START_FUNDS_"]) < 500000: match_funds = "20万-50万" elif 500000 <= int(data["START_FUNDS_"]): match_funds = "50万以上" else: match_funds = "不限" for i in self.purchase_amount: if match_funds in i["ITEM_LABEL_"]: re_data["START_FUNDS_"] = data["START_FUNDS_"] re_data["START_FUNDS_CODE_"] = i["ITEM_VALUE_"] # 期限 data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace("天", "") if data["INVEST_PERIOD_"]: if int(data["INVEST_PERIOD_"]) <= 30: match_str = "1个月内" elif 30 < int(data["INVEST_PERIOD_"]) <= 90: match_str = "1-3个月(含)" elif 90 < int(data["INVEST_PERIOD_"]) <= 180: match_str = "3-6个月(含)" elif 180 < int(data["INVEST_PERIOD_"]) <= 365: match_str = "6-12个月(含)" elif 365 < int(data["INVEST_PERIOD_"]): match_str = "12个月以上" else: match_str = "不限" for i in self.duration_type: if match_str in i["ITEM_LABEL_"]: re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"] re_data["INVEST_PERIOD_CODE_"] = i["ITEM_VALUE_"] # 开始售卖时间 re_data["SALE_START_"] = data["SALE_START_"] # 结束售卖时间 re_data["SALE_END_"] = data["SALE_END_"] # 风险等级 # re_data["RISK_LEVEL_"] = data["RISK_LEVEL_"] re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] re_data["DATETIME_"] = data["DATETIME_"] re_data["ENTITY_NAME_"] = "融360理财产品" # NEW # 认购货币(类型) re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"] # 递增单位 re_data["INCREASE_UNIT_"] = re.sub(r"元.*", "", data["INCREASE_UNIT_"]) # 收益起记(日期) re_data["YIELD_START_DATE_"] = data["YIELD_START_DATE_"][:10] # 收益结束(日期) re_data["YIELD_END_DATE_"] = data["YIELD_START_DATE_"][-10:] # 收益获取方式 for i in self.revenue_type: if i["ITEM_LABEL_"] == data["YIELD_TYPE_"]: re_data["YIELD_TYPE_"] = data["YIELD_TYPE_"] re_data["YIELD_TYPE_CODE_"] = i["ITEM_VALUE_"] break # 对象(目标人群) re_data["TARGET_"] = data["TARGET_"] # 产品类型 re_data["PRODUCT_TYPE_"] = data["PRODUCT_TYPE_"] # 收益率说明 re_data["YIELD_STATMENT_"] = data["YIELD_STATMENT_"] # 投资范围 re_data["INVEST_RANGE_"] = data["INVEST_RANGE_"] # 提前终止条件 re_data["PRE_STOP_"] = data["PRE_STOP_"] # 募集规划条件 re_data["RASE_PLAN_"] = data["RASE_PLAN_"] # 申购条件 re_data["PURCHASE_"] = data["PURCHASE_"] # 无 # re_data["CODE_"] = data["CODE_"] # 是否保本 # re_data["BREAKEVEN_"] = data["BREAKEVEN_"] # # 可否赎回 # re_data["REDEMING_MODE_"] # # 私人银行 # re_data["PRIVATE_BANK_"] return re_data def run(self): # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST") p_client.verify_list = self.verify_list # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="JSFIN_CCBDATA") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 MongoDB spider_data_old 数据库对象 # old_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST") # 本地测试 # old_client.client = pymongo.MongoClient(host="localhost", port=27017, # serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False) # old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = old_client.client_to_mongodb() # collection_old = db_old["FINPRODUCT_FINASSIST"] # 删除表 # p_client.drop_table_phoenix(connection=connection) # # # 表创建语句 # sql = ('create table "FINPRODUCT_FINASSIST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar, "C"."IMPORTANCE_" varchar,' # '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, "C"."SALE_STATUS_CODE_" varchar,' # '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar,' # '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."TIME_LIMIT_" varchar, "C"."SALE_STATUS_" varchar,' # '"C"."LOW_YIELD_RATE_" varchar, "C"."HIGH_YIELD_RATE_" varchar, "C"."BREAKEVEN_" varchar,' # '"C"."START_FUNDS_" varchar, "C"."START_FUNDS_CODE_" varchar, "C"."INVEST_PERIOD_" varchar,' # '"C"."INVEST_PERIOD_CODE_" varchar, "C"."SALE_DISTRICT_" varchar, "C"."SALE_START_" varchar,' # '"C"."SALE_END_" varchar, "C"."RISK_LEVEL_" varchar, "C"."REDEMING_MODE_" varchar,' # '"C"."PRIVATE_BANK_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."CURRENCY_TYPE_" varchar, "C"."INCREASE_UNIT_" varchar,' # '"C"."YIELD_START_DATE_" varchar, "C"."YIELD_END_DATE_" varchar, "C"."YIELD_TYPE_" varchar,' # '"C"."YIELD_TYPE_CODE_" varchar, "C"."TARGET_" varchar, "C"."PRODUCT_TYPE_" varchar,' # '"C"."INVESTOR_TYPE_" varchar, "C". "INVESTOR_TYPE_CODE_" varchar, "C"."YIELD_STATMENT_" varchar,' # '"C"."OPERA_MODEL_CODE_" varchar, "C"."OPERA_MODEL_" varchar,' # '"C"."INVEST_RANGE_" varchar, "C"."PRE_STOP_" varchar, "C"."RASE_PLAN_" varchar,' # '"C"."PURCHASE_" varchar, "C"."STATUS_" varchar) IMMUTABLE_ROWS = true') # # # 创建表 # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 增加列 # colum_list = ["CURRENCY_TYPE_", "INCREASE_UNIT_", "YIELD_START_DATE_", "YIELD_END_DATE_", "YIELD_TYPE_", # "TARGET_", "PRODUCT_TYPE_", "YIELD_STATMENT_", "INVEST_RANGE_", "PRE_STOP_", "RASE_PLAN_", # "PURCHASE_"] # p_client.add_column_phoenix(connection=connection, column=colum_list) status = False self.logger.info("开始进行 ENTITY_CODE_: RONG360FINANCIAL") self.remove_id_list = [] self.copy_mongo_data_list = [] find_id = None try: mongo_data_list = self.get_data_from_mongo(self=self, m_client=m_client, collection=collection, data_id=find_id) except pymongo.errors.ServerSelectionTimeoutError: sleep(1) mongo_data_list = self.get_data_from_mongo(self=self, m_client=m_client, collection=collection, data_id=find_id) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 self.find_count += mongo_data_list.count() for data in mongo_data_list: data_id = data["_id"] copy_data = {} self.remove_id_list.append(data_id) try: del data["_id"] copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) re_data = self.data_shuffle(self=self, data=data) if not re_data: self.bad_count += 1 continue except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) continue print(data_id) # phoenix_HBase 插入数据 if isinstance(re_data, dict): try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=re_data) once_count += success_count self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e)) continue elif isinstance(re_data, list): for r_data in re_data: try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=r_data) once_count += success_count self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if once_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count)) else: quit() # # 添加 {d:1} # if status: # update_count = m_client.update_to_mongodb(collection=collection, data_id=self.remove_id_list, # data_dict={"d": 1}) # self.remove_count += update_count # self.logger.info("MongoDB 更新成功") # # 删除数据 # if status: # delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection, # entity_code=entity, # remove_id_list=self.remove_id_list) # self.remove_count += delete_count # # self.logger.info("MongoDB 删除成功") # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # 将数据插入 spider_data_old 中 # if status: # try: # old_client.mongo_db = "spider_data_old" # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except pymongo.errors.ServerSelectionTimeoutError as e: # sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except Exception as e: # self.logger.info(e) # 关闭连接 m_client.client_close() # p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.handlers.clear()
class WeiboBasicInfoScript(object): # 初始化参数 def __init__(self, entity_type="WEIBOBASICINFO"): self.entity_type = entity_type self.logger = Logger().logger self.verify_list = [ "ID_", "BANK_CODE_", "BANK_NAME_", "PERIOD_TIME_", "AREA_CODE_", "CREATE_TIME_", "WEIBO_CODE_", "MAIN_URL_", "NAME_", "FOCUS_", "FANS_", "COMPANY_URL_", "COMPANY_", "DETAILED_URL_", "VIRIFIED_", "BIREF_", "ENTITY_NAME_", "ENTITY_CODE_", "DEALTIME_", "PROVINCE_NAME_", "PROVINCE_CODE_", "STATUS_1" ] self.remove_id_list = list() self.copy_mongo_data_list = list() self.branch_code_list = list() self.find_count = 0 self.bad_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 def match_weibo_code(self, match): mongo_client = MongoClient(mongo_collection="WEIBOBASICINFO") db, collection_list = mongo_client.client_to_mongodb() collection = mongo_client.get_check_collection(db, collection_list) result = mongo_client.match_from_mongo(collection=collection, match=match, output="WEIBO_CODE_") return result def data_shuffle(self, data, province_list): re_data = dict() prov_n = "" prov_c = "" # # BANK_NAME_ 字典 # name_dict = {"ICBC": "中国工商银行", "ABC": "中国农业银行", "BOC": "中国银行", "CCB": "中国建设银行", # "BOCOM": "交通银行", "PSBC": "中国邮政储蓄银行", "CZB": "浙商银行", "CBHB": "渤海银行", # "ECITIC": "中信银行", "CEB": "中国光大银行", "HXB": "华夏银行", "CMBC": "中国民生银行", # "CMB": "招商银行", "CIB": "兴业银行", "CGB": "广发银行", "PAB": "平安银行", # "SPDB": "浦发银行", "EBCL": "恒丰银行"} province_list, city_list, area_list, dir_area_list, bank_list = GenericScript.data_from_mysql( ) bank_code = data["BANK_CODE_"][:-9] bank_name = name_dict[bank_code] time_array = time.localtime(int(data["DEALTIME_"])) period_time = time.strftime("%Y%m%d", time_array) for prov in province_list: if prov["NAME_"][:2] in data["LOCATION_"]: prov_n = prov["NAME_"] prov_c = prov["CODE_"] # "C" # TODO row_key 时间戳还是年-月-日 re_data["ID_"] = data["BANK_CODE_"] + "_" + period_time re_data["BANK_CODE_"] = bank_code re_data["BANK_NAME_"] = bank_name re_data["PERIOD_CODE_"] = period_time re_data["AREA_CODE_"] = prov_c re_data["CREATE_TIME_"] = period_time re_data["WEIBO_CODE_"] = data["WEIBO_CODE_"] re_data["MAIN_URL_"] = data["MAIN_URL_"] re_data["NAME_"] = data["NAME_"] re_data["FOCUS_"] = data["FOCUS_"] re_data["FANS_"] = data["FANS_"] re_data["COMPANY_URL_"] = data["COMPANY_URL_"] if "COMPANY_" not in data: re_data["COMPANY_"] = data["VIRIFIED_"] else: re_data["COMPANY_"] = data["COMPANY_"] re_data["DETAILED_URL_"] = data["DETAILED_URL_"] re_data["VIRIFIED_"] = bank_name + "股份有限公司" re_data["BIREF_"] = data["BIREF_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["ENTITY_CODE_"] = data["BANK_CODE_"] re_data["DEALTIME_"] = data["DEALTIME_"] re_data["PROVINCE_NAME_"] = prov_n re_data["PROVINCE_CODE_"] = prov_c re_data["STATUS_"] = "" return re_data # 从 MongoDB 删除数据 def delete_data_from_mongo(self, m_client, collection, remove_id_list): m_client.mongo_entity_code = None try: remove_count = m_client.remove_from_mongo( collection=collection, remove_id_list=remove_id_list) return remove_count except pymongo.errors.ServerSelectionTimeoutError: mongo_data_list = m_client.remove_from_mongo( collection=collection, remove_id_list=remove_id_list) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None def run(self): # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="WEIBOBASICINFO") p_client.verify_list = self.verify_list # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="WEIBOBASICINFO") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # # 创建 MongoDB spider_data_old 数据库对象 # old_client = MongoClient(mongo_collection="WEIBOBASICINFO") # # 本地测试 # old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60, # connectTimeoutMS=60, connect=False) # old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = old_client.client_to_mongodb() # collection_old = db_old["ORGANIZE_FINASSIST"] # 获取地区编码 province_list, city_list, area_list, dir_area_list = (GenericScript( entity_code=None, entity_type=None).area_from_mysql()) # 删除表 p_client.drop_table_phoenix(connection=connection) # quit() # 创建表 sql = ( 'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,' '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,' '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,' '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,' '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,' '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,' '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,' '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true') p_client.create_new_table_phoenix(connection=connection, sql=sql) # 增加列 # p_client.add_column_phoenix(connection=connection, column="IMAGE_") # 遍历 ENTITY_CODE_ 列表 status = False self.logger.info("开始进行 WEIBOBASICINFO") try: mongo_data_list = m_client.all_from_mongodb(collection=collection) except pymongo.errors.ServerSelectionTimeoutError: time.sleep(1) mongo_data_list = m_client.all_from_mongodb(collection=collection) # 清洗数据并插入 HBase if mongo_data_list: self.find_count = mongo_data_list.count() for data in mongo_data_list: re_data = "" data_id = data["_id"] copy_data = {} self.remove_id_list.append(data_id) try: del data["_id"] copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) re_data = self.data_shuffle(data=data, province_list=province_list) if not re_data: self.bad_count += 1 continue except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) # phoenix_HBase 插入数据 try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=re_data) self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if self.success_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format( self.success_count)) else: quit() # # 删除数据 # if status: # delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection, # remove_id_list=self.remove_id_list) # self.remove_count += delete_count # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # # 将数据插入 spider_data_old 中 # if status: # try: # old_client.mongo_db = "spider_data_old" # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # except pymongo.errors.ServerSelectionTimeoutError as e: # time.sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except Exception as e: # self.logger.info(e) # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class AllToPhoenix(object): def __init__(self): self.file_list = list() self.get_code_list() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.copy_mongo_data_list = list() self.remove_id_list = list() # 字段验证列表 self.verify_list = [ "ID_", "CONTENT_", "NOTICE_TIME_", "TITLE_", "PROJECT_NAME_", "BID_CONTENT_", "SIGN_START_TIME_", "SIGN_END_TIME_", "OPEN_BID_TIME_", "OPEN_BID_PLACE_", "BID_AGENCY_", "APPLY_CONDITION_", "SIGN_QUALIFICATION_", "PROJECT_ID_", "WIN_CANDIDATE_", "CANDIDATE_RANK_", "BID_", "URL_", "DEALTIME_", "CREATE_TIME_", "ENTITY_NAME_", "ENTITY_CODE_", "ENTITY_STATUS_", "SIGN_MATERIAL_", "BID_TYPE_", "DATETIME_", "BUDGET_PRICE_", "PASS_REASON_", "PRESALE_CONTENT_", "PRESALE_WAY_", "PRESALE_START_TIME_", "PRESALE_END_TIME_", "PRESALE_ADDR_", "PRESALE_PREPARE_", "IMAGE_" ] def get_code_list(self): """ 获取当前目录下文件名(去除 "CommonBidding_" 后就是 ENTITY_CODE_ ) :return: """ for root, dirs, files in os.walk(curPath): # print(root) # 当前目录路径 # print(dirs) # 当前路径下所有子目录 # print(files) # 当前路径下所有非目录子文件 self.file_list = files self.file_list.remove("__init_____.py") break def get_data_from_mongo(self, m_client, collection, entity_code): """ :param m_client: MongoDB client :param collection: MongoDB collection :param entity_code: :return: all from MongoDB where ENTITY_CODE_ = entity_code """ m_client.mongo_db = "spider_data" other_query = { "$or": [{ "TITLE_": { "$exists": True } }, { "title": { "$exists": True } }] } try: mongo_data_list = m_client.get_data_from_mongodb( collection=collection, entity_code=entity_code, exclude_code=None, limit_number=None, other_query=other_query) return mongo_data_list except pymongo.errors.ServerSelectionTimeoutError: self.logger.info("连接失败,正在重新连接") time.sleep(1) mongo_data_list = m_client.get_data_from_mongodb( collection=collection, entity_code=entity_code, exclude_code=None, limit_number=None, other_query=other_query) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None def delete_data_from_mongo(self, m_client, collection, entity_code, remove_id_list): """ :param m_client: MongoDB client :param collection: MongoDB collection :param entity_code: :param remove_id_list: delete data id's list :return: delete count """ m_client.mongo_entity_code = entity_code try: remove_count = m_client.remove_from_mongo( collection=collection, remove_id_list=remove_id_list) return remove_count except pymongo.errors.ServerSelectionTimeoutError: mongo_data_list = m_client.remove_from_mongo( collection=collection, remove_id_list=remove_id_list) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None def shuffle_data(self, data): """ 通用清洗 :param data: :return: """ re_data = dict() if "TITLE_" in data: if not data["TITLE_"]: return hash_m = hashlib.md5() hash_m.update(str(data["TITLE_"]).encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) elif "title" in data: if not data["title"]: return hash_m = hashlib.md5() hash_m.update(data["title"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["entity_code"]) + "_" + str(hash_title) else: return re_data["ID_"] = row_key for key, value in data.items(): # 字段验证 if key in self.verify_list: re_data[key] = value elif key == "entityStatus" or key == "ENTITY_STATUS_": key = "ENTITY_STATUS_" value = "DRAFT" re_data[key] = value if "ENTITY_STATUS_" not in re_data: re_data["ENTITY_STATUS_"] = "DRAFT" for s_key in self.verify_list: if s_key == "CONTENT_" or s_key == "URL_" or s_key == "url": continue if data.get(s_key, ""): data[s_key] = data[s_key].replace("|", "") return re_data def run(self): # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="CommonBidding") # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="CommonBidding") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 MongoDB spider_data_old 数据库对象 old_client = MongoClient(mongo_collection="CommonBidding") # 本地测试 # old_client.client = pymongo.MongoClient(host="localhost", port=27017, # serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False) old_client.mongo_db = "spider_data_old" db_old, collection_list_old = old_client.client_to_mongodb() collection_old = db_old["CommonBidding"] # 删除表 # p_client.drop_table_phoenix(connection=connection) # 招投标表创建语句 # sql = ('create table "CommonBidding" ("ID_" varchar primary key, "F"."CONTENT_" varchar,' # '"F"."NOTICE_TIME_" varchar,"F"."TITLE_" varchar,"F"."PROJECT_NAME_" varchar,' # '"F"."BID_CONTENT_" varchar, "F"."SIGN_START_TIME_" varchar, "F"."SIGN_END_TIME_" varchar,' # '"F"."OPEN_BID_TIME_" varchar, "F"."OPEN_BID_PLACE_" varchar, "F"."BID_AGENCY_" varchar,' # '"F"."APPLY_CONDITION_" varchar, "F"."SIGN_QUALIFICATION_" varchar, "F"."PROJECT_ID_" varchar,' # '"F"."WIN_CANDIDATE_" varchar, "F"."CANDIDATE_RANK_" varchar, "F"."BID_" varchar,"F"."URL_" varchar,' # '"F"."DEALTIME_" varchar, "F"."ENTITY_NAME_" varchar, "F"."ENTITY_CODE_" varchar,' # '"F"."ENTITY_STATUS_" varchar, "F"."SIGN_MATERIAL_" varchar, "F"."BID_TYPE_" varchar,' # '"F"."DATETIME_" varchar, "F"."BUDGET_PRICE_" varchar, "F"."PASS_REASON_" varchar,' # '"F"."PRESALE_CONTENT_" varchar, "F"."PRESALE_WAY_" varchar,"F"."PRESALE_START_TIME_" varchar,' # '"F"."PRESALE_END_TIME_" varchar,"F"."PRESALE_ADDR_" varchar,"F"."PRESALE_PREPARE_" varchar,' # '"F"."IMAGE_" varchar) IMMUTABLE_ROWS = true') # 创建表 # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 增加列 # p_client.add_column_phoenix(connection=connection, column="IMAGE_") # 遍历 ENTITY_CODE_ 列表 # self.file_list = ["CommonBidding_86JCW"] for f in self.file_list: status = False entity_code = f.replace(".py", "") module_name = __import__(entity_code) entity_code_mongo = entity_code.replace("CommonBidding_", "") self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code_mongo)) self.remove_id_list = [] self.copy_mongo_data_list = [] try: mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code_mongo) except pymongo.errors.ServerSelectionTimeoutError: time.sleep(1) mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code_mongo) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 try: self.find_count += mongo_data_list.count() except pymongo.errors.ServerSelectionTimeoutError: time.sleep(1) self.find_count += mongo_data_list.count() for data in mongo_data_list: data_id = data["_id"] self.remove_id_list.append(data_id) del data["_id"] # 深拷贝源数据,用于插入 spider_data 库中 copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) # 数据清洗 try: re_data = module_name.data_shuffle(data) final_data = self.shuffle_data(re_data) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) continue # phoenix_HBase 插入数据 if final_data: try: p_client.upsert_to_phoenix_by_one( connection=connection, data=final_data) once_count += 1 except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if once_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count)) # 删除数据 if status: delete_count = self.delete_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code_mongo, remove_id_list=self.remove_id_list) self.remove_count += delete_count else: self.logger.info("HBase 插入成功条数0条, 不执行删除") # 将数据插入 spider_data_old 中 if status: try: old_client.mongo_db = "spider_data_old" insert_count = old_client.all_to_mongodb( collection=collection_old, insert_list=self.copy_mongo_data_list) self.old_count += insert_count # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) except pymongo.errors.ServerSelectionTimeoutError as e: time.sleep(1) self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) insert_count = old_client.all_to_mongodb( collection=collection_old, insert_list=self.copy_mongo_data_list) self.old_count += insert_count # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) except Exception as e: self.logger.info(e) # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(p_client.count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.handlers.clear()
def get_data_from_mongodb(self, collection, entity_code=None, exclude_code=None, limit_number=None, data_id=None, find_query=None): """ 从 MongoDB 获取数据 :param collection: :param entity_code: 需要查取的 实体编码 :param exclude_code: 需要排除的 实体编码 :param limit_number: 查取的数据条数 :param data_id: 查取 data_id 后的数据 :return: """ mon_logger = Logger().logger query_list = list() if isinstance(find_query, dict): query_list.append(find_query) elif isinstance(find_query, list): query_list.extend(find_query) if isinstance(entity_code, str): query_list.append({"ENTITY_CODE_": entity_code}) elif isinstance(entity_code, (list, tuple)): query_list.append({"ENTITY_CODE_": {"$in": list(entity_code)}}) if isinstance(exclude_code, str): query_list.append({"ENTITY_CODE_": {"$ne": exclude_code}}) elif isinstance(exclude_code, (list, tuple)): query_list.append({"ENTITY_CODE_": {"$nin": list(exclude_code)}}) if data_id: find_id = ObjectId(data_id) query_list.append({"_id": {"$gte": find_id}}) if query_list: query = {"$and": query_list} else: query = {} try: mon_logger.info("MongoDB 开始查取数据") result_one = collection.find_one(query) if result_one: if limit_number: result = collection.find(query, no_cursor_timeout=True).limit( int(limit_number)) if entity_code: mon_logger.info( f"ENTITY: {entity_code} 数据查取成功共 {limit_number} 条") else: mon_logger.info("数据查取成功共 {}条".format(limit_number)) else: result = collection.find(query, no_cursor_timeout=True) if entity_code: mon_logger.info( f"ENTITY: {entity_code} 数据查取成功共 {result.count()}条") else: mon_logger.info("数据查取成功共 {}条".format(result.count())) return result else: if entity_code: mon_logger.info("ENTITY: {} 数据查取为空".format( self.mongo_entity_code)) else: mon_logger.info("数据查取为空".format(self.mongo_entity_code)) return None except TypeError as e: mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format( e, self.mongo_entity_code)) except pymongo.errors.ServerSelectionTimeoutError as e: mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e)) result_one = collection.find_one(query) if result_one: if limit_number: result = collection.find(query, no_cursor_timeout=True).limit( int(limit_number)) if entity_code: mon_logger.info( f"ENTITY: {entity_code} 数据查取成功共 {limit_number} 条") else: mon_logger.info("数据查取成功共 {}条".format(limit_number)) else: result = collection.find(query, no_cursor_timeout=True) if entity_code: mon_logger.info( f"ENTITY: {entity_code} 数据查取成功共 {result.count()}条") else: mon_logger.info("数据查取成功共 {}条".format(result.count())) return result else: if entity_code: mon_logger.info("ENTITY: {} 数据查取为空".format( self.mongo_entity_code)) else: mon_logger.info("数据查取为空".format(self.mongo_entity_code)) return None
class AllToPhoenix(object): def __init__(self): self.code_list = [ "ABCORGANIZE", "BOCOMORGANIZE", "BOCORGANIZE", "CBHBORGANIZE", "CCBORGANIZE", "CEBORGANIZE", "CGBORGANIZE", "CIBORGANIZE", "CMBCORGANIZE", "CMBORGANIZE", "CZBORGANIZE", "EBCLORGANIZE", "ECITICORGANIZE", "HXBORGANIZE", "ICBCORGANIZE", "PABORGANIZE", "PSBCORGANIZE", "SPDBORGANIZE" ] self.logger = Logger().logger self.count = 0 # 从 MongoDB 获取数据 def get_data_from_mongo(self, m_client, collection, entity_code): m_client.mongo_entity_code = entity_code try: mongo_data_list = m_client.search_from_mongodb(collection) return mongo_data_list except pymongo.errors.ServerSelectionTimeoutError: self.logger.info("连接失败,正在重新连接") sleep(1) mongo_data_list = m_client.search_from_mongodb(collection) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) # print(e) return None # 主函数 def run(self): # 创建 MongoDB 对象 m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # 遍历 ENTITY_CODE_ 列表 # self.code_list = self.code_list[14:] for entity_code in self.code_list: self.count = 0 hash_list = list() status = False mongo_data_list = self.get_data_from_mongo(m_client=m_client, collection=collection, entity_code=entity_code) if mongo_data_list: self.logger.warning("{} 查取成功".format(entity_code)) self.logger.warning("当前共有{}条".format(mongo_data_list.count())) status = True else: self.logger.warning("{} 无数据".format(entity_code)) if status: for data in mongo_data_list: if "ADDR_" in data: hash_m = hashlib.md5() hash_m.update(data["ADDR_"].encode("utf-8")) hash_title = hash_m.hexdigest() if hash_title in hash_list: self.count += 1 else: hash_list.append(hash_title) else: if "CONTENT_" in data: hash_m = hashlib.md5() hash_m.update(data["CONTENT_"].encode("utf-8")) hash_title = hash_m.hexdigest() if hash_title in hash_list: self.count += 1 else: hash_list.append(hash_title) self.logger.warning("重复数据{}条".format(self.count)) # 关闭连接 m_client.client_close() self.logger.handlers.clear()
class JsInsuranceCcbData(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="JSINSURANCE_CCBDATA") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 MySQL 对象 __mysql_config = { "host": MYSQL_HOST_25, "port": MYSQL_PORT_25, "database": MYSQL_DATABASE_25, "user": MYSQL_USER_25, "password": MYSQL_PASSWORD_25, "table": MYSQL_TABLE_25 } __mysql_client = MysqlClient(**__mysql_config) __mysql_connection = __mysql_client.client_to_mysql() self.type = __mysql_client.search_area_code( sql= "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'TYPE\'", connection=__mysql_connection) __mysql_client.close_client(connection=__mysql_connection) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="INSURANCE") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" self.a = list() def data_shuffle(self, data): if data["ENTITY_CODE_"] == "PAINSURANCE": return None elif data["ENTITY_CODE_"] == "BJBINSURANCE": data["CONTET_"] = data["CONTET_"].replace("|主险2:", "主险2:") first_shuffle = data["CONTET_"].split("|") data_list = list() company_dict = dict() index_list = list() for first in first_shuffle: if first[-2:] == "公司": company_index = first_shuffle.index(first) company_dict[company_index] = first index_list.append(company_index) else: continue for key in index_list: # print(index_list) j = key + 1 for i in range(100): if index_list.index(key) == len(index_list) - 1: if j == len(first_shuffle) - 1: break else: if j == index_list[index_list.index(key) + 1]: break data_dict = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(first_shuffle[j].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" data_dict["ID_"] = row_key data_dict["ENTITY_CODE_"] = data["ENTITY_CODE_"] data_dict["ENTITY_NAME_"] = data["ENTITY_NAME_"].replace( "模板", "产品") data_dict["BANK_CODE_"] = "BJB" data_dict["BANK_NAME_"] = "北京银行" data_dict["PERIOD_CODE_"] = data["DATETIME_"][:10].replace( "-", "") data_dict["URL_"] = data["URL_"] data_dict["PRODUCT_NAME_"] = first_shuffle[j] j += 1 # data_dict["TYPE_"] = first_shuffle[j] data_dict["TYPE_"] = "" data_dict["TYPE_CODE_"] = "" for i in self.type: if i["ITEM_LABEL_"][:-1] in first_shuffle[j]: data_dict["TYPE_"] = data_dict["TYPE_"] + i[ "ITEM_LABEL_"] + "|" data_dict["TYPE_CODE_"] = data_dict[ "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|" data_dict["TYPE_"] = data_dict["TYPE_"][:-1] data_dict["TYPE_CODE_"] = data_dict["TYPE_CODE_"][:-1] j += 1 # data_dict["RISK_LEVEL_"] = first_shuffle[j] j += 1 data_dict["PAY_METHOD_"] = first_shuffle[j] j += 1 # data_dict["INSURANCE_DATE_"] = first_shuffle[j] j += 1 # data_dict["TOUZIZHE_TYPE_"] = first_shuffle[j] j += 1 data_dict["COM_NAME_"] = company_dict[key] # data_dict["CONSIGNMENT_"] = "代销" # if "CONTENT_" in data: # data_dict["CONTENT_"] = data["CONTENT_"] data_dict["DEALTIME_"] = data["DEALTIME_"] data_dict["CREATE_TIME_"] = data["DATETIME_"] data_dict["STATUS_"] = "1" # print(data_dict) data_list.append(data_dict) return data_list elif data["ENTITY_CODE_"] == "CIBINSURANCE": data_list = list() insurance_name = re.findall(r".*?计划", data["PRODUCT_NAME_"]) for name in insurance_name: re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(name.encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" re_data["ID_"] = row_key re_data["PRODUCT_NAME_"] = name re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["BANK_CODE_"] = "CIB" re_data["BANK_NAME_"] = "兴业银行" re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace( "-", "") re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] re_data["CREATE_TIME_"] = data["DATETIME_"] re_data["STATUS_"] = "1" re_data["TYPE_"] = "" re_data["TYPE_CODE_"] = "" for i in self.type: if i["ITEM_LABEL_"][:-1] in name: re_data["TYPE_"] = re_data["TYPE_"] + i[ "ITEM_LABEL_"] + "|" re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"] + i[ "ITEM_VALUE_"] + "|" re_data["TYPE_"] = re_data["TYPE_"][:-1] re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1] data_list.append(re_data) return data_list else: if "INSURANCE_NAME_" not in data and ("PRODUCT_NAME_" not in data): return None else: if "INSURANCE_NAME_" in data: # # 承保年龄 # if ("INSURANCE_AGE_" not in data) or (not data["INSURANCE_AGE_"]): # age = re.findall(r"(\d*)周岁", data["INSURANCE_NAME_"]) # if age: # data["INSURANCE_AGE_"] = age[0] # 保障期限 # if ("INSURANCE_DATE_" not in data) or (not data["INSURANCE_DATE_"]): # limit = re.findall(r"保(终身)|保(\d*年)|(\d*年)期", data["INSURANCE_NAME_"]) # if limit: # for l in limit[0]: # if l: # data["INSURANCE_DATE_"] = l # break re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["INSURANCE_NAME_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace( "INSURANCE", "") re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace( "保险产品", "") if "INSURANCE_NAME_" in data: re_data["PRODUCT_NAME_"] = data["INSURANCE_NAME_"] if ("INSURANCE_AGE_" in data) or ("AGE_" in data): re_data["AGE_"] = data["INSURANCE_AGE_"] if "TYPE_" in data: re_data["TYPE_"] = "" re_data["TYPE_CODE_"] = "" if data["TYPE_"] == "财险": re_data["TYPE_"] = "财产险" re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE" else: for i in self.type: if i["ITEM_LABEL_"][:-1] in data["TYPE_"]: re_data["TYPE_"] = re_data["TYPE_"] + i[ "ITEM_LABEL_"] + "|" re_data["TYPE_CODE_"] = re_data[ "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|" re_data["TYPE_"] = re_data["TYPE_"][:-1] re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1] else: re_data["TYPE_"] = "" re_data["TYPE_CODE_"] = "" for i in self.type: if i["ITEM_LABEL_"][:-1] in data["ENTITY_NAME_"]: re_data["TYPE_"] = re_data["TYPE_"] + i[ "ITEM_LABEL_"] + "|" re_data["TYPE_CODE_"] = re_data[ "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|" re_data["TYPE_"] = re_data["TYPE_"][:-1] re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1] # if "INSURANCE_DATE_" in data: # re_data["INSURANCE_DATE_"] = data["INSURANCE_DATE_"] if "INSURANCE_DETAIL_" in data: re_data["PRODUCT_DETAIL_"] = data["INSURANCE_DETAIL_"] if "COMPANY_NAME_" in data: re_data["COM_NAME_"] = data["COMPANY_NAME_"] if "LIMIT_NUMBER_" in data: re_data["BUY_LIMIT_"] = data["LIMIT_NUMBER_"] # re_data["AREA_CODE_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace( "-", "") if "CONTENT_" in data: re_data["CONTENT_"] = data["CONTENT_"] # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"] re_data["STATUS_"] = "1" # re_data["REMARK_"] = "" re_data["CREATE_TIME_"] = data["DATETIME_"] # re_data["UPDATE_TIME_"] # re_data["TITLE_"] = data["TITLE_"] re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] # re_data["DATETIME_"] = data["DATETIME_"] return re_data elif "PRODUCT_NAME_" in data: re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["PRODUCT_NAME_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] # re_data["BANK_CODE_"] = data["ENTITY_CODE_"] # re_data["BANK_NAME_"] = data["ENTITY_NAME_"] if "PRODUCT_NAME_" in data: re_data["PRODUCT_NAME_"] = data["PRODUCT_NAME_"] if "FEATURE_NAME_" in data: re_data["FEATURE_NAME_"] = data["FEATURE_NAME_"] if "TYPE_" in data: re_data["TYPE_"] = "" re_data["TYPE_CODE_"] = "" if data["TYPE_"] == "财险": re_data["TYPE_"] = "财产险" re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE" elif data["TYPE_"] == "100种疾病保障": re_data["TYPE_"] = "健康险" re_data["TYPE_CODE_"] = "HEALTH_INSURANCE" else: for i in self.type: if i["ITEM_LABEL_"][:-1] in data["TYPE_"]: re_data["TYPE_"] = re_data["TYPE_"] + i[ "ITEM_LABEL_"] + "|" re_data["TYPE_CODE_"] = re_data[ "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|" re_data["TYPE_"] = re_data["TYPE_"][:-1] re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1] if "POLICY_DUTY_" in data: re_data["POLICY_DUTY_"] = data["POLICY_DUTY_"] if "PRODUCT_CASE_" in data: re_data["PRODUCT_CASE_"] = data["PRODUCT_CASE_"] if "BUY_LIMIT_" in data: re_data["BUY_LIMIT_"] = data["BUY_LIMIT_"] if "ENSURE_PRICE_" in data: re_data["ENSURE_PRICE_"] = data["ENSURE_PRICE_"] # re_data["AREA_CODE_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace( "-", "") if "PRODUCT_PRICE_" in data: re_data["PRODUCT_PRICE_"] = data["PRODUCT_PRICE_"] if "PRODUCT_ID_" in data: re_data["PRODUCT_ID_"] = data["PRODUCT_ID_"] if "PRODUCT_CLAUSE_" in data: re_data["PRODUCT_CLAUSE_"] = data["PRODUCT_CLAUSE_"] if "GENDER_" in data: re_data["GENDER_"] = data["GENDER_"] if "AGE_" in data: re_data["AGE_"] = data["AGE_"] if "COM_NAME_" in data: re_data["COM_NAME_"] = data["COM_NAME_"] if "PAY_METHOD_" in data: re_data["PAY_METHOD_"] = data["PAY_METHOD_"] if "PROBLEM_" in data: re_data["PROBLEM_"] = data["PROBLEM_"] if "CLAIM_" in data: re_data["CLAIM_"] = data["CLAIM_"] if "COMMENT_" in data: re_data["COMMENT_"] = data["COMMENT_"] if "ENSURE_CONTENT_" in data: re_data["ENSURE_CONTENT_"] = data["ENSURE_CONTENT_"] if "INSURE_INFO_" in data: re_data["INSURE_INFO_"] = data["INSURE_INFO_"] if "RATE_INFO_" in data: re_data["RATE_INFO_"] = data["RATE_INFO_"] if "SALE_SERVICE_" in data: re_data["SALE_SERVICE_"] = data["SALE_SERVICE_"] # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"] re_data["STATUS_"] = "1" # re_data["REMARK_"] = "" re_data["CREATE_TIME_"] = data["DATETIME_"] # re_data["UPDATE_TIME_"] # re_data["TITLE_"] = data["TITLE_"] re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] # re_data["DATETIME_"] = data["DATETIME_"] return re_data def run(self): # # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # # create table sql # table_sql = ('create table "INSURANCE" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,' # ' "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, ' # '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,' # '"C"."TYPE_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C".PRODUCT_CLAUSE_ varchar,' # '"C"."SOURCE_" varchar, "C"."PRODUCT_NAME_" varchar, "C"."FEATURE_NAME_" varchar,' # '"C"."POLICY_DUTY_" varchar, "C"."PRODUCT_CASE_" varchar, "C"."BUY_LIMIT_" varchar,' # '"C"."ENSURE_PRICE_" varchar, "C"."PRODUCT_PRICE_" varchar, "C"."PRODUCT_ID_" varchar,' # '"C"."GENDER_" varchar, "C"."AGE_" varchar, "C"."COM_NAME_" varchar, "C"."TYPE_CODE_" varchar,' # '"C"."PAY_METHOD_" varchar, "C"."PRODUCT_DETAIL_" varchar, "C"."PROBLEM_" varchar,' # '"C"."CLAIM_" varchar, "C"."COMMENT_" varchar, "C"."STATUS_" varchar,' # '"C"."ENSURE_CONTENT_" varchar, "C"."INSURE_INFO_" varchar, "C"."RATE_INFO_" varchar,' # '"C"."SALE_SERVICE_" varchar) IMMUTABLE_ROWS = true') # # # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection) for i in range(mongo_data_list.count() + 100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) # print(data["_id"]) # todo remove and upsert data from mongo # shuffle data try: re_data = self.data_shuffle(data=data) except Exception as e: self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: if isinstance(re_data, dict): # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue elif isinstance(re_data, list): for r_d in re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=r_d) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # # add {d:1} # try: # self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, # data_dict={"d": 1}) # self.remove_count += 1 # if self.remove_count % 10 == 0: # self.logger.info("MongoDB 更新成功, 成功条数 {}".format(self.remove_count)) # except Exception as e: # self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e)) # continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
def all_from_mongodb(self, collection, data_id=None, d=False): mon_logger = Logger().logger if data_id: find_id = ObjectId(data_id) try: mon_logger.info("开始查取数据") # result = collection.find_one({"_id": {"$gte": find_id}}) result = collection.find_one({ "$and": [{ "_id": { "$gte": find_id } }, { "ORDER_ID": { "$exists": False } }] }) if result is not None: result = collection.find( { "$and": [{ "_id": { "$gte": find_id } }, { "ORDER_ID": { "$exists": False } }] }, no_cursor_timeout=True) mon_logger.info("数据查取成功, 共 {} 条".format(result.count())) return result else: mon_logger.info("MongoDB 查取数据为空") return None except TypeError as e: mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format( e, self.mongo_entity_code)) except pymongo.errors.ServerSelectionTimeoutError as e: mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e)) result = collection.find_one({ "$and": [{ "_id": { "$gte": find_id } }, { "ORDER_ID": { "$exists": False } }] }) if result: result = collection.find( { "$and": [{ "_id": { "$gte": find_id } }, { "ORDER_ID": { "$exists": False } }] }, no_cursor_timeout=True) mon_logger.info("数据查取成功, 共 {} 条".format(result.count())) return result else: mon_logger.info("MongoDB 查取数据为空") return None else: if d: try: mon_logger.info("开始查取数据") result = collection.find_one() if result is not None: result = collection.find(no_cursor_timeout=True) mon_logger.info("数据查取成功, 共 {} 条".format( result.count())) return result else: mon_logger.info("MongoDB 查取数据为空") return None except TypeError as e: mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format( e, self.mongo_entity_code)) except pymongo.errors.ServerSelectionTimeoutError as e: mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e)) result = collection.find_one() if result: result = collection.find(no_cursor_timeout=True) mon_logger.info("数据查取成功, 共 {} 条".format( result.count())) return result else: mon_logger.info("MongoDB 查取数据为空") return None else: try: mon_logger.info("开始查取数据") result = collection.find_one({"d": {"$exists": False}}) if result is not None: result = collection.find({"d": { "$exists": False }}, no_cursor_timeout=True) mon_logger.info("数据查取成功, 共 {} 条".format( result.count())) return result else: mon_logger.info("MongoDB 查取数据为空") return None except TypeError as e: mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format( e, self.mongo_entity_code)) except pymongo.errors.ServerSelectionTimeoutError as e: mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e)) result = collection.find_one() if result: result = collection.find(no_cursor_timeout=True) mon_logger.info("数据查取成功, 共 {} 条".format( result.count())) return result else: mon_logger.info("MongoDB 查取数据为空") return None
class DataShuffle(object): def __init__(self, param): self.logger = Logger().logger self.invoke_type = "BRANCH" self.base_dir = os.path.dirname(os.getcwd()) self.param_dict = eval(param) self.param = "\"" + param + "\"" if self.param_dict: try: self.entity_type = self.param_dict["entityType"] except Exception: raise Exception def invoking_the_script(self): """ 调用脚本 :return: """ if self.invoke_type == "BRANCH": # 微博和微信 if self.entity_type == "WEIBOINFO" or self.entity_type == "WEIBOBASICINFO" or self.entity_type == "WECHAT": file_path = "/".join([ "./branch_scripts2", self.entity_type, f"{self.entity_type}.py" ]) # 金融产品 elif self.entity_type[:4] == "JRCP": file_path = "/".join([ "./branch_scripts2", "FIN_PRODUCT", self.entity_type, "__init_____.py" ]) # 网点 elif self.entity_type[:2] == "WD": file_path = "/".join([ "./branch_scripts2", "SURROUNDING_FACILITIES", self.entity_type, "__init_____.py" ]) # 招投标 elif self.entity_type == "COMMONBIDDING" or self.entity_type == "commonbidding" or self.entity_type == "CommonBidding": file_path = "/".join( ["./scripts", "CommonBidding", "__init_____.py"]) # 招投标 elif self.entity_type == "ORGANIZE": file_path = "/".join( ["./scripts", "CommonBidding", "__init_____.py"]) else: file_path = "/".join( ["./crm.scripts", self.entity_type, "__init_____.py"]) else: if self.entity_type == "WEIBOINFO" or self.entity_type == "WEIBOBASICINFO" or self.entity_type == "WECHAT": file_path = "/".join([ "./branch_scripts2", self.entity_type, f"{self.entity_type}.py" ]) else: file_path = "/".join( ["./branch_scripts2", self.entity_type, "__init_____.py"]) self.logger.info("调用的脚本文件为{}".format(file_path)) os.system(" ".join(["python ", file_path, self.param])) def main(self): self.invoking_the_script()
def remove_from_mysql(self, connection, list_remove): mysql_logger = Logger().logger sql = "DELETE FROM {} WHERE ID_ IN {}".format(self.mysql_table, list_remove) count = self.cs_commit(connection=connection, sql=sql) mysql_logger.info("删除成功 {}条".format(count))
def get_data_and_update(self, collection, entity_code, exclude_code, update_dict, data_id=None, other_query=None, sort_query=None): """ 查询一条数据并更新 :param collection: :param entity_code: 需要查取的 实体编码 :param exclude_code: 需要排除的 实体编码 :param update_dict: 需要更新的字段与值 :param data_id: 查取 data_id 后的数据 :param other_query: 其他过滤条件 :return: """ mon_logger = Logger().logger query_list = list() if isinstance(entity_code, str): query_list.append({"ENTITY_CODE_": entity_code}) elif isinstance(entity_code, (list, tuple)): query_list.append({"ENTITY_CODE_": {"$in": list(entity_code)}}) if isinstance(exclude_code, str): query_list.append({"ENTITY_CODE_": {"$ne": exclude_code}}) elif isinstance(exclude_code, (list, tuple)): query_list.append({"ENTITY_CODE_": {"$nin": list(exclude_code)}}) if isinstance(other_query, dict): query_list.append(other_query) elif isinstance(other_query, list): query_list.extend(other_query) if data_id: find_id = ObjectId(data_id) query_list.append({"_id": {"$gte": find_id}}) if query_list: query = {"$and": query_list} else: query = {} try: mon_logger.info(f"query={query}") result_one = collection.find_one_and_update(query, update_dict, sort=sort_query) if result_one: mon_logger.info(f"MongoDB--数据查取并更新成功") return result_one else: mon_logger.info("MongoDB 数据查取为空".format( self.mongo_entity_code)) return None except TypeError as e: mon_logger.error("MongoDB数据查取失败,错误信息为{}, 请检查 {}".format( e, self.mongo_entity_code)) except pymongo.errors.ServerSelectionTimeoutError as e: mon_logger.info("MongoDB 连接超时 {}, 正在重新连接...".format(e)) result_one = collection.find_one_and_update(query, update_dict) if result_one: mon_logger.info("MongoDB--数据查取并更新成功") return result_one else: mon_logger.info("MongoDB 数据查取为空".format( self.mongo_entity_code)) return None
class WechatScript(object): def __init__(self, entity_type="FOR_TEST_WECHAT"): """ 初始化参数 :param entity_type: WECHAT """ self.entity_type = entity_type self.logger = Logger().logger # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name=self.entity_type) # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.remove_id_list = list() self.copy_mongo_data_list = list() self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" self.row_key_count = 0 # def write_to_local(self, data): # with open("wechat_test.txt", "a", encoding="utf-8") as f: # f.write(str(data)) def read_from_local(self): with open("wechat_test.txt", "r", encoding="utf-8") as f: data = f.read() return data def main(self, data): """ :return: """ # # 删除表 # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # 建表语句 # table_sql = ('create table "FOR_TEST_WECHAT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar,' # '"C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar,' # '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, ' # '"T"."CONTENT_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."TITLE_" varchar,' # '"C"."WECHAT_ID_" varchar, "C"."WECHAT_NAME_" varchar, "C"."ENTITY_NAME_" varchar,' # '"C"."DEALTIME_" varchar,"C"."STATUS_" varchar) IMMUTABLE_ROWS = true') # # # 创建表 # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) i = random.randint(0, 100000000) re_data = eval(data) re_data["ID_"] = re_data["ID_"] + "_" + str(i) if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format(self.success_count)) # 向 HBase 插入数据 try: count = self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=re_data) self.success_count += count return self.success_count except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format(self.data_id, e)) return None def final(self): self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class AllToPhoenix(object): def __init__(self): # "CNINFONEWS" pdf too long self.code_list = [ "CAIJINGNEWS", "CNINFONEWS", "CSFINACIAL", "CSFINACIALNEWS", "CSNEWS", "CSNOTICE", "FINAQQNEWS", "XLCJYHMKNEWS", "XLCJNEWS", "XLCJGSNEWS", "WYCJNEWS", "WYCJGSNEWS", "NEWS163DOM", "NEWS10JQKA2", "NEWS10JQKA", "HOUSEQQNEWS" ] self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 # 插入 spider_data_old 的数据列表 # self.copy_mongo_data_list = list() # 删除 spider_data 的数据 _id 列表 # self.remove_id_list = list() # self.branch_code_list = list() self.verify_list = [ "ENTITY_CODE_", "ENTITY_NAME_", "URL_", "PERIOD_CODE_", "STATUS_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "BANK_NAME_", "BANK_CODE_", "CONTENT_", "DATA_SOURCE_", "KEYWORDS_", "ENTITY_NAME_", "ID_" ] # 从 MongoDB 获取数据 def get_data_from_mongo(self, m_client, collection, entity_code, find_id): m_client.mongo_db = "spider_data" m_client.mongo_entity_code = entity_code try: mongo_data_list = m_client.search_from_mongodb( collection=collection, data_id=find_id) return mongo_data_list except pymongo.errors.ServerSelectionTimeoutError: self.logger.info("连接失败,正在重新连接") sleep(1) mongo_data_list = m_client.search_from_mongodb( collection=collection, data_id=find_id) return mongo_data_list except Exception as e: self.logger.info(e) return None # 从 MongoDB 删除数据 def delete_data_from_mongo(self, m_client, collection, entity_code, remove_id_list): m_client.mongo_entity_code = entity_code try: remove_count = m_client.remove_from_mongo( collection=collection, remove_id_list=remove_id_list) return remove_count except pymongo.errors.ServerSelectionTimeoutError: remove_count = m_client.remove_from_mongo( collection=collection, remove_id_list=remove_id_list) return remove_count except Exception as e: self.logger.info(e) return None def get_brief_from_ai(self, data): data["CONTENT_"] = data["CONTENT_"].replace("|", "") if data["PUBLISH_TIME_"]: data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"][:10] # ID hash_m = hashlib.md5() hash_m.update(data["URL_"].encode("utf-8")) hash_title = hash_m.hexdigest() data["ID_"] = data["ENTITY_CODE_"] + "_" + str(hash_title) text = data["CONTENT_"] ex_line = ("python3" + " " + AI_PATH + " " + '\"{}\"'.format(text) + " " + "1") # print(ex_line) r = os.popen(ex_line) # print(1, r.read()) data["BRIEF_"] = r.read() return data # 主函数 def run(self): # # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="NEWS_FINASSIST") p_client.verify_list = self.verify_list # # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="NEWS_FINASSIST") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # # 删除表 # p_client.drop_table_phoenix(connection=connection) # # # 表创建语句 # sql = ('create table "NEWS_FINASSIST" ("ID_" varchar primary key, "T"."CONTENT_" varchar, ' # '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."TITLE_" varchar, "C"."BRIEF_" varchar, ' # '"C"."PUBLISH_TIME_" varchar, "C"."KEYWORDS_" varchar, "C"."URL_" varchar, "C"."DATA_SOURCE_" varchar,' # '"C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,' # '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,' # '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "F"."STATUS_" varchar)' # 'IMMUTABLE_ROWS = true') # # 创建表 # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 遍历 ENTITY_CODE_ 列表 for entity_code in self.code_list: status = False module_name = __import__(entity_code) self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code)) # self.remove_id_list = [] # self.copy_mongo_data_list = [] # self.branch_code_list = [] if entity_code == "CAIJINGNEWS": find_id = "5c6bfa508d7fee512a4ca68f" else: find_id = "" # find_id = "" try: mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code, find_id=find_id) except pymongo.errors.ServerSelectionTimeoutError: sleep(1) mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code, find_id=find_id) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 self.find_count = mongo_data_list.count() for i in range(1000000): try: data = mongo_data_list.__next__() except pymongo.errors.ServerSelectionTimeoutError: continue except StopIteration: break # for data in mongo_data_list: data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info( "running on data_id: {}".format(data_id)) # print(data_id) # copy_data = {} # self.remove_id_list.append(data_id) try: del data["_id"] # copy_data = deepcopy(data) # self.copy_mongo_data_list.append(copy_data) data_list = [data] re_data = module_name.data_shuffle(data_list) if not re_data: self.bad_count += 1 # self.remove_id_list.remove(data_id) continue except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) continue if isinstance(re_data, list): for list_data in re_data: # phoenix_HBase 插入数据 if list_data: try: if entity_code != "CNINFONEWS": ai_data = self.get_brief_from_ai( data=list_data) else: ai_data = list_data # print(ai_data["CONTENT_"]) except Exception as e: self.logger.info("AI 调取失败, 错误信息", e) ai_data = re_data try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=ai_data) once_count += success_count self.success_count += success_count if self.success_count % 10 == 0: self.logger.info( "HBase 插入成功, 成功条数{}条".format( once_count)) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue try: # 添加 {d:1} update_count = m_client.update_to_mongodb( collection=collection, data_id=data_id, data_dict={"d": 1}) self.remove_count += update_count # self.logger.info("MongoDB 更新成功") if self.remove_count % 10 == 0: self.logger.info( "MongoDB 更新成功, 成功条数 {} 条".format( "10")) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "MongoDB 更新 _id 为 {} 的数据失败, {}".format( data_id, e)) continue elif isinstance(re_data, dict): # phoenix_HBase 插入数据 if re_data: try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=re_data) once_count += success_count self.success_count += success_count self.logger.info( "HBase 插入成功, 成功条数 {} 条".format( success_count)) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if once_count > 0: status = True self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format( entity_code, once_count)) mongo_data_list.close() else: continue # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class Trend(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="TREND") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="CHA_BRANCH_MARKET_ACT") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def data_shuffle(self, data): re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # 分行 copy_result = dict() copy_result["ID_"] = row_key copy_result["ENTITY_CODE_"] = data["ENTITY_CODE_"] copy_result["ENTITY_NAME_"] = data["ENTITY_NAME_"] copy_result["URL_"] = data["URL_"] # copy_result["PROVINCE_CODE_"] = result[""] # copy_result["PROVINCE_NAME_"] = result[""] # copy_result["CITY_CODE_"] = result[""] # copy_result["CITY_NAME_"] = result[""] # copy_result["AREA_CODE_"] = result[""] # copy_result["AREA_NAME_"] = result[""] # copy_result["LAT_"] = result[""] # copy_result["LNG_"] = result[""] copy_result["APP_VERSION_"] = "BRANCH" copy_result["BANK_CODE_"] = data["ENTITY_CODE_"].replace( "PRIVATEINFO", "") copy_result["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "") # copy_result["UNIT_CODE_"] = result["UNIT_CODE_"] # copy_result["UNIT_NAME_"] = result[""] copy_result["PERIOD_CODE_"] = data["NOTICE_TIME_"].replace("-", "") # copy_result["REMARK_"] = result[""] time_array = time.localtime() create_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) copy_result["CREATE_TIME_"] = create_time copy_result["SPIDER_TIME_"] = data["DATETIME_"] # copy_result["MODIFIED_TIME_"] = result[""] copy_result["CREATE_BY_ID_"] = "P0131857" copy_result["CREATE_BY_NAME_"] = "钟楷文" # copy_result["MODIFIED_BY_ID_"] = result[""] # copy_result["MODIFIED_BY_NAME_"] = result[""] copy_result["M_STATUS_"] = "0" copy_result["DELETE_STATUS_"] = "0" copy_result["DATA_STATUS_"] = "uncheck" # copy_result["TAGS_"] = result[""] source = re.findall(r"(https?://.*?)/", data["URL_"]) copy_result["SOURCE_"] = source[0] copy_result["SOURCE_NAME_"] = data["ENTITY_NAME_"] # copy_result["SOURCE_TYPE_"] = result[""] # copy_result["HOT_"] = result[""] # copy_result["IMPORTANCE_"] = result[""] copy_result["ACT_NAME_"] = data["TITLE_"] # copy_result["IMAGES_"] = data[""] # copy_result["TARGET_"] = data[""] # copy_result["BRIEF_"] = data[""] copy_result["DETAILS_"] = data["CONTENT_"] # copy_result["RULE_"] = data[""] # copy_result["START_TIME_"] = data[""] # copy_result["END_TIME_"] = data[""] # copy_result["ACT_TYPE1_"] = data[""] # copy_result["ACT_TYPE2_"] = data[""] # copy_result["ACT_TYPE3_"] = data[""] copy_result["PUBLISH_TIME_"] = data["NOTICE_TIME_"] # copy_result["READS_"] = data[""] # copy_result["LIKES_"] = data[""] # copy_result["COMMENTS_"] = data[""] # copy_result["JOINS_"] = data[""] # copy_result["RELAYS_"] = data[""] # copy_result["SOURCE_ID_"] = data[""] # copy_result["HTML_"] = data[""] # copy_result["SOURCE_OWN_NAME_"] = data[""] # copy_result["SOURCE_OWN_ID_"] = data[""] return copy_result # "C" # re_data["ID_"] = row_key # re_data["TYPE_"] = random.choice( # ["税务法律", "子女教育", "健康医养", "财富管理", "生活娱乐", "旅游出行", "艺术/艺术品", "节日庆贺", "其他"]) # re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] # re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] # re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace("PRIVATEINFO", "") # re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "") # # re_data["AREA_CODE_"] # # re_data["UNIT_CODE_"] # period_code = data["NOTICE_TIME_"].replace("-", "") # re_data["PERIOD_CODE_"] = period_code # re_data["CONTENT_"] = data["CONTENT_"] # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"] # re_data["STATUS_"] = "1" # # re_data["REMARK_"] = "" # re_data["CREATE_TIME_"] = data["DATETIME_"] # # re_data["UPDATE_TIME_"] # re_data["TITLE_"] = data["TITLE_"] # re_data["URL_"] = data["URL_"] # re_data["DEALTIME_"] = data["DEALTIME_"] # # re_data["DATETIME_"] = data["DATETIME_"] # # re_data["SOURCE_TYPE_"] # # return re_data def run(self): # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # quit() # add colum # self.p_client.add_column_phoenix(connection=self.connection, column="SOURCE_TYPE_") # quit() # create table sql # table_sql = ('create table "MARKETING_ACT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."TITLE_" varchar,"C"."NOTICE_TIME_" varchar,' # '"T"."CONTENT_" varchar,"C"."OBJ_" varchar, "C"."ATENDANCE_" varchar, "C"."PERIOD_CODE_" varchar,' # '"C"."IMAGES_" varchar, "C"."RESULTS_" varchar,"C"."PLACE_" varchar, "C"."TYPE_" varchar,' # '"C"."READ_NUM_" varchar, "C"."CONTENT_NUM_" varchar, "C"."COMMENT_CONTENT_" varchar, ' # '"C"."FORWARD_NUM_" varchar, "C"."COLLECTION_NUM_" varchar, "C"."PRAISE_NUM_" varchar,' # '"C"."BANK_NAME_" varchar, "C"."STATUS_" varchar, "C"."REMARK_" varchar, "C"."SOURCE_ID_" varchar,' # '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,"C"."SOURCE_" varchar,' # '"C"."URL_" varchar, "C"."BANK_CODE_" varchar, "C"."DEALTIME_" varchar, ' # '"C"."SOURCE_TYPE_" varchar, "C"."IMPROTANCE_" varchar) IMMUTABLE_ROWS = true') # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection) # for i in range(mongo_data_list.count() + 100): for i in range(100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) print(data["_id"]) # todo remove and upsert data from mongo # shuffle data try: re_data = self.data_shuffle(data=data) except Exception as e: self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # add {d:1} try: self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, data_dict={"d": 1}) self.remove_count += 1 if self.remove_count % 10 == 0: self.logger.info("MongoDB 更新成功, 成功条数 {}".format( self.remove_count)) except Exception as e: self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format( self.data_id, e)) continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class Meipian(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="meipian_CCBDATA") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="MEIPIAN_CCBDATA") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def data_shuffle(self, data): re_data = dict() if data["TITLE_"]: # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" 通用列族字段 re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] # re_data["BANK_NAME_"] time_arrary = arrow.get(data["CREATE_TIME"]) period_code = time_arrary.format("YYYYMMDD") publish_time = time_arrary.format("YYYY-MM-DD HH:mm:ss") re_data["PERIOD_CODE_"] = str(period_code) re_data["PUBLISH_TIME_"] = str(publish_time) re_data["STATUS_"] = "UNPROCESSED" re_data["CONTENT_"] = data["CONTENT_"] re_data["REMARK_"] = "" # re_data["AREA_CODE_"] # re_data["UNIT_CODE_"] re_data["CREATE_TIME_"] = data["DATETIME_"] re_data["URL_"] = data["URL_"] re_data["TITLE_"] = data["TITLE_"] re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["DEALTIME_"] = str(data["DEALTIME_"]) re_data["VISIT_COUNT_"] = data["VISIT_COUNT"] re_data["PRAISE_COUNT_"] = data["PRAISE_COUNT"] re_data["COMMENT_COUNT_"] = data["COMMENT_COUNT"] re_data["SOURCE_"] = data["SOURCE_"] return re_data else: return None def run(self): # # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # # create table sql # table_sql = ('create table "MEIPIAN_CCBDATA" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."URL_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,' # ' "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, ' # '"C"."TITLE_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."ENTITY_NAME_" varchar,' # '"C"."VISIT_COUNT_" varchar, "C"."PRAISE_COUNT_" varchar, "C"."COMMENT_COUNT_" varchar,' # '"C"."DEALTIME_" varchar, "C"."SOURCE_" varchar, "C"."PUBLISH_TIME_" varchar,' # '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true') # # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) # f_id = "5c6fa1328d7fee306de9463d" # quit() # f_id = "5c6fe1ba8d7fee1d44775989" # quit() # f_id = "5c6fdb448d7fee394da6a5fb" # quit() Exception while executing batch. # f_id = "5c6fe1ba8d7fee1d44775989" f_id = "5c6fe11b9bb3df6b0ec6168b" # gt 10M mongo_data_list = self.m_client.all_from_mongodb(self.collection, data_id=f_id) for i in range(mongo_data_list.count() + 100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) print(data["_id"]) # todo remove and upsert data from mongo # shuffle data try: re_data = self.data_shuffle(data=data) except Exception as e: self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # add {d:1} try: self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, data_dict={"d": 1}) self.remove_count += 1 if self.remove_count % 10 == 0: self.logger.info("MongoDB 更新成功, 成功条数 {}".format( self.remove_count)) except Exception as e: self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format( self.data_id, e)) continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class BaiduSearch(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="BAIDU_SEARCH") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="BAIDU_SEARCH") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" # BANK_NAME_ 字典 交通银行 BOCOM 改为 COMM 中信银行 ECITIC 改为 CITIC 增加 平安银行 北京银行 上海银行 self.name_dict = { "ICBC": "中国工商银行", "ABC": "中国农业银行", "BOC": "中国银行", "CCB": "中国建设银行", "COMM": "交通银行", "PSBC": "中国邮政储蓄银行", "CZB": "浙商银行", "CBHB": "渤海银行", "CITIC": "中信银行", "CEB": "中国光大银行", "HXB": "华夏银行", "CMBC": "中国民生银行", "CMB": "招商银行", "CIB": "兴业银行", "CGB": "广发银行", "PAB": "平安银行", "SPDB": "浦发银行", "EBCL": "恒丰银行", "PINGAN": "平安银行", "LTD": "中国光大银行", "BEIJING": "北京银行", "BOSC": "上海银行" } # TYPE_ 列表 self.type_list = [ "Market", "Activity", "GoodStart", "MidSeason", "PrivateBank", "Recommendation" ] def data_shuffle(self, data): re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] # re_data["AREA_CODE_"] for bank_c in self.name_dict: if bank_c in data["ENTITY_CODE_"]: re_data["BANK_CODE_"] = bank_c break if "BANK_CODE_" in re_data: re_data["BANK_NAME_"] = self.name_dict[re_data["BANK_CODE_"]] else: print(data["ENTITY_CODE_"]) # re_data["UNIT_CODE_"] # re_data["PERIOD_CODE_"] = "" re_data["CONTENT_"] = data["CONTENT_"] re_data["STATUS_"] = "UNPROCESSED" # re_data["REMARK_"] = "" re_data["CREATE_TIME_"] = data["DATETIME_"] # re_data["UPDATE_TIME_"] for type in self.type_list: if type in data["ENTITY_CODE_"]: re_data["TYPE_"] = type break re_data["TITLE_"] = data["TITLE_"] re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] # re_data["DATETIME_"] = data["DATETIME_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] return re_data def run(self): # # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # # create table sql # table_sql = ('create table "BAIDU_SEARCH" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,"C"."REMARK_" varchar,' # ' "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, ' # '"C"."TITLE_" varchar, "C"."URL_" varchar, "C"."ENTITY_NAME_" varchar,"C"."TYPE_" varchar,' # '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar) IMMUTABLE_ROWS = true') # # # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection) for i in range(mongo_data_list.count() + 100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) print(data["_id"]) # todo remove and upsert data from mongo # shuffle data try: re_data = self.data_shuffle(data=data) except Exception as e: self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # add {d:1} try: self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, data_dict={"d": 1}) self.remove_count += 1 if self.remove_count % 10 == 0: self.logger.info("MongoDB 更新成功, 成功条数 {}".format( self.remove_count)) except Exception as e: self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format( self.data_id, e)) continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class Entrust(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="JSENTRUST_CCBDATA") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="ENTRUST") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def data_shuffle(self, data): re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["NAME_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["CREATE_TIME_"] = data["DATETIME_"] re_data["STATUS_"] = "1" re_data["DEALTIME_"] = data["DEALTIME_"] re_data["URL_"] = data["URL_"] if data["ENTITY_CODE_"] == "CHINATRC": # "C" # re_data["AREA_CODE_"] # re_data["BANK_CODE_"] # re_data["BANK_NAME_"] # re_data["UNIT_CODE_"] pub_date = eval(data["PUB_DATE_"]) date = str(pub_date["time"])[:-3] t = arrow.get(int(date)) publish_date = str(t)[:10] period_code = publish_date.replace("-", "") re_data["PERIOD_CODE_"] = period_code # re_data["REMARK_"] # re_data["UPDATE_TIME_"] re_data["CODE_"] = data["CODE_"] re_data["NAME_"] = data["NAME_"] re_data["ISSUER_"] = data["ISSUER_"] re_data["FUNCTION_"] = data["FUNCTION_"] pro_date = eval(data["PRO_START_"]) pro_date = str(pro_date["time"])[:-3] p_t = arrow.get(int(pro_date)) product_date = str(p_t)[:10] re_data["PRO_START_"] = product_date re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"] re_data["RUN_MODE_"] = data["RUN_MODE_"] re_data["INDUSTRY_"] = data["INDUSTRY_"] re_data["PUB_DATE_"] = publish_date # re_data["SCALE_"] = data[""] # re_data["MONTH_"] # re_data["YIELD_RATE_"] # re_data["START_FUNDS_"] # re_data["PURPOSE_"] # re_data["ESTAB_ANNOUNCEMENT_"] # re_data["ENTRUST_STATUS_"] # # re_data["DISTRIBU_MODE_"] # re_data["INVEST_AREA_"] # re_data["TERM_TYPE_"] = data["TERM_TYPE_"] # re_data["INVEST_DIRECTION_"] # re_data["INVEST_MODE_"] = data["INVEST_MODE_"] # re_data["CURRENCY_"] # re_data["MANAGE_TYPE_"] # re_data["SALE_TARGET_"] # re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"] # re_data["ISSUER_AREA_"] # re_data["RESERVE_INFO_"] # re_data["TRUSTEESHIP_BANK_"] # re_data["OTHER_INFO_"] # re_data["OTHER_INFO_"] elif data["ENTITY_CODE_"] == "TRUSTHEXUN": # "C" # re_data["AREA_CODE_"] # re_data["BANK_CODE_"] # re_data["BANK_NAME_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "") # re_data["REMARK_"] # re_data["UPDATE_TIME_"] # re_data["CODE_"] = data["CODE_"] re_data["NAME_"] = data["NAME_"] re_data["ISSUER_"] = data["ISSUER_"] # re_data["FUNCTION_"] = data["FUNCTION_"] # re_data["PRO_START_"] re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace( "至月", "") # re_data["RUN_MODE_"] = data["RUN_MODE_"] re_data["INDUSTRY_"] = data["INDUSTRY_"] re_data["PUB_DATE_"] = data["PUB_DATE_"] re_data["SCALE_"] = data["SCALE_"] # re_data["MONTH_"] re_data["YIELD_RATE_"] = data["YIELD_RATE_"] re_data["START_FUNDS_"] = data["START_FUNDS_"] # re_data["PURPOSE_"] # re_data["ESTAB_ANNOUNCEMENT_"] # re_data["ENTRUST_STATUS_"] # # re_data["DISTRIBU_MODE_"] # re_data["INVEST_AREA_"] # re_data["TERM_TYPE_"] = data["TERM_TYPE_"] # re_data["INVEST_DIRECTION_"] re_data["INVEST_MODE_"] = data["INVEST_MODE_"] re_data["CURRENCY_"] = data["CURRENCY_"] re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"] re_data["SALE_TARGET_"] = data["SALE_TARGET_"] re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"] re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"] re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"] # re_data["TRUSTEESHIP_BANK_"] re_data["OTHER_INFO_"] = data["OTHER_INFO_"] elif data["ENTITY_CODE_"] == "YANGLEE": # "C" # re_data["AREA_CODE_"] # re_data["BANK_CODE_"] # re_data["BANK_NAME_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "") # # re_data["REMARK_"] # # re_data["UPDATE_TIME_"] # re_data["CODE_"] = data["CODE_"] re_data["NAME_"] = data["NAME_"] re_data["ISSUER_"] = data["ISSUER_"] # re_data["FUNCTION_"] = data["FUNCTION_"] # # re_data["PRO_START_"] re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"] # re_data["RUN_MODE_"] = data["RUN_MODE_"] re_data["INDUSTRY_"] = data["INDUSTRY_"] re_data["PUB_DATE_"] = data["PUB_DATE_"] # re_data["SCALE_"] = data["SCALE_"] # # re_data["MONTH_"] re_data["YIELD_RATE_"] = data["YIELD_RATE_"] re_data["START_FUNDS_"] = data["START_FUNDS_"] # # re_data["PURPOSE_"] # # re_data["ESTAB_ANNOUNCEMENT_"] re_data["ENTRUST_STATUS_"] = data["STATUS_"] # re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"] # # re_data["INVEST_AREA_"] re_data["TERM_TYPE_"] = data["TERM_TYPE_"] # # re_data["INVEST_DIRECTION_"] # re_data["INVEST_MODE_"] = data["INVEST_MODE_"] # re_data["CURRENCY_"] = data["CURRENCY_"] # re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"] # re_data["SALE_TARGET_"] = data["SALE_TARGET_"] # re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"] re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"] # re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"] re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"] re_data["OTHER_INFO_"] = data["OTHER_INFO_"] elif data["ENTITY_CODE_"] == "TRUSTONE": # "C" # re_data["AREA_CODE_"] # re_data["BANK_CODE_"] # re_data["BANK_NAME_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "") # # re_data["REMARK_"] # # re_data["UPDATE_TIME_"] # re_data["CODE_"] = data["CODE_"] re_data["NAME_"] = data["NAME_"] re_data["ISSUER_"] = data["ISSUER_"] # re_data["FUNCTION_"] = data["FUNCTION_"] # # re_data["PRO_START_"] # re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"] # re_data["RUN_MODE_"] = data["RUN_MODE_"] # re_data["INDUSTRY_"] = data["INDUSTRY_"] re_data["PUB_DATE_"] = data["PUB_DATE_"] re_data["SCALE_"] = data["SCALE_"] # # re_data["MONTH_"] re_data["YIELD_RATE_"] = data["YIELD_RATE_"] re_data["START_FUNDS_"] = data["START_FUNDS_"] # # re_data["PURPOSE_"] # # re_data["ESTAB_ANNOUNCEMENT_"] # re_data["ENTRUST_STATUS_"] = data["STATUS_"] # # re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"] re_data["INVEST_AREA_"] = data["INVEST_AREA_"] re_data["TERM_TYPE_"] = data["TERM_TYPE_"] re_data["INVEST_DIRECTION_"] = data["INVEST_DIRECTION_"] re_data["INVEST_MODE_"] = data["INVEST_MODE_"] # re_data["CURRENCY_"] = data["CURRENCY_"] # re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"] # re_data["SALE_TARGET_"] = data["SALE_TARGET_"] re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"] # re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"] re_data["RESERVE_INFO_"] = re.sub(r"</?\w*>", "", data["RESERVE_INFO_"]) # re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"] # re_data["OTHER_INFO_"] = data["OTHER_INFO_"] return re_data def run(self): # # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # # create table sql # table_sql = ('create table "ENTRUST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."CREATE_TIME_" varchar, "C"."STATUS_" varchar,' # '"C"."DEALTIME_" varchar, "C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."FUNCTION_" varchar,' # '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar,' # '"C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, "C"."UPDATE_TIME_" varchar,' # '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."ISSUER_" varchar, "C"."PRO_START_" varchar,' # '"C"."INVEST_PERIOD_" varchar,"C"."RUN_MODE_" varchar, "C"."INDUSTRY_" varchar,' # '"C"."PUB_DATE_" varchar, "C"."SCALE_" varchar, "C"."MONTH_" varchar, "C"."YIELD_RATE_" varchar,' # '"C"."START_FUNDS_" varchar, "C"."PURPOSE_" varchar, "C"."ESTAB_ANNOUNCEMENT_" varchar,' # '"C"."ENTRUST_STATUS_" varchar, "C"."DISTRIBU_MODE_" varchar, "C"."INVEST_AREA_" varchar,' # '"C"."TERM_TYPE_" varchar, "C"."INVEST_DIRECTION_" varchar, "C"."INVEST_MODE_" varchar,' # '"C"."CURRENCY_" varchar, "C"."MANAGE_TYPE_" varchar, "C"."SALE_TARGET_" varchar,' # '"C"."PROFIT_TYPE_" varchar, "C"."ISSUER_AREA_" varchar, "C"."RESERVE_INFO_" varchar,' # '"C"."TRUSTEESHIP_BANK_" varchar, "C"."OTHER_INFO_" varchar) IMMUTABLE_ROWS = true') # # # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection, data_id="5c67307d9bb3df76b4229f79") for i in range(mongo_data_list.count() + 100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) print(data["_id"]) # todo remove and upsert data from mongo # shuffle data # try: re_data = self.data_shuffle(data=data) # except Exception as e: # self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) # continue if re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # # add {d:1} # try: # self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, # data_dict={"d": 1}) # self.remove_count += 1 # if self.remove_count % 10 == 0: # self.logger.info("MongoDB 更新成功, 成功条数 {}".format(self.remove_count)) # except Exception as e: # self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e)) # continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()