class Trend(object):
    """Migrates "TREND" documents from MongoDB into the Phoenix/HBase table
    "CHA_BRANCH_MARKET_ACT" and flags each migrated MongoDB document with
    {d: 1} so it is not migrated again."""

    def __init__(self):
        # Create the MongoDB client object and resolve the working collection
        self.m_client = MongoClient(mongo_collection="TREND")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Create the Phoenix client object
        self.p_client = PhoenixHbase(table_name="CHA_BRANCH_MARKET_ACT")
        # Connect to Phoenix
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Progress counters for one migration run
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        """Map one raw MongoDB document to a dict keyed by HBase column names.

        :param data: MongoDB document; reads TITLE_, ENTITY_CODE_,
                     ENTITY_NAME_, URL_, NOTICE_TIME_, DATETIME_, CONTENT_
        :return: dict ready for upsert_to_phoenix_by_one
        """
        re_data = dict()  # NOTE(review): unused — leftover from the older mapping kept in comments below
        # HBase row_key = ENTITY_CODE_ + "_" + md5(TITLE_)
        hash_m = hashlib.md5()
        hash_m.update(data["TITLE_"].encode("utf-8"))
        hash_title = hash_m.hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)
        # Branch-level record
        copy_result = dict()
        copy_result["ID_"] = row_key
        copy_result["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        copy_result["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        copy_result["URL_"] = data["URL_"]
        # copy_result["PROVINCE_CODE_"] = result[""]
        # copy_result["PROVINCE_NAME_"] = result[""]
        # copy_result["CITY_CODE_"] = result[""]
        # copy_result["CITY_NAME_"] = result[""]
        # copy_result["AREA_CODE_"] = result[""]
        # copy_result["AREA_NAME_"] = result[""]
        # copy_result["LAT_"] = result[""]
        # copy_result["LNG_"] = result[""]
        copy_result["APP_VERSION_"] = "BRANCH"
        # Bank code/name are derived by stripping fixed affixes from the entity fields
        copy_result["BANK_CODE_"] = data["ENTITY_CODE_"].replace(
            "PRIVATEINFO", "")
        copy_result["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "")
        # copy_result["UNIT_CODE_"] = result["UNIT_CODE_"]
        # copy_result["UNIT_NAME_"] = result[""]
        copy_result["PERIOD_CODE_"] = data["NOTICE_TIME_"].replace("-", "")
        # copy_result["REMARK_"] = result[""]
        # CREATE_TIME_ is the migration timestamp (now); SPIDER_TIME_ is the crawl time
        time_array = time.localtime()
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
        copy_result["CREATE_TIME_"] = create_time
        copy_result["SPIDER_TIME_"] = data["DATETIME_"]
        # copy_result["MODIFIED_TIME_"] = result[""]
        copy_result["CREATE_BY_ID_"] = "P0131857"
        copy_result["CREATE_BY_NAME_"] = "钟楷文"
        # copy_result["MODIFIED_BY_ID_"] = result[""]
        # copy_result["MODIFIED_BY_NAME_"] = result[""]
        copy_result["M_STATUS_"] = "0"
        copy_result["DELETE_STATUS_"] = "0"
        copy_result["DATA_STATUS_"] = "uncheck"
        # copy_result["TAGS_"] = result[""]
        # SOURCE_ is the scheme+host prefix of the URL; source[0] raises
        # IndexError when the URL has no "http(s)://host/" prefix — the
        # caller (run) catches that and counts the document as bad data
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        copy_result["SOURCE_"] = source[0]
        copy_result["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        # copy_result["SOURCE_TYPE_"] = result[""]
        # copy_result["HOT_"] = result[""]
        # copy_result["IMPORTANCE_"] = result[""]
        copy_result["ACT_NAME_"] = data["TITLE_"]
        # copy_result["IMAGES_"] = data[""]
        # copy_result["TARGET_"] = data[""]
        # copy_result["BRIEF_"] = data[""]
        copy_result["DETAILS_"] = data["CONTENT_"]
        # copy_result["RULE_"] = data[""]
        # copy_result["START_TIME_"] = data[""]
        # copy_result["END_TIME_"] = data[""]
        # copy_result["ACT_TYPE1_"] = data[""]
        # copy_result["ACT_TYPE2_"] = data[""]
        # copy_result["ACT_TYPE3_"] = data[""]
        copy_result["PUBLISH_TIME_"] = data["NOTICE_TIME_"]
        # copy_result["READS_"] = data[""]
        # copy_result["LIKES_"] = data[""]
        # copy_result["COMMENTS_"] = data[""]
        # copy_result["JOINS_"] = data[""]
        # copy_result["RELAYS_"] = data[""]
        # copy_result["SOURCE_ID_"] = data[""]
        # copy_result["HTML_"] = data[""]
        # copy_result["SOURCE_OWN_NAME_"] = data[""]
        # copy_result["SOURCE_OWN_ID_"] = data[""]
        return copy_result
        # --- previous "C" mapping kept for reference (dead code after return) ---
        # "C"
        # re_data["ID_"] = row_key
        # re_data["TYPE_"] = random.choice(
        #     ["税务法律", "子女教育", "健康医养", "财富管理", "生活娱乐", "旅游出行", "艺术/艺术品", "节日庆贺", "其他"])
        # re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        # re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        # re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace("PRIVATEINFO", "")
        # re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "")
        # # re_data["AREA_CODE_"]
        # # re_data["UNIT_CODE_"]
        # period_code = data["NOTICE_TIME_"].replace("-", "")
        # re_data["PERIOD_CODE_"] = period_code
        # re_data["CONTENT_"] = data["CONTENT_"]
        # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"]
        # re_data["STATUS_"] = "1"
        # # re_data["REMARK_"] = ""
        # re_data["CREATE_TIME_"] = data["DATETIME_"]
        # # re_data["UPDATE_TIME_"]
        # re_data["TITLE_"] = data["TITLE_"]
        # re_data["URL_"] = data["URL_"]
        # re_data["DEALTIME_"] = data["DEALTIME_"]
        # # re_data["DATETIME_"] = data["DATETIME_"]
        # # re_data["SOURCE_TYPE_"]
        # # return re_data

    def run(self):
        """Pull up to 100 pending documents from MongoDB, upsert each into
        Phoenix/HBase and mark it migrated in MongoDB with {d: 1}; log
        summary counters at the end."""
        # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # quit()
        # add colum
        # self.p_client.add_column_phoenix(connection=self.connection, column="SOURCE_TYPE_")
        # quit()
        # create table sql
        # table_sql = ('create table "MARKETING_ACT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."TITLE_" varchar,"C"."NOTICE_TIME_" varchar,'
        #              '"T"."CONTENT_" varchar,"C"."OBJ_" varchar, "C"."ATENDANCE_" varchar, "C"."PERIOD_CODE_" varchar,'
        #              '"C"."IMAGES_" varchar, "C"."RESULTS_" varchar,"C"."PLACE_" varchar, "C"."TYPE_" varchar,'
        #              '"C"."READ_NUM_" varchar, "C"."CONTENT_NUM_" varchar, "C"."COMMENT_CONTENT_" varchar, '
        #              '"C"."FORWARD_NUM_" varchar, "C"."COLLECTION_NUM_" varchar, "C"."PRAISE_NUM_" varchar,'
        #              '"C"."BANK_NAME_" varchar, "C"."STATUS_" varchar, "C"."REMARK_" varchar, "C"."SOURCE_ID_" varchar,'
        #              '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,"C"."SOURCE_" varchar,'
        #              '"C"."URL_" varchar, "C"."BANK_CODE_" varchar, "C"."DEALTIME_" varchar, '
        #              '"C"."SOURCE_TYPE_" varchar, "C"."IMPROTANCE_" varchar) IMMUTABLE_ROWS = true')
        # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)
        # for i in range(mongo_data_list.count() + 100):
        for i in range(100):  # batch size capped at 100 documents per run
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                # Retry once after a short back-off; a second timeout propagates
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()
            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])
            # todo remove and upsert data from mongo
            # shuffle data
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue
            if re_data:
                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # add {d:1} — mark the document as migrated in MongoDB
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue
                if success_count > 0:
                    status = True  # NOTE(review): never read afterwards — leftover flag
                    self.success_count += success_count
                    if self.success_count % 10 == 0:
                        self.logger.info("HBase 插入成功 {} 条".format(
                            self.success_count))
            else:
                # data_shuffle returned a falsy value: count as bad data
                self.bad_count += 1
                continue
        mongo_data_list.close()
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class BaiduSearch(object):
    """Migrates "BAIDU_SEARCH" documents from MongoDB into the Phoenix/HBase
    table "BAIDU_SEARCH" and flags each migrated MongoDB document with
    {d: 1} so it is not migrated again.

    Fixes over the previous revision: the loop variable in data_shuffle no
    longer shadows the builtin ``type``, and the dead ``status`` flag in
    run() was removed.
    """

    def __init__(self):
        # Create the MongoDB client object and resolve the working collection
        self.m_client = MongoClient(mongo_collection="BAIDU_SEARCH")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Create the Phoenix client object
        self.p_client = PhoenixHbase(table_name="BAIDU_SEARCH")
        # Connect to Phoenix
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Progress counters for one migration run
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""
        # BANK_NAME_ mapping: bank-code fragment found inside ENTITY_CODE_ ->
        # Chinese bank name. Historical renames: BOCOM -> COMM and
        # ECITIC -> CITIC; PINGAN / BEIJING / BOSC entries were added later.
        self.name_dict = {
            "ICBC": "中国工商银行",
            "ABC": "中国农业银行",
            "BOC": "中国银行",
            "CCB": "中国建设银行",
            "COMM": "交通银行",
            "PSBC": "中国邮政储蓄银行",
            "CZB": "浙商银行",
            "CBHB": "渤海银行",
            "CITIC": "中信银行",
            "CEB": "中国光大银行",
            "HXB": "华夏银行",
            "CMBC": "中国民生银行",
            "CMB": "招商银行",
            "CIB": "兴业银行",
            "CGB": "广发银行",
            "PAB": "平安银行",
            "SPDB": "浦发银行",
            "EBCL": "恒丰银行",
            "PINGAN": "平安银行",
            "LTD": "中国光大银行",
            "BEIJING": "北京银行",
            "BOSC": "上海银行"
        }
        # TYPE_ candidates searched for inside ENTITY_CODE_
        self.type_list = [
            "Market", "Activity", "GoodStart", "MidSeason", "PrivateBank",
            "Recommendation"
        ]

    def data_shuffle(self, data):
        """Map one raw MongoDB document to a dict keyed by HBase column names.

        :param data: MongoDB document; reads TITLE_, ENTITY_CODE_, CONTENT_,
                     DATETIME_, URL_, DEALTIME_, ENTITY_NAME_
        :return: dict ready for upsert_to_phoenix_by_one (BANK_CODE_ /
                 BANK_NAME_ / TYPE_ are omitted when no fragment matches)
        """
        re_data = dict()
        # HBase row_key = ENTITY_CODE_ + "_" + md5(TITLE_)
        hash_m = hashlib.md5()
        hash_m.update(data["TITLE_"].encode("utf-8"))
        hash_title = hash_m.hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)
        # "C" column family
        re_data["ID_"] = row_key
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        # re_data["AREA_CODE_"]
        # First bank-code fragment found inside ENTITY_CODE_ wins
        for bank_c in self.name_dict:
            if bank_c in data["ENTITY_CODE_"]:
                re_data["BANK_CODE_"] = bank_c
                break
        if "BANK_CODE_" in re_data:
            re_data["BANK_NAME_"] = self.name_dict[re_data["BANK_CODE_"]]
        else:
            # Unknown bank: surface the code on stdout; the row is still
            # emitted, just without BANK_CODE_/BANK_NAME_
            print(data["ENTITY_CODE_"])
        # re_data["UNIT_CODE_"]
        # re_data["PERIOD_CODE_"] = ""
        re_data["CONTENT_"] = data["CONTENT_"]
        re_data["STATUS_"] = "UNPROCESSED"
        # re_data["REMARK_"] = ""
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        # re_data["UPDATE_TIME_"]
        # fix: the loop variable used to shadow the builtin `type`
        for type_name in self.type_list:
            if type_name in data["ENTITY_CODE_"]:
                re_data["TYPE_"] = type_name
                break
        re_data["TITLE_"] = data["TITLE_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        # re_data["DATETIME_"] = data["DATETIME_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        return re_data

    def run(self):
        """Iterate every pending document, upsert it into Phoenix/HBase and
        mark it migrated in MongoDB with {d: 1}; log summary counters."""
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        #
        # # create table sql
        # table_sql = ('create table "BAIDU_SEARCH" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,"C"."REMARK_" varchar,'
        #              ' "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, '
        #              '"C"."TITLE_" varchar, "C"."URL_" varchar, "C"."ENTITY_NAME_" varchar,"C"."TYPE_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)
        # +100 head-room so documents inserted mid-run are still drained
        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                # Retry once after a short back-off; a second timeout propagates
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()
            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
                print(data["_id"])
            # todo remove and upsert data from mongo
            # shuffle data
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue
            if re_data:
                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # add {d:1} — mark the document as migrated in MongoDB
                try:
                    self.m_client.update_to_mongodb(collection=self.collection,
                                                    data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue
                if success_count > 0:
                    # fix: dropped the dead `status = True` flag (never read)
                    self.success_count += success_count
                    if self.success_count % 10 == 0:
                        self.logger.info("HBase 插入成功 {} 条".format(
                            self.success_count))
            else:
                # data_shuffle returned a falsy value: count as bad data
                self.bad_count += 1
                continue
        mongo_data_list.close()
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class Statistics(object):
    """Counts crawl progress per entity across the MongoDB databases
    spider_url_temp / spider_url_fixed / spider_data and appends one row
    per entity to a per-day CSV report under ./statistics/<date>/."""

    def __init__(self, entity_type=None):
        """
        Initialise report dates, script/statistics paths and counters.
        :param entity_type: entity type = sub-directory name under ./scripts;
                            when None, run() walks every type directory
        """
        t = arrow.now()
        self.local_time = t.format("YYYY-MM-DD")
        h_t = t.shift(days=-1)  # yesterday's date for the "crawled yesterday" window
        self.hesternal_time = h_t.format("YYYY-MM-DD")
        self.entity_type = entity_type
        self.__base_path = os.path.abspath(os.path.dirname(__file__))
        self.__dir_path = self.__base_path + "/scripts/{}".format(
            self.entity_type)
        self.file_path = self.__base_path + "/statistics/{}".format(
            self.local_time)
        self.__type_list = list()
        self.__file_list = list()
        if self.entity_type:
            self.get_entity_code()
        self.mongo_client = MongoClient()
        # "hesternal_spider_url_temp": 0, "hesternal_spider_url_fixed": 0,
        # CSV columns (keys double as the header row; insertion order matters)
        self.name_dict = {
            "实体编码": "",  # entity code
            "待爬数据": 0,  # pending URLs (spider_url_temp)
            "需爬取总量": 0,  # total URLs to crawl (spider_url_fixed)
            "现有数据": 0,  # data already crawled (spider_data)
            "昨日爬取数据": 0  # data crawled yesterday
        }

    def get_entity_code(self):
        """
        Collect script file names from the type directory; stripping the
        "CommonBidding_" prefix from each file name yields the ENTITY_CODE_.
        :return:
        """
        for root, dirs, files in os.walk(self.__dir_path):
            # print(root)   # current directory path
            # print(dirs)   # sub-directories of the current path
            # print(files)  # non-directory files of the current path
            self.__file_list = files
            # NOTE(review): literal has extra underscores — presumably meant
            # the package marker "__init__.py"; confirm against the scripts dir
            self.__file_list.remove("__init_____.py")
            break

    def save_to_csv(self, file_path):
        """
        save each count and save to csv
        Appends one data row; writes the header row first when the file is
        new, and creates the day directory on demand.
        :param file_path: full path of the per-type CSV file
        :return:
        """
        if os.path.exists(file_path):
            # Existing file: append the values row only
            with open(file_path, "a", newline="", errors="ignore") as f:
                writer = csv.writer(f)
                append_list = list()
                for key, value in self.name_dict.items():
                    append_list.append(value)
                writer.writerows([append_list])
        else:
            try:
                # New file: write header row, then the values row
                with open(file_path, "a", newline="", errors="ignore") as f:
                    writer = csv.writer(f)
                    check_list = list()
                    append_list = list()
                    for key, value in self.name_dict.items():
                        check_list.append(key)
                        append_list.append(value)
                    writer.writerows([check_list])
                    writer.writerows([append_list])
            except FileNotFoundError:
                # Day directory does not exist yet: create it and retry
                os.makedirs(self.file_path)
                with open(file_path, "a", newline="", errors="ignore") as f:
                    writer = csv.writer(f)
                    check_list = list()
                    append_list = list()
                    for key, value in self.name_dict.items():
                        check_list.append(key)
                        append_list.append(value)
                    writer.writerows([check_list])
                    writer.writerows([append_list])

    def count_from_database(self):
        """
        count data for database "spider_url_temp", "spider_url_fixed",
        "spider_data" where entity_code == entity_type
        Each MongoDB count is retried once after a 5 s pause on
        ServerSelectionTimeoutError; a second timeout propagates.
        :return:
        """
        # test_index = self.__file_list.index("CommonBidding_650500HMSSY.py")
        # self.__file_list = self.__file_list[test_index:]
        for entity_code in self.__file_list:
            entity_code = entity_code.replace("CommonBidding_", "")
            entity_code = entity_code.replace(".py", "")
            print(entity_code)
            self.name_dict["实体编码"] = entity_code
            # spider_url_temp
            db = self.mongo_client.client["spider_url_temp"]
            collection = db[entity_code]
            # Count all pending URLs for this entity
            try:
                mongo_data_list_temp = self.mongo_client.all_from_mongodb(
                    collection=collection)
                if mongo_data_list_temp:
                    self.name_dict["待爬数据"] = mongo_data_list_temp.count()
                else:
                    self.name_dict["待爬数据"] = 0
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(5)
                mongo_data_list_temp = self.mongo_client.all_from_mongodb(
                    collection=collection)
                if mongo_data_list_temp:
                    self.name_dict["待爬数据"] = mongo_data_list_temp.count()
                else:
                    self.name_dict["待爬数据"] = 0
            # Yesterday's pending-URL count (disabled)
            # try:
            #     temp_day_ago = collection.find({"$and": [{"DATETIME_": {"$gte": self.hesternal_time}},
            #                                              {"DATETIME_": {"$lte": self.local_time}}]})
            #     if temp_day_ago:
            #         self.name_dict["hesternal_spider_url_temp"] = temp_day_ago.count()
            #     else:
            #         self.name_dict["hesternal_spider_url_temp"] = 0
            # except pymongo.errors.ServerSelectionTimeoutError:
            #     time.sleep(5)
            #     temp_day_ago = collection.find({"$and": [{"DATETIME_": {"$gte": self.hesternal_time}},
            #                                              {"DATETIME_": {"$lte": self.local_time}}]})
            #     if temp_day_ago:
            #         self.name_dict["hesternal_spider_url_temp"] = temp_day_ago.count()
            #     else:
            #         self.name_dict["hesternal_spider_url_temp"] = 0
            # spider_url_fixed
            db = self.mongo_client.client["spider_url_fixed"]
            collection = db[entity_code]
            # Count the total URLs to crawl for this entity
            try:
                mongo_data_list_fixed = self.mongo_client.all_from_mongodb(
                    collection=collection)
                if mongo_data_list_fixed:
                    self.name_dict["需爬取总量"] = mongo_data_list_fixed.count()
                else:
                    self.name_dict["需爬取总量"] = 0
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(5)
                mongo_data_list_fixed = self.mongo_client.all_from_mongodb(
                    collection=collection)
                if mongo_data_list_fixed:
                    self.name_dict["需爬取总量"] = mongo_data_list_fixed.count()
                else:
                    self.name_dict["需爬取总量"] = 0
            # # Yesterday's fixed-URL count (disabled)
            # try:
            #     fixed_day_ago = collection.find({"$and": [{"DATETIME_": {"$gte": self.hesternal_time}},
            #                                               {"DATETIME_": {"$lte": self.local_time}}]})
            #     if fixed_day_ago:
            #         self.name_dict["hesternal_spider_url_fixed"] = fixed_day_ago.count()
            #     else:
            #         self.name_dict["hesternal_spider_url_fixed"] = 0
            # except pymongo.errors.ServerSelectionTimeoutError:
            #     time.sleep(5)
            #     fixed_day_ago = collection.find({"$and": [{"DATETIME_": {"$gte": self.hesternal_time}},
            #                                               {"DATETIME_": {"$lte": self.local_time}}]})
            #     if fixed_day_ago:
            #         self.name_dict["hesternal_spider_url_fixed"] = fixed_day_ago.count()
            #     else:
            #         self.name_dict["hesternal_spider_url_fixed"] = 0
            # spider_data
            db = self.mongo_client.client["spider_data"]
            collection = db[self.entity_type]
            self.mongo_client.mongo_entity_code = entity_code
            if len(self.__file_list) == 1:
                # Single-entity type: the whole collection belongs to it
                try:
                    mongo_data_list_data = self.mongo_client.all_from_mongodb(
                        collection=collection)
                    if mongo_data_list_data:
                        self.name_dict["现有数据"] = mongo_data_list_data.count()
                    else:
                        self.name_dict["现有数据"] = 0
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(5)
                    mongo_data_list_data = self.mongo_client.all_from_mongodb(
                        collection=collection)
                    if mongo_data_list_data:
                        self.name_dict["现有数据"] = mongo_data_list_data.count()
                    else:
                        self.name_dict["现有数据"] = 0
                # Yesterday's rows: DATETIME_ in [hesternal_time, local_time]
                try:
                    data_day_ago = collection.find({
                        "$and": [{
                            "DATETIME_": {
                                "$gte": self.hesternal_time
                            }
                        }, {
                            "DATETIME_": {
                                "$lte": self.local_time
                            }
                        }]
                    })
                    if data_day_ago:
                        self.name_dict["昨日爬取数据"] = data_day_ago.count()
                    else:
                        self.name_dict["昨日爬取数据"] = 0
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(5)
                    data_day_ago = collection.find({
                        "$and": [{
                            "DATETIME_": {
                                "$gte": self.hesternal_time
                            }
                        }, {
                            "DATETIME_": {
                                "$lte": self.local_time
                            }
                        }]
                    })
                    if data_day_ago:
                        self.name_dict["昨日爬取数据"] = data_day_ago.count()
                    else:
                        self.name_dict["昨日爬取数据"] = 0
            else:
                # Multi-entity type: count only this entity's documents
                try:
                    mongo_data_list_data = self.mongo_client.search_from_mongodb(
                        collection=collection)
                    if mongo_data_list_data:
                        self.name_dict["现有数据"] = mongo_data_list_data.count()
                    else:
                        self.name_dict["现有数据"] = 0
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(5)
                    mongo_data_list_data = self.mongo_client.search_from_mongodb(
                        collection=collection)
                    if mongo_data_list_data:
                        self.name_dict["现有数据"] = mongo_data_list_data.count()
                    else:
                        self.name_dict["现有数据"] = 0
                # Yesterday's rows for this entity only
                try:
                    data_day_ago = collection.find({
                        "$and": [{
                            "ENTITY_CODE_": entity_code
                        }, {
                            "DATETIME_": {
                                "$gte": self.hesternal_time
                            }
                        }, {
                            "DATETIME_": {
                                "$lte": self.local_time
                            }
                        }]
                    })
                    if data_day_ago:
                        self.name_dict["昨日爬取数据"] = data_day_ago.count()
                    else:
                        self.name_dict["昨日爬取数据"] = 0
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(5)
                    data_day_ago = collection.find({
                        "$and": [{
                            "ENTITY_CODE_": entity_code
                        }, {
                            "DATETIME_": {
                                "$gte": self.hesternal_time
                            }
                        }, {
                            "DATETIME_": {
                                "$lte": self.local_time
                            }
                        }]
                    })
                    if data_day_ago:
                        self.name_dict["昨日爬取数据"] = data_day_ago.count()
                    else:
                        self.name_dict["昨日爬取数据"] = 0
            # Append this entity's row to the per-type CSV
            file_path = self.file_path + "/count_for_{}.csv".format(
                self.entity_type)
            self.save_to_csv(file_path)
        # NOTE(review): closing here means a second count_from_database call
        # from run() relies on the client wrapper reconnecting — confirm
        self.mongo_client.client_close()

    def run(self):
        """Entry point: count a single type when entity_type was given,
        otherwise walk ./scripts and count every type directory found."""
        if self.entity_type:
            self.count_from_database()
        else:
            for root, dirs, files in os.walk(self.__base_path + "/scripts"):
                # print(root)   # current directory path
                # print(dirs)   # sub-directories of the current path
                # print(files)  # non-directory files of the current path
                self.__type_list = dirs
                self.__type_list.remove("__pycache__")
                break
            # Breakpoint for resuming a partial run
            # test_index = self.__type_list.index("NEWS_FINASSIST")
            # self.__type_list = self.__type_list[test_index:]
            for _type in self.__type_list:
                print(_type)
                self.entity_type = _type
                self.__dir_path = self.__base_path + "/scripts/{}".format(
                    _type)
                self.get_entity_code()
                self.count_from_database()
class WechatScript(object): def __init__(self, entity_type="WECHAT"): """ 初始化参数 :param entity_type: WECHAT """ self.entity_type = entity_type self.logger = Logger().logger # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name=self.entity_type) # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="WECHAT") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # # 创建 MongoDB spider_data_old 数据库对象 # self.old_client = MongoClient(mongo_collection="WECHAT") # # 本地测试 # self.old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60, # connectTimeoutMS=60, connect=False) # self.old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = self.old_client.client_to_mongodb() # self.collection_old = db_old["WECHAT"] # 创建 MySQL 对象 self.mysql_client = GenericScript(entity_code=None, entity_type=None) self.remove_id_list = list() self.copy_mongo_data_list = list() self.verify_list = [ "ID_", "ENTITY_CODE_", "URL_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_", "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_", "CONTENT_TYPE_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "TITLE_", "ENTITY_NAME_", "DEALTIME_", "DATETIME_", "STATUS_", "WECHAT_NAME_", "WECHAT_ID_" ] # BANK_NAME_ 字典 self.name_dict = { "ICBC": "中国工商银行", "ABC": "中国农业银行", "BOC": "中国银行", "CCB": "中国建设银行", "BOCOM": "交通银行", "PSBC": "中国邮政储蓄银行", "CZB": "浙商银行", "CBHB": "渤海银行", "ECITIC": "中信银行", "CEB": "中国光大银行", "HXB": "华夏银行", "CMBC": "中国民生银行", "CMB": "招商银行", "CIB": "兴业银行", "CGB": "广发银行", "PAB": "平安银行", "SPDB": "浦发银行", "EBCL": "恒丰银行" } self.headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", "Accept-Language": "zh-CN,zh;q=0.9", "Host": "weixin.sogou.com", "Referer": "http://weixin.sogou.com/" } self.url = 
"http://weixin.sogou.com/weixin?type=1&query={}&ie=utf8&s_from=input&_sug_=y&_sug_type_=" self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def check_name(self, wechat_id): check_dict = dict() with open("wechat_id_name.txt", "r", encoding="utf-8") as rf: read_data = rf.read() if read_data: read_data = read_data.replace("\'", "\"") read_data = read_data.replace(": None", ": \"None\"") # print(read_data) check_dict = json.loads(read_data) else: wechat_name = self.req_for_name(wechat_id) check_dict[wechat_id] = wechat_name with open("wechat_id_name.txt", "w", encoding="utf-8") as wf: wf.write(str(check_dict)) return check_dict[wechat_id] if wechat_id in check_dict: return check_dict[wechat_id] else: wechat_name = self.req_for_name(wechat_id) check_dict[wechat_id] = wechat_name with open("wechat_id_name.txt", "w", encoding="utf-8") as wf: wf.write(str(check_dict)) return check_dict[wechat_id] def req_for_name(self, wechat_id): url = self.url.format(wechat_id) # response = WanDou().http_client(url=url, param=self.headers) resp1 = requests.get( url= r"http://h.wandouip.com/get/ip-list?pack=853&num=1&xy=1&type=2&lb=\r\n&mr=1&" ) resp2 = resp1.json()["data"][0] # print(resp2) # resp1.close() time.sleep(2) try: response = requests.get( url=url, headers=self.headers, proxies={"http": "{}:{}".format(resp2["ip"], resp2["port"])}) except Exception as e: print(1, e) self.logger.info("error ip: {}".format(resp2)) time.sleep(5) return self.req_for_name(wechat_id) html = HTML(response.content.decode()) # response.close() name = html.xpath('//p[@class="tit"]/a/text()') if name: # print(name) self.error_count = 0 return name[0] else: self.error_count += 1 if self.error_count == 5: self.logger.info("wetchat id error: \"{}\"".format(wechat_id)) return "None" else: time.sleep(2) self.req_for_name(wechat_id) # if response is None: # self.logger.info("ip_prox error") # return 
self.req_for_name(wechat_id) # if isinstance(response, str): # html = HTML(response) # name = html.xpath('//p[@class="tit"]/a/text()') # if name: # print(name) # return name[0] # else: # self.logger.info("ip_prox error2") # return self.req_for_name(wechat_id) # else: # self.logger.info("ip_prox error change") # return self.req_for_name(wechat_id) def data_shuffle(self, data, province_list, city_list, area_list): """ 数据清洗 :param data: :param province_list: :param city_list: :param area_list: :return: re_data or None """ # BANK_CODE_正则匹配规则 pattern = re.compile( r'ICBC|ABC|BOCOM|CCB|BOC|PSBC|CZB|CBHB|ECITIC|CEB|HXB|CMBC|CMB|CIB|CGB|PAB|SPDB|EBCL' ) re_data = dict() if data["TITLE_"]: # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" 通用列族字段 re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] # re_data["URL_"] = "" prov_c = None prov_n = None city_c = None city_n = None area_c = None area_n = None bank_n = None bank_c = pattern.match(data["ENTITY_CODE_"]) if bank_c: re_data["BANK_CODE_"] = bank_c.group() else: return None # 正则去除银行名称,方便匹配地区编码 bank_n = re.sub( r"{}银?行?|微信|[总分支]行".format( self.name_dict[re_data["BANK_CODE_"]][:-2]), "", data["ENTITY_NAME_"]) re_data["BANK_NAME_"] = self.name_dict[re_data["BANK_CODE_"]] re_data["PERIOD_CODE_"] = data["PERIOD_CODE_"].replace("-", "") re_data["NOTICE_TIME_"] = data["PERIOD_CODE_"] re_data["STATUS_"] = "1" re_data["CONTENT_"] = data["CONTENT_"] re_data["REMARK_"] = "" for area in area_list: if area["NAME_"] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] if area_c: pass else: for prov in province_list: if prov["NAME_"] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n, "") break elif prov["NAME_"][:-1] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:-1], "") break elif prov["NAME_"][:4] in 
bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:4], "") break elif prov["NAME_"][:3] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:3], "") break elif prov["NAME_"][:2] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:2], "") break for city in city_list: if len(city["NAME_"]) == 1: continue if prov_c: if city["CODE_"][:2] == prov_c[:2]: if city["NAME_"] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n, "") break elif city["NAME_"][:-1] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:-1], "") break elif city["NAME_"][:4] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:4], "") break elif city["NAME_"][:3] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:3], "") break elif city["NAME_"][:2] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:2], "") break else: if city["NAME_"] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n, "") break elif city["NAME_"][:-1] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:-1], "") break elif city["NAME_"][:4] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:4], "") break elif city["NAME_"][:3] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:3], "") break elif city["NAME_"][:2] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:2], "") break for area in area_list: if city_c: if area["CODE_"][:2] == city_c[:2]: if area["NAME_"] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:-1] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:4] in bank_n: area_c = 
area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:3] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:2] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif prov_c: if area["CODE_"][:2] == prov_c[:2]: if area["NAME_"] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:-1] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:4] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:3] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:2] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break else: if area["NAME_"][:-1] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:4] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:3] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:2] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break # 特殊情况 星子县现为庐山市 喻家山位于武汉洪山区 if "星子县" in data["ENTITY_NAME_"]: area_c = "360483" area_n = "庐山市" elif "喻家山" in data["ENTITY_NAME_"]: area_c = "420111" area_n = "洪山区" elif "江南西" in data["ENTITY_NAME_"]: area_c = "440105" area_n = "海珠区" elif "两路口" in data["ENTITY_NAME_"]: area_c = "500103" area_n = "渝中区" elif "大兴安岭" in data["ENTITY_NAME_"]: area_c = "232700" area_n = "大兴安岭地区" elif "张家港" in data["ENTITY_NAME_"]: area_c = "320582" area_n = "张家港市" elif "兴业银行新阳支行" in data["ENTITY_NAME_"]: area_c = "230102" area_n = "道里区" if area_c: pass elif (not area_c) and city_c: area_c = city_c area_n = city_n elif (not area_c) and (not city_c) and prov_c: area_c = prov_c area_n = prov_n # 总行地区处理 elif (not area_c) and (not city_c) and (not prov_c): if re_data["BANK_CODE_"] == "ICBC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "ABC": area_c = "110101" area_n = "东城区" elif re_data["BANK_CODE_"] == "BOCOM": area_c = "310115" area_n = "浦东新区" elif 
re_data["BANK_CODE_"] == "CCB": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "BOC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "PSBC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "CZB": area_c = "330103" area_n = "下城区" elif re_data["BANK_CODE_"] == "CBHB": area_c = "120103" area_n = "河西区" elif re_data["BANK_CODE_"] == "ECITIC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "CEB": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "HXB": area_c = "110101" area_n = "东城区" elif re_data["BANK_CODE_"] == "CMBC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "CMB": area_c = "440304" area_n = "福田区" elif re_data["BANK_CODE_"] == "CIB": area_c = "350102" area_n = "鼓楼区" elif re_data["BANK_CODE_"] == "CGB": area_c = "440104" area_n = "越秀区" elif re_data["BANK_CODE_"] == "PAB": area_c = "440303" area_n = "罗湖区" elif re_data["BANK_CODE_"] == "SPDB": area_c = "310101" area_n = "黄浦区" elif re_data["BANK_CODE_"] == "EBCL": area_c = "370602" area_n = "芝罘区" re_data["AREA_CODE_"] = area_c if area_c: re_data["UNIT_CODE_"] = re_data[ "BANK_CODE_"] + "_" + area_c[:4] + "00" if ("b" in data["BANK_NAME_"]) or ("B" in data["BANK_NAME_"]): return None if "DATETIME_" not in data: time_array = time.localtime(int(float(data["DEALTIME_"]))) value_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) re_data["CREATE_TIME_"] = value_time else: re_data["CREATE_TIME_"] = data["DATETIME_"] # data["UPDATE_TIME_"] = "" re_data["TITLE_"] = data["TITLE_"] re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"] re_data["WECHAT_ID_"] = data["WECHAT_"].replace(" ", "") re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["DEALTIME_"] = str(data["DEALTIME_"]) # print(area_c, area_n, data["ENTITY_NAME_"]) return re_data else: return None def delete_data_from_mongo(self): """ 从 MongoDB 删除数据 :return: delete_count """ try: remove_count = self.m_client.remove_from_mongo( collection=self.collection, 
remove_id_list=self.remove_id_list) return remove_count except pymongo.errors.ServerSelectionTimeoutError: mongo_data_list = self.m_client.remove_from_mongo( collection=self.collection, remove_id_list=self.remove_id_list) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None def upsert_and_delete(self, mongo_data_list, province_list, city_list, area_list): """ 插入和删除 :param mongo_data_list: :param province_list: :param city_list: :param area_list: :return: """ for i in range(1000000): status = False self.data_id = "" success_count = 0 try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) # print(data["_id"]) # self.remove_id_list.append(self.data_id) # del data["_id"] # copy_data = deepcopy(data) # self.copy_mongo_data_list.append(copy_data) # 清洗数据 try: re_data = self.data_shuffle(data=data, province_list=province_list, city_list=city_list, area_list=area_list) except Exception as e: # self.remove_id_list.remove(self.data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # 获取公众号名称 # try: # print(re_data["WECHAT_ID_"]) re_data["WECHAT_NAME_"] = self.check_name( re_data["WECHAT_ID_"]) # re_data["WECHAT_NAME_"] = self.req_for_name(re_data["WECHAT_ID_"]) # print(re_data["WECHAT_ID_"]) # print(re_data["WECHAT_NAME_"]) # except Exception as e: # 向 HBase 插入数据 try: count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) success_count += count except jaydebeapi.DatabaseError as e: # self.logger.info("error: {}".format(e)) # self.remove_id_list.remove(self.data_id) # 
self.copy_mongo_data_list.remove(copy_data) self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # # Phoenix 连接关闭 # p_client.close_client_phoenix(connection=connection) # time.sleep(10) # # 连接 Phoenix # connection = p_client.connect_to_phoenix() # # 向 HBase 插入数据 # count = p_client.upsert_to_phoenix_by_one(connection=connection, data=re_data) # success_count += count # try: # # 添加 {d:1} # update_count = self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, # data_dict={"d": 1}) # self.remove_count += update_count # # self.logger.info("MongoDB 更新成功") # if self.remove_count % 10 == 0: # self.logger.info("MongoDB 更新成功, 成功条数 {} 条".format("10")) # except Exception as e: # # self.remove_id_list.remove(data_id) # # self.copy_mongo_data_list.remove(copy_data) # self.logger.warning("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e)) # continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 # self.remove_id_list.remove(self.data_id) # self.copy_mongo_data_list.remove(copy_data) continue # # 删除数据 # if status: # delete_count = self.delete_data_from_mongo() # self.remove_count += delete_count # self.logger.info("MongoDB 删除成功") # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # # 将数据插入 spider_data_old 中 # if status: # try: # self.old_client.mongo_db = "spider_data_old" # insert_count = self.old_client.all_to_mongodb(collection=self.collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except pymongo.errors.ServerSelectionTimeoutError as e: # time.sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = self.old_client.all_to_mongodb(collection=self.collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # 
self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except Exception as e: # self.logger.info(e) def main(self): """ :return: """ # # 删除表 # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # 建表语句 # table_sql = ('create table "WECHAT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar,' # '"C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar,' # '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, ' # '"T"."CONTENT_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."TITLE_" varchar,' # '"C"."WECHAT_ID_" varchar, "C"."WECHAT_NAME_" varchar, "C"."ENTITY_NAME_" varchar,' # '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar, "C"."PRAISES_" varchar,' # '"C"."READ_NUM_" varchar, "C"."REPLIES_" varchar, "C"."RELAYS_" varchar,' # '"C"."NOTICE_TIME_" varchar, "C"."IMPROTANCE_" varchar) IMMUTABLE_ROWS = true') # # # 创建表 # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) # f_id = "5c1267258d7fee59f7d089f8" # gte 10M # f_id = "5c1271a28d7fee66df0fdd83" # gte 10M # f_id = "5c127e7b9bb3df7412b53b04" # gte 10M # f_id = "5c1330d28d7fee4d9c87d6e1" # gte 10M # f_id = "5c1330ed9bb3df2de33bb746" # gte 10M # f_id = "5c13490a8d7fee79f1d9e87f" # gte 10M # f_id = "5c1350ee8d7fee2d29b601ef" # gte 10M # f_id = "5c1351c79bb3df0e23ee68c1" # gte 10M # f_id = "5c13547d9bb3df06d41997d5" # gte 10M # f_id = "5c1354849bb3df202508ee3e" # gte 10M # f_id = "5c1354bd8d7fee44b881b11a" # gte 10M # f_id = "5c1354e89bb3df1b2a6ef59c" # gte 10M # f_id = "5c1355139bb3df197beb11c0" # gte 10M # f_id = "5c1355328d7fee2f0997a3ac" # gte 10M # f_id = "5c13558e8d7fee50ea04bd0a" # gte 10M # f_id = "5c135a5f8d7fee5bf7db91b8" # gte 10M # f_id = "5c135b0c8d7fee697fa5bd80" # gte 10M # f_id = "5c135bd59bb3df4d7aa66cad" # gte 10M # f_id = "5c135bdb9bb3df454c0157a3" # gte 10M # f_id = "5c135bfc8d7fee73c8f84567" # gte 10M # f_id = 
"5c135c119bb3df48aeb8fe63" # gte 10M # f_id = "5c135dfe9bb3df4d7aa66cc2" # gte 10M # f_id = "5c13602d8d7fee7f7a48c485" # gte 10M # f_id = "5c1361858d7fee223825f805" # gte 10M # f_id = "5c1361d68d7fee561806fc4d" # gte 10M # f_id = "5c1362068d7fee223825f808" # gte 10M # f_id = "5c1362159bb3df26bba60a05" # gte 10M # f_id = "5c1366248d7fee6741adb5be" # gte 10M # f_id = "5c1366418d7fee673f6c95cb" # gte 10M # f_id = "5c1367099bb3df5a0e013c4d" # gte 10M # f_id = "5c13686d8d7fee76ac78735b" # gte 10M # f_id = "5c1368788d7fee6fcb24daa3" # gte 10M # f_id = "5c1369438d7fee63412b04ff" # gte 10M # f_id = "5c13697b9bb3df60429b5d31" # gte 10M # f_id = "5c1389468d7fee6a94c413c3" # gte 10M # f_id = "5c1389c29bb3df75adc8861a" # gte 10M # f_id = "5c138b039bb3df75adc88620" # gte 10M # f_id = "5c138e3d9bb3df074c4ec0b3" # gte 10M # f_id = "5c138e4d8d7fee06a4f8fd59" # gte 10M # f_id = "5c1391318d7fee168749a96e" # gte 10M # f_id = "5c25a4f19bb3df51eba386b8" # gte 10M # f_id = "5c2601ef9bb3df7d42fe2084" # gte 10M # f_id = "5c2608099bb3df24f5db4527" # gte 10M # f_id = "5c2608be9bb3df2d58d08e32" # gte 10M # f_id = "5c260d2b9bb3df3c084d2a83" # gte 10M # f_id = "5c2615868d7fee2771bb3914" # gte 10M # f_id = "5c261d528d7fee3c1383db85" # gte 10M # f_id = "5c26340e8d7fee66d784fe8a" # gte 10M # f_id = "5c263b818d7fee630f0d3ac4" # gte 10M # f_id = "5c263ee28d7fee04ddc62e31" # gte 10M # f_id = "5c263f269bb3df0d29d1e1e5" # gte 10M # f_id = "5c2766718d7fee2aa36fa166" # gte 10M # f_id = "5c2b79ef8d7fee3025e02575" # gte 10M # f_id = "5c2b854a9bb3df27dc669d5a" # gte 10M # f_id = "5c2e00078d7fee1b60443cf3" # gte 10M # f_id = "5c2f69028d7fee62d31a72db" # gte 10M # f_id = "5c36a7948d7fee18d9333327" # gte 10M # f_id = "5c36b9ff9bb3df332dfebe39" # gte 10M # f_id = "5c3754579bb3df02b680150b" # gte 10M # f_id = "5c375c969bb3df6afd18e22d" # gte 10M # f_id = "5c38a1e59bb3df6b2ff2f269" # gte 10M # f_id = "5c394e058d7fee6a2582d1d3" # gte 10M # f_id = "5c3c983e9bb3df21ddf94a92" # gte 10M # f_id = 
"5c3ca38a9bb3df60bca07833" # gte 10M f_id = "5c3c983e9bb3df21ddf94a92" # f_id = "" self.data_id = f_id province_list, city_list, area_list, dir_area_list = self.mysql_client.area_from_mysql( ) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection, data_id=self.data_id) self.find_count += mongo_data_list.count() try: self.upsert_and_delete(mongo_data_list=mongo_data_list, province_list=province_list, city_list=city_list, area_list=area_list) except jaydebeapi.DatabaseError: self.logger.info("error id is: {}".format(self.data_id)) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection, data_id=self.data_id) self.upsert_and_delete(mongo_data_list=mongo_data_list, province_list=province_list, city_list=city_list, area_list=area_list) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class Meipian(object):
    """Migrates "meipian_CCBDATA" documents from MongoDB into the Phoenix
    table "MEIPIAN_CCBDATA", flagging migrated documents with {d: 1}."""

    def __init__(self):
        # MongoDB source collection.
        self.m_client = MongoClient(mongo_collection="meipian_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Phoenix/HBase sink.
        self.p_client = PhoenixHbase(table_name="MEIPIAN_CCBDATA")
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Run statistics reported at the end of run().
        self.find_count = 0     # documents fetched from MongoDB
        self.success_count = 0  # rows upserted into HBase
        self.remove_count = 0   # documents flagged {d: 1}
        self.old_count = 0      # kept for parity with sibling classes (unused)
        self.bad_count = 0      # documents that produced no HBase row
        self.error_count = 0    # kept for parity with sibling classes (unused)
        self.data_id = ""       # _id of the document currently processed

    def data_shuffle(self, data):
        """Map one MongoDB document onto the HBase column layout.

        Returns the reshaped dict, or None when the document has no title —
        rows are keyed on ENTITY_CODE_ + md5(TITLE_), so an empty title
        cannot produce a row key.
        """
        re_data = dict()
        if not data["TITLE_"]:
            return None
        # Row key: <ENTITY_CODE_>_<md5 of the title>.
        hash_m = hashlib.md5()
        hash_m.update(data["TITLE_"].encode("utf-8"))
        re_data["ID_"] = str(data["ENTITY_CODE_"]) + "_" + str(
            hash_m.hexdigest())
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        # CREATE_TIME is the source publish timestamp; PERIOD_CODE_ is its
        # YYYYMMDD bucket.  (``arrow`` is imported at file level.)
        time_array = arrow.get(data["CREATE_TIME"])
        re_data["PERIOD_CODE_"] = str(time_array.format("YYYYMMDD"))
        re_data["PUBLISH_TIME_"] = str(time_array.format("YYYY-MM-DD HH:mm:ss"))
        re_data["STATUS_"] = "UNPROCESSED"
        re_data["CONTENT_"] = data["CONTENT_"]
        re_data["REMARK_"] = ""
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        re_data["URL_"] = data["URL_"]
        re_data["TITLE_"] = data["TITLE_"]
        re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["DEALTIME_"] = str(data["DEALTIME_"])
        re_data["VISIT_COUNT_"] = data["VISIT_COUNT"]
        re_data["PRAISE_COUNT_"] = data["PRAISE_COUNT"]
        re_data["COMMENT_COUNT_"] = data["COMMENT_COUNT"]
        re_data["SOURCE_"] = data["SOURCE_"]
        return re_data

    def run(self):
        """Stream documents after the hard-coded _id into HBase, flagging each
        migrated document with {d: 1} in MongoDB, then report statistics."""
        f_id = "5c6fe11b9bb3df6b0ec6168b"  # gt 10M
        mongo_data_list = self.m_client.all_from_mongodb(self.collection,
                                                         data_id=f_id)
        # Fix: find_count was logged in the summary below but never set.
        self.find_count = mongo_data_list.count()
        # "+ 100" leaves headroom for documents inserted while iterating;
        # StopIteration is the real terminator.
        for i in range(self.find_count + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()
            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            print(data["_id"])
            # todo remove and upsert data from mongo
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue
            if not re_data:
                # Document has no title; nothing to insert.
                continue
            # Upsert the reshaped row into HBase.
            try:
                success_count = self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=re_data)
            except jaydebeapi.DatabaseError as e:
                self.logger.info("错误 id: {}, 错误信息 {}".format(
                    self.data_id, e))
                continue
            # Flag the source document as migrated: {d: 1}.
            try:
                self.m_client.update_to_mongodb(collection=self.collection,
                                                data_id=self.data_id,
                                                data_dict={"d": 1})
                self.remove_count += 1
                if self.remove_count % 10 == 0:
                    self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                        self.remove_count))
            except Exception as e:
                self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                    self.data_id, e))
                continue
            if success_count > 0:
                self.success_count += success_count
                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
            else:
                self.bad_count += 1
                continue
        mongo_data_list.close()
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
def run(self): # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="WEIBOBASICINFO") p_client.verify_list = self.verify_list # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="WEIBOBASICINFO") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # # 创建 MongoDB spider_data_old 数据库对象 # old_client = MongoClient(mongo_collection="WEIBOBASICINFO") # # 本地测试 # old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60, # connectTimeoutMS=60, connect=False) # old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = old_client.client_to_mongodb() # collection_old = db_old["ORGANIZE_FINASSIST"] # 获取地区编码 province_list, city_list, area_list, dir_area_list = (GenericScript( entity_code=None, entity_type=None).area_from_mysql()) # 删除表 p_client.drop_table_phoenix(connection=connection) # quit() # 创建表 sql = ( 'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,' '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,' '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,' '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,' '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,' '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,' '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,' '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true') p_client.create_new_table_phoenix(connection=connection, sql=sql) # 增加列 # p_client.add_column_phoenix(connection=connection, column="IMAGE_") # 遍历 ENTITY_CODE_ 列表 status = False self.logger.info("开始进行 WEIBOBASICINFO") try: mongo_data_list = m_client.all_from_mongodb(collection=collection) except 
pymongo.errors.ServerSelectionTimeoutError: time.sleep(1) mongo_data_list = m_client.all_from_mongodb(collection=collection) # 清洗数据并插入 HBase if mongo_data_list: self.find_count = mongo_data_list.count() for data in mongo_data_list: re_data = "" data_id = data["_id"] copy_data = {} self.remove_id_list.append(data_id) try: del data["_id"] copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) re_data = self.data_shuffle(data=data, province_list=province_list) if not re_data: self.bad_count += 1 continue except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) # phoenix_HBase 插入数据 try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=re_data) self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if self.success_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format( self.success_count)) else: quit() # # 删除数据 # if status: # delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection, # remove_id_list=self.remove_id_list) # self.remove_count += delete_count # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # # 将数据插入 spider_data_old 中 # if status: # try: # old_client.mongo_db = "spider_data_old" # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # except pymongo.errors.ServerSelectionTimeoutError as e: # time.sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 
{}".format(insert_count)) # except Exception as e: # self.logger.info(e) # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class JsInsuranceCcbData(object):
    """Migrates "JSINSURANCE_CCBDATA" documents from MongoDB into the Phoenix
    table "INSURANCE", reshaping per-bank record layouts on the way."""

    def __init__(self):
        # MongoDB source collection.
        self.m_client = MongoClient(mongo_collection="JSINSURANCE_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Load the insurance TYPE dictionary from MySQL once; presumably a
        # list of rows with ITEM_LABEL_/ITEM_VALUE_ keys (that is how it is
        # consumed below) — confirm against MysqlClient.search_area_code.
        __mysql_config = {
            "host": MYSQL_HOST_25,
            "port": MYSQL_PORT_25,
            "database": MYSQL_DATABASE_25,
            "user": MYSQL_USER_25,
            "password": MYSQL_PASSWORD_25,
            "table": MYSQL_TABLE_25
        }
        __mysql_client = MysqlClient(**__mysql_config)
        __mysql_connection = __mysql_client.client_to_mysql()
        self.type = __mysql_client.search_area_code(
            sql="select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item "
                "where DICT_CODE_='TYPE'",
            connection=__mysql_connection)
        __mysql_client.close_client(connection=__mysql_connection)
        # Phoenix/HBase sink.
        self.p_client = PhoenixHbase(table_name="INSURANCE")
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Run statistics reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""
        self.a = list()

    def _match_type(self, text):
        """Match *text* against the TYPE dictionary.

        Returns (labels, codes), each a "|"-joined string of every dictionary
        entry whose label (minus its last character, usually "险") occurs in
        *text*; empty strings when nothing matches.
        """
        labels = ""
        codes = ""
        for item in self.type:
            if item["ITEM_LABEL_"][:-1] in text:
                labels += item["ITEM_LABEL_"] + "|"
                codes += item["ITEM_VALUE_"] + "|"
        return labels[:-1], codes[:-1]

    def _row_key(self, data, name):
        """Row key: <ENTITY_CODE_>_<md5 of the product name>."""
        hash_m = hashlib.md5()
        hash_m.update(name.encode("utf-8"))
        return str(data["ENTITY_CODE_"]) + "_" + str(hash_m.hexdigest())

    def data_shuffle(self, data):
        """Reshape one MongoDB document onto the INSURANCE column layout.

        Returns a dict (single product), a list of dicts (banks whose pages
        carry several products per document), or None (unsupported source or
        no recognisable product name).
        """
        if data["ENTITY_CODE_"] == "PAINSURANCE":
            # Ping An pages are not migrated.
            return None
        elif data["ENTITY_CODE_"] == "BJBINSURANCE":
            # Bank of Beijing: CONTET_ is a "|"-separated flat list of
            # company headers followed by fixed-width product slots.
            data["CONTET_"] = data["CONTET_"].replace("|主险2:", "主险2:")
            first_shuffle = data["CONTET_"].split("|")
            data_list = list()
            company_dict = dict()
            index_list = list()
            # Locate the company headers (segments ending in "公司").
            for first in first_shuffle:
                if first[-2:] == "公司":
                    company_index = first_shuffle.index(first)
                    company_dict[company_index] = first
                    index_list.append(company_index)
                else:
                    continue
            # Walk the 6-slot product records between consecutive company
            # headers.  NOTE(review): .index() keeps the original duplicate-
            # header behaviour (first occurrence wins) — do not enumerate.
            for key in index_list:
                j = key + 1
                for _ in range(100):
                    if index_list.index(key) == len(index_list) - 1:
                        if j == len(first_shuffle) - 1:
                            break
                    else:
                        if j == index_list[index_list.index(key) + 1]:
                            break
                    data_dict = dict()
                    data_dict["ID_"] = self._row_key(data, first_shuffle[j])
                    data_dict["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                    data_dict["ENTITY_NAME_"] = data["ENTITY_NAME_"].replace(
                        "模板", "产品")
                    data_dict["BANK_CODE_"] = "BJB"
                    data_dict["BANK_NAME_"] = "北京银行"
                    data_dict["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                        "-", "")
                    data_dict["URL_"] = data["URL_"]
                    data_dict["PRODUCT_NAME_"] = first_shuffle[j]
                    j += 1
                    # Slot: free-text product type.
                    labels, codes = self._match_type(first_shuffle[j])
                    data_dict["TYPE_"] = labels
                    data_dict["TYPE_CODE_"] = codes
                    j += 1  # slot: risk level (not stored)
                    j += 1
                    data_dict["PAY_METHOD_"] = first_shuffle[j]
                    j += 1  # slot: insurance period (not stored)
                    j += 1  # slot: investor type (not stored)
                    j += 1  # j now points at the next product name
                    data_dict["COM_NAME_"] = company_dict[key]
                    data_dict["DEALTIME_"] = data["DEALTIME_"]
                    data_dict["CREATE_TIME_"] = data["DATETIME_"]
                    data_dict["STATUS_"] = "1"
                    data_list.append(data_dict)
            return data_list
        elif data["ENTITY_CODE_"] == "CIBINSURANCE":
            # CIB: several "...计划" product names packed into one field.
            data_list = list()
            insurance_name = re.findall(r".*?计划", data["PRODUCT_NAME_"])
            for name in insurance_name:
                re_data = dict()
                re_data["ID_"] = self._row_key(data, name)
                re_data["PRODUCT_NAME_"] = name
                re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                re_data["BANK_CODE_"] = "CIB"
                re_data["BANK_NAME_"] = "兴业银行"
                re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                    "-", "")
                re_data["URL_"] = data["URL_"]
                re_data["DEALTIME_"] = data["DEALTIME_"]
                re_data["CREATE_TIME_"] = data["DATETIME_"]
                re_data["STATUS_"] = "1"
                labels, codes = self._match_type(name)
                re_data["TYPE_"] = labels
                re_data["TYPE_CODE_"] = codes
                data_list.append(re_data)
            return data_list
        else:
            if "INSURANCE_NAME_" not in data and ("PRODUCT_NAME_"
                                                  not in data):
                return None
            elif "INSURANCE_NAME_" in data:
                re_data = dict()
                re_data["ID_"] = self._row_key(data, data["INSURANCE_NAME_"])
                re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace(
                    "INSURANCE", "")
                re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace(
                    "保险产品", "")
                re_data["PRODUCT_NAME_"] = data["INSURANCE_NAME_"]
                if ("INSURANCE_AGE_" in data) or ("AGE_" in data):
                    # Fix: previously always read INSURANCE_AGE_ and raised
                    # KeyError when only AGE_ was present.
                    re_data["AGE_"] = data.get("INSURANCE_AGE_",
                                               data.get("AGE_"))
                if "TYPE_" in data:
                    if data["TYPE_"] == "财险":
                        re_data["TYPE_"] = "财产险"
                        re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE"
                    else:
                        labels, codes = self._match_type(data["TYPE_"])
                        re_data["TYPE_"] = labels
                        re_data["TYPE_CODE_"] = codes
                else:
                    # No explicit type: infer it from the entity name.
                    labels, codes = self._match_type(data["ENTITY_NAME_"])
                    re_data["TYPE_"] = labels
                    re_data["TYPE_CODE_"] = codes
                if "INSURANCE_DETAIL_" in data:
                    re_data["PRODUCT_DETAIL_"] = data["INSURANCE_DETAIL_"]
                if "COMPANY_NAME_" in data:
                    re_data["COM_NAME_"] = data["COMPANY_NAME_"]
                if "LIMIT_NUMBER_" in data:
                    re_data["BUY_LIMIT_"] = data["LIMIT_NUMBER_"]
                re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                    "-", "")
                if "CONTENT_" in data:
                    re_data["CONTENT_"] = data["CONTENT_"]
                re_data["STATUS_"] = "1"
                re_data["CREATE_TIME_"] = data["DATETIME_"]
                re_data["URL_"] = data["URL_"]
                re_data["DEALTIME_"] = data["DEALTIME_"]
                return re_data
            elif "PRODUCT_NAME_" in data:
                re_data = dict()
                re_data["ID_"] = self._row_key(data, data["PRODUCT_NAME_"])
                re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
                re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
                re_data["PRODUCT_NAME_"] = data["PRODUCT_NAME_"]
                if "TYPE_" in data:
                    if data["TYPE_"] == "财险":
                        re_data["TYPE_"] = "财产险"
                        re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE"
                    elif data["TYPE_"] == "100种疾病保障":
                        re_data["TYPE_"] = "健康险"
                        re_data["TYPE_CODE_"] = "HEALTH_INSURANCE"
                    else:
                        labels, codes = self._match_type(data["TYPE_"])
                        re_data["TYPE_"] = labels
                        re_data["TYPE_CODE_"] = codes
                # Copy straight-through optional fields verbatim.
                for key in ("FEATURE_NAME_", "POLICY_DUTY_", "PRODUCT_CASE_",
                            "BUY_LIMIT_", "ENSURE_PRICE_", "PRODUCT_PRICE_",
                            "PRODUCT_ID_", "PRODUCT_CLAUSE_", "GENDER_",
                            "AGE_", "COM_NAME_", "PAY_METHOD_", "PROBLEM_",
                            "CLAIM_", "COMMENT_", "ENSURE_CONTENT_",
                            "INSURE_INFO_", "RATE_INFO_", "SALE_SERVICE_"):
                    if key in data:
                        re_data[key] = data[key]
                re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                    "-", "")
                re_data["STATUS_"] = "1"
                re_data["CREATE_TIME_"] = data["DATETIME_"]
                re_data["URL_"] = data["URL_"]
                re_data["DEALTIME_"] = data["DEALTIME_"]
                return re_data

    def run(self):
        """Stream every document from MongoDB into the Phoenix "INSURANCE"
        table and report statistics.

        Fix: ``success_count`` could be unbound (NameError) when
        data_shuffle returned an empty list or every list-item upsert
        failed; list results also kept only the last item's count instead
        of the total.  It is now reset per document and accumulated.
        """
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)
        # "+ 100" leaves headroom; StopIteration terminates the loop.
        for _ in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()
            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            # todo remove and upsert data from mongo
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue
            if not re_data:
                continue
            success_count = 0
            if isinstance(re_data, dict):
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
            elif isinstance(re_data, list):
                for r_d in re_data:
                    try:
                        success_count += (
                            self.p_client.upsert_to_phoenix_by_one(
                                connection=self.connection, data=r_d))
                    except jaydebeapi.DatabaseError as e:
                        self.logger.info("错误 id: {}, 错误信息 {}".format(
                            self.data_id, e))
                        continue  # skip this row, keep the rest of the batch
            if success_count > 0:
                self.success_count += success_count
                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
            else:
                self.bad_count += 1
                continue
        mongo_data_list.close()
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class Entrust(object):
    """Migrate trust-product documents from the MongoDB collection
    JSENTRUST_CCBDATA into the Phoenix/HBase table ENTRUST.

    data_shuffle() applies a per-source field mapping keyed on ENTITY_CODE_
    (CHINATRC / TRUSTHEXUN / YANGLEE / TRUSTONE); run() drains the Mongo
    cursor and upserts each mapped row into Phoenix.
    """

    def __init__(self):
        # Create the MongoDB client.
        self.m_client = MongoClient(mongo_collection="JSENTRUST_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Create the Phoenix client.
        self.p_client = PhoenixHbase(table_name="ENTRUST")
        # Connect to Phoenix.
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Run counters; only success_count and bad_count are actually
        # incremented in this class — the rest always stay 0.
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        """Map one raw Mongo document to an ENTRUST row dict.

        The HBase row key is ENTITY_CODE_ + "_" + md5(NAME_); common fields
        are copied first, then source-specific fields depending on
        ENTITY_CODE_.  Raises KeyError (caught by the caller) if an expected
        field is missing.
        """
        re_data = dict()
        # HBase row_key: entity code plus md5 of the product name.
        hash_m = hashlib.md5()
        hash_m.update(data["NAME_"].encode("utf-8"))
        hash_title = hash_m.hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)
        re_data["ID_"] = row_key
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        re_data["STATUS_"] = "1"
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["URL_"] = data["URL_"]
        if data["ENTITY_CODE_"] == "CHINATRC":
            # "C"
            # re_data["AREA_CODE_"]
            # re_data["BANK_CODE_"]
            # re_data["BANK_NAME_"]
            # re_data["UNIT_CODE_"]
            # NOTE(review): eval() on scraped text is unsafe — prefer
            # json.loads / ast.literal_eval.  Assumes PUB_DATE_ is a
            # dict-literal string with an epoch-milliseconds "time" key
            # (hence the [:-3] to drop milliseconds) — TODO confirm.
            pub_date = eval(data["PUB_DATE_"])
            date = str(pub_date["time"])[:-3]
            t = arrow.get(int(date))
            publish_date = str(t)[:10]  # keep the "YYYY-MM-DD" prefix
            period_code = publish_date.replace("-", "")
            re_data["PERIOD_CODE_"] = period_code
            # re_data["REMARK_"]
            # re_data["UPDATE_TIME_"]
            re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["FUNCTION_"] = data["FUNCTION_"]
            # Same epoch-ms unpacking for the product start date.
            pro_date = eval(data["PRO_START_"])
            pro_date = str(pro_date["time"])[:-3]
            p_t = arrow.get(int(pro_date))
            product_date = str(p_t)[:10]
            re_data["PRO_START_"] = product_date
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            re_data["RUN_MODE_"] = data["RUN_MODE_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = publish_date
            # re_data["SCALE_"] = data[""]
            # re_data["MONTH_"]
            # re_data["YIELD_RATE_"]
            # re_data["START_FUNDS_"]
            # re_data["PURPOSE_"]
            # re_data["ESTAB_ANNOUNCEMENT_"]
            # re_data["ENTRUST_STATUS_"]
            # # re_data["DISTRIBU_MODE_"]
            # re_data["INVEST_AREA_"]
            # re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            # re_data["INVEST_DIRECTION_"]
            # re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            # re_data["CURRENCY_"]
            # re_data["MANAGE_TYPE_"]
            # re_data["SALE_TARGET_"]
            # re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            # re_data["ISSUER_AREA_"]
            # re_data["RESERVE_INFO_"]
            # re_data["TRUSTEESHIP_BANK_"]
            # re_data["OTHER_INFO_"]
            # re_data["OTHER_INFO_"]
        elif data["ENTITY_CODE_"] == "TRUSTHEXUN":
            # "C"
            # re_data["AREA_CODE_"]
            # re_data["BANK_CODE_"]
            # re_data["BANK_NAME_"]
            # re_data["UNIT_CODE_"]
            # PUB_DATE_ is already "YYYY-MM-DD" for this source.
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            # re_data["REMARK_"]
            # re_data["UPDATE_TIME_"]
            # re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            # re_data["FUNCTION_"] = data["FUNCTION_"]
            # re_data["PRO_START_"]
            # Strip the literal "至月" suffix this source appends.
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace(
                "至月", "")
            # re_data["RUN_MODE_"] = data["RUN_MODE_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["SCALE_"] = data["SCALE_"]
            # re_data["MONTH_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            # re_data["PURPOSE_"]
            # re_data["ESTAB_ANNOUNCEMENT_"]
            # re_data["ENTRUST_STATUS_"]
            # # re_data["DISTRIBU_MODE_"]
            # re_data["INVEST_AREA_"]
            # re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            # re_data["INVEST_DIRECTION_"]
            re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            re_data["CURRENCY_"] = data["CURRENCY_"]
            re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"]
            re_data["SALE_TARGET_"] = data["SALE_TARGET_"]
            re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"]
            # re_data["TRUSTEESHIP_BANK_"]
            re_data["OTHER_INFO_"] = data["OTHER_INFO_"]
        elif data["ENTITY_CODE_"] == "YANGLEE":
            # "C"
            # re_data["AREA_CODE_"]
            # re_data["BANK_CODE_"]
            # re_data["BANK_NAME_"]
            # re_data["UNIT_CODE_"]
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            # # re_data["REMARK_"]
            # # re_data["UPDATE_TIME_"]
            # re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            # re_data["FUNCTION_"] = data["FUNCTION_"]
            # # re_data["PRO_START_"]
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            # re_data["RUN_MODE_"] = data["RUN_MODE_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            # re_data["SCALE_"] = data["SCALE_"]
            # # re_data["MONTH_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            # # re_data["PURPOSE_"]
            # # re_data["ESTAB_ANNOUNCEMENT_"]
            # Source field STATUS_ maps to the ENTRUST_STATUS_ column here.
            re_data["ENTRUST_STATUS_"] = data["STATUS_"]
            # re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"]
            # # re_data["INVEST_AREA_"]
            re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            # # re_data["INVEST_DIRECTION_"]
            # re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            # re_data["CURRENCY_"] = data["CURRENCY_"]
            # re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"]
            # re_data["SALE_TARGET_"] = data["SALE_TARGET_"]
            # re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            # re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"]
            re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"]
            re_data["OTHER_INFO_"] = data["OTHER_INFO_"]
        elif data["ENTITY_CODE_"] == "TRUSTONE":
            # "C"
            # re_data["AREA_CODE_"]
            # re_data["BANK_CODE_"]
            # re_data["BANK_NAME_"]
            # re_data["UNIT_CODE_"]
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            # # re_data["REMARK_"]
            # # re_data["UPDATE_TIME_"]
            # re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            # re_data["FUNCTION_"] = data["FUNCTION_"]
            # # re_data["PRO_START_"]
            # re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            # re_data["RUN_MODE_"] = data["RUN_MODE_"]
            # re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["SCALE_"] = data["SCALE_"]
            # # re_data["MONTH_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            # # re_data["PURPOSE_"]
            # # re_data["ESTAB_ANNOUNCEMENT_"]
            # re_data["ENTRUST_STATUS_"] = data["STATUS_"]
            # # re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"]
            re_data["INVEST_AREA_"] = data["INVEST_AREA_"]
            re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            re_data["INVEST_DIRECTION_"] = data["INVEST_DIRECTION_"]
            re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            # re_data["CURRENCY_"] = data["CURRENCY_"]
            # re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"]
            # re_data["SALE_TARGET_"] = data["SALE_TARGET_"]
            re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            # re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            # Strip simple HTML tags like <p> / </p> from the free-text field.
            re_data["RESERVE_INFO_"] = re.sub(r"</?\w*>", "",
                                              data["RESERVE_INFO_"])
            # re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"]
            # re_data["OTHER_INFO_"] = data["OTHER_INFO_"]
        return re_data

    def run(self):
        """Drain the Mongo cursor, map each document via data_shuffle() and
        upsert it into the Phoenix ENTRUST table, then log summary counters.
        """
        # One-off DDL for the ENTRUST table, kept commented for reference.
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        #
        # # create table sql
        # table_sql = ('create table "ENTRUST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."CREATE_TIME_" varchar, "C"."STATUS_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."FUNCTION_" varchar,'
        #              '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar,'
        #              '"C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, "C"."UPDATE_TIME_" varchar,'
        #              '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."ISSUER_" varchar, "C"."PRO_START_" varchar,'
        #              '"C"."INVEST_PERIOD_" varchar,"C"."RUN_MODE_" varchar, "C"."INDUSTRY_" varchar,'
        #              '"C"."PUB_DATE_" varchar, "C"."SCALE_" varchar, "C"."MONTH_" varchar, "C"."YIELD_RATE_" varchar,'
        #              '"C"."START_FUNDS_" varchar, "C"."PURPOSE_" varchar, "C"."ESTAB_ANNOUNCEMENT_" varchar,'
        #              '"C"."ENTRUST_STATUS_" varchar, "C"."DISTRIBU_MODE_" varchar, "C"."INVEST_AREA_" varchar,'
        #              '"C"."TERM_TYPE_" varchar, "C"."INVEST_DIRECTION_" varchar, "C"."INVEST_MODE_" varchar,'
        #              '"C"."CURRENCY_" varchar, "C"."MANAGE_TYPE_" varchar, "C"."SALE_TARGET_" varchar,'
        #              '"C"."PROFIT_TYPE_" varchar, "C"."ISSUER_AREA_" varchar, "C"."RESERVE_INFO_" varchar,'
        #              '"C"."TRUSTEESHIP_BANK_" varchar, "C"."OTHER_INFO_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)
        # Hard-coded data_id — presumably resumes the scan after this ObjectId
        # from an earlier interrupted run; verify against all_from_mongodb().
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection, data_id="5c67307d9bb3df76b4229f79")
        # NOTE(review): Cursor.count() is deprecated in PyMongo 3.7+.  The
        # "+ 100" is head-room; StopIteration is the real loop terminator.
        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                # Transient Mongo outage: wait and retry the fetch once.
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()
            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            print(data["_id"])
            # todo remove and upsert data from mongo
            # shuffle data
            # NOTE(review): unlike the sibling classes, the guard around
            # data_shuffle() is commented out here, so any mapping error
            # aborts the whole run.
            # try:
            re_data = self.data_shuffle(data=data)
            # except Exception as e:
            #     self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
            #     continue
            if re_data:
                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # # add {d:1}
                # try:
                #     self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id,
                #                                     data_dict={"d": 1})
                #     self.remove_count += 1
                #     if self.remove_count % 10 == 0:
                #         self.logger.info("MongoDB 更新成功, 成功条数 {}".format(self.remove_count))
                # except Exception as e:
                #     self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e))
                #     continue
                if success_count > 0:
                    status = True  # NOTE(review): never read; vestigial.
                    self.success_count += success_count
                    if self.success_count % 10 == 0:
                        self.logger.info("HBase 插入成功 {} 条".format(
                            self.success_count))
            else:
                # data_shuffle produced nothing usable for this document.
                self.bad_count += 1
                continue
        mongo_data_list.close()
        # NOTE(review): find_count / remove_count / old_count stay 0 here.
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class HexunOpinion(object):
    """Migrate Hexun opinion articles from the MongoDB collection
    HEXUNOPINION into the Phoenix/HBase table SENTIMENT.

    Unlike the sibling migrators, run() also marks each migrated Mongo
    document with {"d": 1} so it is not re-processed.
    """

    def __init__(self):
        # Create the MongoDB client.
        self.m_client = MongoClient(mongo_collection="HEXUNOPINION")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Create the Phoenix client.
        self.p_client = PhoenixHbase(table_name="SENTIMENT")
        # Connect to Phoenix.
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Run counters; find_count / old_count / error_count are never
        # incremented in this class.
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        """Map one raw opinion document to a SENTIMENT row dict.

        Returns None when NOTICE_TIME_ contains no ":" (used as a cheap
        "has a timestamp" sanity check); callers must handle the None.
        """
        if ":" not in data["NOTICE_TIME_"]:
            return None
        re_data = dict()
        # HBase row_key: entity code plus md5 of the article title.
        hash_m = hashlib.md5()
        hash_m.update(data["TITLE_"].encode("utf-8"))
        hash_title = hash_m.hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)
        # "C"
        re_data["ID_"] = row_key
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        # re_data["AREA_CODE_"]
        # re_data["UNIT_CODE_"]
        # NOTICE_TIME_[:10] keeps the "YYYY-MM-DD" prefix of the timestamp.
        period_code = data["NOTICE_TIME_"][:10].replace("-", "")
        re_data["PERIOD_CODE_"] = period_code
        # Strip the Hexun copyright boilerplate (and everything after the
        # "|" separator) from the article body.
        re_data["CONTENT_"] = re.sub(r"本报告版权归和讯财经传播研究所所有,未经书面授权允许,不得复制转载。\|.*", "",
                                     data["CONTENT_"])
        re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"][:10]
        re_data["STATUS_"] = "1"
        # re_data["REMARK_"] = ""
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        # re_data["UPDATE_TIME_"]
        re_data["TITLE_"] = data["TITLE_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        # re_data["DATETIME_"] = data["DATETIME_"]
        return re_data

    def run(self):
        """Drain the Mongo cursor, upsert each mapped row into the Phoenix
        SENTIMENT table, mark the source document with {"d": 1}, and log
        summary counters at the end.
        """
        # One-off DDL for the SENTIMENT table, kept commented for reference.
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        # # create table sql
        # table_sql = ('create table "SENTIMENT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."REMARK_" varchar, "C"."PERIOD_CODE_" varchar,'
        #              '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, '
        #              '"C"."SOURCE_" varchar, "C"."BRIEF_" varchar, "C"."IMAGE_" varchar, "C"."IMPORTANCE_" varchar,'
        #              '"C"."TITLE_" varchar, "C"."URL_" varchar, "C"."NOTICE_TIME_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar, "C"."HOME_PAGE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)
        # NOTE(review): Cursor.count() is deprecated in PyMongo 3.7+.  The
        # "+ 100" is head-room; StopIteration is the real loop terminator.
        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                # Transient Mongo outage: wait and retry the fetch once.
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = mongo_data_list.__next__()
            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            print(data["_id"])
            # todo remove and upsert data from mongo
            # shuffle data
            try:
                re_data = self.data_shuffle(data=data)
                # The first four documents of the scan are flagged for the
                # home page.  NOTE(review): if data_shuffle() returned None
                # this line raises TypeError, which lands in the broad
                # except below and logs the document as a cleaning failure.
                if i < 4:
                    re_data["HOME_PAGE_"] = "Y"
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue
            if re_data:
                # upsert data to HBase
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # add {d:1} — mark the source document as migrated.
                try:
                    self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id,
                                                    data_dict={"d": 1})
                    self.remove_count += 1
                    if self.remove_count % 10 == 0:
                        self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                            self.remove_count))
                except Exception as e:
                    self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                        self.data_id, e))
                    continue
                if success_count > 0:
                    status = True  # NOTE(review): never read; vestigial.
                    self.success_count += success_count
                    if self.success_count % 10 == 0:
                        self.logger.info("HBase 插入成功 {} 条".format(
                            self.success_count))
            else:
                # data_shuffle rejected the document (no timestamp).
                self.bad_count += 1
                continue
        mongo_data_list.close()
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()