class BaiduSearch(object):
    """Copy crawled documents from the MongoDB collection "BAIDU_SEARCH"
    into the Phoenix/HBase table "BAIDU_SEARCH".

    The target table is expected to exist already; this class neither drops
    nor creates it.
    """

    def __init__(self):
        # MongoDB source collection.
        self.m_client = MongoClient(mongo_collection="BAIDU_SEARCH")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Phoenix client and an open connection for the target table.
        self.p_client = PhoenixHbase(table_name="BAIDU_SEARCH")
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Counters reported in the summary that run() logs.
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""
        # BANK_CODE_ -> BANK_NAME_ lookup. Bank of Communications was changed
        # from BOCOM to COMM and China CITIC Bank from ECITIC to CITIC;
        # Ping An Bank, Bank of Beijing and Bank of Shanghai were added.
        # Order matters: codes are matched by substring, first hit wins
        # (e.g. "CMBC" must be tried before "CMB").
        self.name_dict = {
            "ICBC": "中国工商银行",
            "ABC": "中国农业银行",
            "BOC": "中国银行",
            "CCB": "中国建设银行",
            "COMM": "交通银行",
            "PSBC": "中国邮政储蓄银行",
            "CZB": "浙商银行",
            "CBHB": "渤海银行",
            "CITIC": "中信银行",
            "CEB": "中国光大银行",
            "HXB": "华夏银行",
            "CMBC": "中国民生银行",
            "CMB": "招商银行",
            "CIB": "兴业银行",
            "CGB": "广发银行",
            "PAB": "平安银行",
            "SPDB": "浦发银行",
            "EBCL": "恒丰银行",
            "PINGAN": "平安银行",
            "LTD": "中国光大银行",
            "BEIJING": "北京银行",
            "BOSC": "上海银行"
        }
        # Known TYPE_ values, matched as substrings of ENTITY_CODE_.
        self.type_list = [
            "Market", "Activity", "GoodStart", "MidSeason", "PrivateBank",
            "Recommendation"
        ]

    def data_shuffle(self, data):
        """Map one MongoDB document onto the BAIDU_SEARCH row layout.

        Args:
            data: source document. TITLE_, ENTITY_CODE_, CONTENT_, DATETIME_,
                URL_, DEALTIME_ and ENTITY_NAME_ are read.

        Returns:
            dict keyed by column name. BANK_CODE_/BANK_NAME_ and TYPE_ are
            only present when ENTITY_CODE_ contains a known bank code or
            type name.

        Raises:
            KeyError: if one of the fields listed above is missing.
        """
        # HBase row key: "<ENTITY_CODE_>_<md5 of the title>".
        title_hash = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        entity_code = data["ENTITY_CODE_"]

        re_data = dict()
        re_data["ID_"] = str(entity_code) + "_" + str(title_hash)
        re_data["ENTITY_CODE_"] = entity_code

        bank_code = next(
            (code for code in self.name_dict if code in entity_code), None)
        if bank_code is not None:
            re_data["BANK_CODE_"] = bank_code
            re_data["BANK_NAME_"] = self.name_dict[bank_code]
        else:
            # No known bank code: surface the entity code for inspection.
            print(entity_code)

        re_data["CONTENT_"] = data["CONTENT_"]
        re_data["STATUS_"] = "UNPROCESSED"
        re_data["CREATE_TIME_"] = data["DATETIME_"]

        type_name = next(
            (name for name in self.type_list if name in entity_code), None)
        if type_name is not None:
            re_data["TYPE_"] = type_name

        re_data["TITLE_"] = data["TITLE_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        return re_data

    def _next_record(self, cursor):
        """Return the next document from cursor, or None once it is exhausted.

        A MongoDB server-selection timeout is logged and retried once after a
        three second pause; a second timeout propagates to the caller.
        """
        try:
            return next(cursor)
        except StopIteration:
            return None
        except pymongo.errors.ServerSelectionTimeoutError as e:
            self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
            time.sleep(3)
        # Retry outside the handler so that exhaustion on the retry ends the
        # run cleanly instead of escaping as an unhandled StopIteration.
        try:
            return next(cursor)
        except StopIteration:
            return None

    def _handle_record(self, data):
        """Clean one document, upsert it into HBase and flag it in MongoDB."""
        self.data_id = data["_id"]
        if self.success_count % 100 == 0:
            self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
        print(data["_id"])

        try:
            re_data = self.data_shuffle(data=data)
        except Exception as e:
            self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
            return
        if not re_data:
            self.bad_count += 1
            return

        try:
            success_count = self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=re_data)
        except jaydebeapi.DatabaseError as e:
            self.logger.info("错误 id: {}, 错误信息 {}".format(
                self.data_id, e))
            return
        # Count the insert right away so that a failure while flagging the
        # source document below does not hide a successful upsert.
        if success_count > 0:
            self.success_count += success_count
        if self.success_count % 10 == 0:
            self.logger.info("HBase 插入成功 {} 条".format(
                self.success_count))

        # Flag the source document with {"d": 1}
        # (presumably "already transferred" — confirm against the reader).
        try:
            self.m_client.update_to_mongodb(collection=self.collection,
                                            data_id=self.data_id,
                                            data_dict={"d": 1})
            self.remove_count += 1
            if self.remove_count % 10 == 0:
                self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                    self.remove_count))
        except Exception as e:
            self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                self.data_id, e))

    def run(self):
        """Transfer every document of the collection and log a summary.

        Per-record failures are logged and skipped. The cursor is closed even
        when an unexpected error aborts the loop.
        """
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)
        try:
            while True:
                data = self._next_record(mongo_data_list)
                if data is None:
                    break
                self.find_count += 1
                self._handle_record(data)
        finally:
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class Trend(object):
    """Copy crawled documents from the MongoDB collection "TREND" into the
    Phoenix/HBase table "CHA_BRANCH_MARKET_ACT".

    The target table is expected to exist already; this class neither drops
    nor creates it.
    """

    def __init__(self):
        # MongoDB source collection.
        self.m_client = MongoClient(mongo_collection="TREND")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Phoenix client and an open connection for the target table.
        self.p_client = PhoenixHbase(table_name="CHA_BRANCH_MARKET_ACT")
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Counters reported in the summary that run() logs.
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        """Map one MongoDB document onto the branch marketing-activity row.

        Args:
            data: source document. TITLE_, ENTITY_CODE_, ENTITY_NAME_, URL_,
                NOTICE_TIME_, DATETIME_ and CONTENT_ are read.

        Returns:
            dict keyed by column name.

        Raises:
            KeyError: if one of the fields listed above is missing.
            ValueError: if URL_ contains no "http://" or "https://" prefix.
        """
        # HBase row key: "<ENTITY_CODE_>_<md5 of the title>".
        title_hash = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(title_hash)

        # Scheme + host of the page. "[^/]*" also accepts URLs that end right
        # after the host; the earlier pattern required a following "/" and
        # rejected such records with an IndexError.
        origin = re.search(r"https?://[^/]*", data["URL_"])
        if origin is None:
            raise ValueError(
                "URL_ has no http(s) origin: {}".format(data["URL_"]))

        copy_result = dict()
        copy_result["ID_"] = row_key
        copy_result["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        copy_result["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        copy_result["URL_"] = data["URL_"]
        copy_result["APP_VERSION_"] = "BRANCH"
        # Bank code/name are the entity code/name without their suffixes.
        copy_result["BANK_CODE_"] = data["ENTITY_CODE_"].replace(
            "PRIVATEINFO", "")
        copy_result["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "")
        copy_result["PERIOD_CODE_"] = data["NOTICE_TIME_"].replace("-", "")
        # CREATE_TIME_ is the local time of the transfer; the crawl time is
        # kept separately in SPIDER_TIME_.
        copy_result["CREATE_TIME_"] = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime())
        copy_result["SPIDER_TIME_"] = data["DATETIME_"]
        copy_result["CREATE_BY_ID_"] = "P0131857"
        copy_result["CREATE_BY_NAME_"] = "钟楷文"
        copy_result["M_STATUS_"] = "0"
        copy_result["DELETE_STATUS_"] = "0"
        copy_result["DATA_STATUS_"] = "uncheck"
        copy_result["SOURCE_"] = origin.group(0)
        copy_result["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        copy_result["ACT_NAME_"] = data["TITLE_"]
        copy_result["DETAILS_"] = data["CONTENT_"]
        copy_result["PUBLISH_TIME_"] = data["NOTICE_TIME_"]
        return copy_result

    def _next_record(self, cursor):
        """Return the next document from cursor, or None once it is exhausted.

        A MongoDB server-selection timeout is logged and retried once after a
        three second pause; a second timeout propagates to the caller.
        """
        try:
            return next(cursor)
        except StopIteration:
            return None
        except pymongo.errors.ServerSelectionTimeoutError as e:
            self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
            time.sleep(3)
        # Retry outside the handler so that exhaustion on the retry ends the
        # run cleanly instead of escaping as an unhandled StopIteration.
        try:
            return next(cursor)
        except StopIteration:
            return None

    def _handle_record(self, data):
        """Clean one document, upsert it into HBase and flag it in MongoDB."""
        self.data_id = data["_id"]
        if self.success_count % 100 == 0:
            self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
        print(data["_id"])

        try:
            re_data = self.data_shuffle(data=data)
        except Exception as e:
            self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
            return
        if not re_data:
            self.bad_count += 1
            return

        try:
            success_count = self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=re_data)
        except jaydebeapi.DatabaseError as e:
            self.logger.info("错误 id: {}, 错误信息 {}".format(
                self.data_id, e))
            return
        # Count the insert right away so that a failure while flagging the
        # source document below does not hide a successful upsert.
        if success_count > 0:
            self.success_count += success_count
        if self.success_count % 10 == 0:
            self.logger.info("HBase 插入成功 {} 条".format(
                self.success_count))

        # Flag the source document with {"d": 1}
        # (presumably "already transferred" — confirm against the reader).
        try:
            self.m_client.update_to_mongodb(collection=self.collection,
                                            data_id=self.data_id,
                                            data_dict={"d": 1})
            self.remove_count += 1
            if self.remove_count % 10 == 0:
                self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                    self.remove_count))
        except Exception as e:
            self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                self.data_id, e))

    def run(self):
        """Transfer up to 100 documents of the collection and log a summary.

        Per-record failures are logged and skipped. The cursor is closed even
        when an unexpected error aborts the loop.
        """
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)
        try:
            # NOTE(review): the 100-record cap replaced a loop over the whole
            # collection and looks like a debugging leftover — confirm.
            for _ in range(100):
                data = self._next_record(mongo_data_list)
                if data is None:
                    break
                self.find_count += 1
                self._handle_record(data)
        finally:
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
def run(self):
    """Transfer fund documents from MongoDB "JSFUND_CCBDATA" to Phoenix "FUND".

    For every entity code in self.code_list the module of the same name is
    imported and its data_shuffle() cleans each document; the cleaned row
    (a dict) or rows (a list of dicts) are upserted one by one. Source ids
    are collected in self.remove_id_list and flagged with {"d": 1} in
    batches, plus once more when an entity is finished so that trailing ids
    are not left unflagged. Both connections are closed even when the run
    aborts with an error.
    """
    upsert_attempts = 0
    found_total = 0
    # Phoenix client for the target table (note the table name).
    p_client = PhoenixHbase(table_name="FUND")
    p_client.verify_list = self.verify_list
    connection = p_client.connect_to_phoenix()
    # MongoDB source collection.
    m_client = MongoClient(mongo_collection="JSFUND_CCBDATA")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS")
    list_TYPE = self.dict_from_mysql("FUND_TYPE")

    def discard(data_id, copy_data):
        """Drop a failed record from the pending lists; repeats are ignored."""
        if data_id in self.remove_id_list:
            self.remove_id_list.remove(data_id)
        # Identity comparison: only the copy made for this very record goes.
        for index, item in enumerate(self.copy_mongo_data_list):
            if item is copy_data:
                del self.copy_mongo_data_list[index]
                break

    def flag_pending():
        """Flag all pending source ids with {"d": 1} and empty the list."""
        update_count = m_client.update_to_mongodb(
            collection=collection,
            data_id=self.remove_id_list,
            data_dict={"d": 1})
        self.remove_count += update_count
        # Start a fresh batch so ids are not sent (and counted) twice.
        self.remove_id_list = []
        self.logger.info("MongoDB 更新成功")

    def store(row, data_id, copy_data, log_every_insert):
        """Upsert one cleaned row and return the number of rows inserted."""
        inserted = 0
        try:
            inserted = p_client.upsert_to_phoenix_by_one(
                connection=connection, data=row)
            self.success_count += inserted
            if log_every_insert:
                self.logger.info(
                    "HBase 插入成功, 成功条数 {} 条".format(inserted))
            elif self.success_count % 100 == 0:
                self.logger.info(
                    "HBase 插入成功, 成功条数 {} 条".format(
                        self.success_count))
            # Flag source documents every 50 successful inserts.
            if self.success_count % 50 == 0:
                flag_pending()
        except Exception as e:
            discard(data_id, copy_data)
            self.logger.warning(
                "HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))
        return inserted

    try:
        for entity_code in self.code_list:
            # The cleaning module is named after the entity code.
            cleaner = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)
            except pymongo.errors.ServerSelectionTimeoutError:
                # Single retry after a short pause.
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)
            if not mongo_data_list:
                continue

            once_count = 0
            # Accumulate across entities so the summary reports the run total.
            found_total += mongo_data_list.count()
            self.find_count = found_total

            for data in mongo_data_list:
                data_id = data["_id"]
                copy_data = {}
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    re_data = cleaner.data_shuffle(
                        data, list_SUBS_STATUS, list_TYPE)
                except Exception as e:
                    discard(data_id, copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue
                if not re_data:
                    # The id stays pending, so the empty record is flagged too.
                    self.bad_count += 1
                    continue

                if isinstance(re_data, list):
                    for list_data in re_data:
                        if not list_data:
                            continue
                        upsert_attempts += 1
                        print(upsert_attempts)
                        once_count += store(list_data, data_id, copy_data,
                                            True)
                elif isinstance(re_data, dict):
                    once_count += store(re_data, data_id, copy_data, False)

            # Flag whatever is still pending; the batch trigger above only
            # fires when the running total hits a multiple of 50.
            if self.remove_id_list:
                try:
                    flag_pending()
                except Exception as e:
                    self.logger.warning("MongoDB 更新失败, {}".format(e))

            if once_count > 0:
                self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
    finally:
        # Close both connections.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)

    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
    self.logger.handlers.clear()
def run(self):
    """Transfer news documents from MongoDB to Phoenix "NEWS_FINASSIST".

    For every entity code in self.code_list the module of the same name is
    imported and its data_shuffle() cleans each document. Rows returned as a
    list are enriched through self.get_brief_from_ai() (except for
    "CNINFONEWS"), upserted, and their source document is flagged with
    {"d": 1}. A row returned as a plain dict is upserted only. Both
    connections are closed even when the run aborts with an error.
    """
    # Phoenix client for the target table.
    p_client = PhoenixHbase(table_name="NEWS_FINASSIST")
    p_client.verify_list = self.verify_list
    connection = p_client.connect_to_phoenix()
    # MongoDB source collection.
    m_client = MongoClient(mongo_collection="NEWS_FINASSIST")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    found_total = 0
    try:
        for entity_code in self.code_list:
            # The cleaning module is named after the entity code.
            cleaner = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))
            # Hard-coded _id handed to get_data_from_mongo for CAIJINGNEWS
            # (presumably a resume point — confirm).
            if entity_code == "CAIJINGNEWS":
                find_id = "5c6bfa508d7fee512a4ca68f"
            else:
                find_id = ""
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                # Single retry after a short pause.
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)
            if not mongo_data_list:
                continue

            once_count = 0
            # Accumulate across entities so the summary reports the run total.
            found_total += mongo_data_list.count()
            self.find_count = found_total

            # The bound also caps how often a timed-out fetch is retried.
            for _ in range(1000000):
                try:
                    data = mongo_data_list.__next__()
                except pymongo.errors.ServerSelectionTimeoutError:
                    continue
                except StopIteration:
                    break

                data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "running on data_id: {}".format(data_id))

                try:
                    del data["_id"]
                    re_data = cleaner.data_shuffle([data])
                except Exception as e:
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue
                if not re_data:
                    self.bad_count += 1
                    continue

                if isinstance(re_data, list):
                    for list_data in re_data:
                        if not list_data:
                            continue
                        try:
                            if entity_code != "CNINFONEWS":
                                ai_data = self.get_brief_from_ai(
                                    data=list_data)
                            else:
                                ai_data = list_data
                        except Exception as e:
                            # The message needs a placeholder, otherwise the
                            # logging call itself fails and the error is lost.
                            self.logger.info(
                                "AI 调取失败, 错误信息 {}".format(e))
                            # Fall back to the row without AI enrichment
                            # (the current item, not the whole result list).
                            ai_data = list_data

                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=ai_data)
                            once_count += success_count
                            self.success_count += success_count
                            if self.success_count % 10 == 0:
                                self.logger.info(
                                    "HBase 插入成功, 成功条数{}条".format(
                                        once_count))
                        except Exception as e:
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                        # Flag the source document with {"d": 1}.
                        try:
                            update_count = m_client.update_to_mongodb(
                                collection=collection,
                                data_id=data_id,
                                data_dict={"d": 1})
                            self.remove_count += update_count
                            if self.remove_count % 10 == 0:
                                self.logger.info(
                                    "MongoDB 更新成功, 成功条数 {} 条".format(
                                        "10"))
                        except Exception as e:
                            self.logger.warning(
                                "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue
                elif isinstance(re_data, dict):
                    # NOTE(review): unlike the list branch, this path neither
                    # calls the AI step nor flags the source document —
                    # confirm that is intended.
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                        self.logger.info(
                            "HBase 插入成功, 成功条数 {} 条".format(
                                success_count))
                    except Exception as e:
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))
                        continue

            if once_count > 0:
                self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format(
                    entity_code, once_count))
            mongo_data_list.close()
    finally:
        # Close both connections.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)

    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
    self.logger.handlers.clear()
class Meipian(object):
    """Copy crawled documents from the MongoDB collection "meipian_CCBDATA"
    into the Phoenix/HBase table "MEIPIAN_CCBDATA".

    The target table is expected to exist already; this class neither drops
    nor creates it.
    """

    def __init__(self):
        # MongoDB source collection.
        self.m_client = MongoClient(mongo_collection="meipian_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Phoenix client and an open connection for the target table.
        self.p_client = PhoenixHbase(table_name="MEIPIAN_CCBDATA")
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Counters reported in the summary that run() logs.
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        """Map one MongoDB document onto the MEIPIAN_CCBDATA row layout.

        Args:
            data: source document. Note that CREATE_TIME, VISIT_COUNT,
                PRAISE_COUNT and COMMENT_COUNT are read without a trailing
                underscore, unlike the other fields.

        Returns:
            dict keyed by column name, or None when TITLE_ is empty.

        Raises:
            KeyError: if a field that is read is missing.
        """
        if not data["TITLE_"]:
            return None

        # HBase row key: "<ENTITY_CODE_>_<md5 of the title>".
        title_hash = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(title_hash)

        # Publication time, parsed by arrow and rendered in two formats.
        published = arrow.get(data["CREATE_TIME"])

        re_data = dict()
        re_data["ID_"] = row_key
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["PERIOD_CODE_"] = str(published.format("YYYYMMDD"))
        re_data["PUBLISH_TIME_"] = str(published.format("YYYY-MM-DD HH:mm:ss"))
        re_data["STATUS_"] = "UNPROCESSED"
        re_data["CONTENT_"] = data["CONTENT_"]
        re_data["REMARK_"] = ""
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        re_data["URL_"] = data["URL_"]
        re_data["TITLE_"] = data["TITLE_"]
        re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["DEALTIME_"] = str(data["DEALTIME_"])
        re_data["VISIT_COUNT_"] = data["VISIT_COUNT"]
        re_data["PRAISE_COUNT_"] = data["PRAISE_COUNT"]
        re_data["COMMENT_COUNT_"] = data["COMMENT_COUNT"]
        re_data["SOURCE_"] = data["SOURCE_"]
        return re_data

    def _next_record(self, cursor):
        """Return the next document from cursor, or None once it is exhausted.

        A MongoDB server-selection timeout is logged and retried once after a
        three second pause; a second timeout propagates to the caller.
        """
        try:
            return next(cursor)
        except StopIteration:
            return None
        except pymongo.errors.ServerSelectionTimeoutError as e:
            self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
            time.sleep(3)
        # Retry outside the handler so that exhaustion on the retry ends the
        # run cleanly instead of escaping as an unhandled StopIteration.
        try:
            return next(cursor)
        except StopIteration:
            return None

    def _handle_record(self, data):
        """Clean one document, upsert it into HBase and flag it in MongoDB."""
        self.data_id = data["_id"]
        if self.success_count % 100 == 0:
            self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
        print(data["_id"])

        try:
            re_data = self.data_shuffle(data=data)
        except Exception as e:
            self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
            return
        if not re_data:
            self.bad_count += 1
            return

        try:
            success_count = self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=re_data)
        except jaydebeapi.DatabaseError as e:
            self.logger.info("错误 id: {}, 错误信息 {}".format(
                self.data_id, e))
            return
        # Count the insert right away so that a failure while flagging the
        # source document below does not hide a successful upsert.
        if success_count > 0:
            self.success_count += success_count
        if self.success_count % 10 == 0:
            self.logger.info("HBase 插入成功 {} 条".format(
                self.success_count))

        # Flag the source document with {"d": 1}
        # (presumably "already transferred" — confirm against the reader).
        try:
            self.m_client.update_to_mongodb(collection=self.collection,
                                            data_id=self.data_id,
                                            data_dict={"d": 1})
            self.remove_count += 1
            if self.remove_count % 10 == 0:
                self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                    self.remove_count))
        except Exception as e:
            self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                self.data_id, e))

    def run(self):
        """Transfer the documents selected by the start id and log a summary.

        Per-record failures are logged and skipped. The cursor is closed even
        when an unexpected error aborts the loop.
        """
        # Hard-coded _id handed to all_from_mongodb (original note: "gt 10M").
        # NOTE(review): presumably a resume point past an oversized
        # document — confirm before the next full run.
        f_id = "5c6fe11b9bb3df6b0ec6168b"
        mongo_data_list = self.m_client.all_from_mongodb(self.collection,
                                                         data_id=f_id)
        try:
            while True:
                data = self._next_record(mongo_data_list)
                if data is None:
                    break
                self.find_count += 1
                self._handle_record(data)
        finally:
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
class HexunOpinion(object):
    """Copy crawled documents from the MongoDB collection "HEXUNOPINION"
    into the Phoenix/HBase table "SENTIMENT".

    The target table is expected to exist already; this class neither drops
    nor creates it.
    """

    def __init__(self):
        # MongoDB source collection.
        self.m_client = MongoClient(mongo_collection="HEXUNOPINION")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Phoenix client and an open connection for the target table.
        self.p_client = PhoenixHbase(table_name="SENTIMENT")
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Counters reported in the summary that run() logs.
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""

    def data_shuffle(self, data):
        """Map one MongoDB document onto the SENTIMENT row layout.

        Args:
            data: source document. NOTICE_TIME_, TITLE_, ENTITY_CODE_,
                ENTITY_NAME_, CONTENT_, DATETIME_, URL_ and DEALTIME_ are read.

        Returns:
            dict keyed by column name, or None when NOTICE_TIME_ contains no
            ":" (i.e. carries no time-of-day part).

        Raises:
            KeyError: if a field that is read is missing.
        """
        notice_time = data["NOTICE_TIME_"]
        if ":" not in notice_time:
            return None

        # HBase row key: "<ENTITY_CODE_>_<md5 of the title>".
        title_hash = hashlib.md5(data["TITLE_"].encode("utf-8")).hexdigest()
        notice_date = notice_time[:10]

        re_data = dict()
        re_data["ID_"] = str(data["ENTITY_CODE_"]) + "_" + str(title_hash)
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["PERIOD_CODE_"] = notice_date.replace("-", "")
        # Cut the copyright notice and whatever follows it on the same line.
        re_data["CONTENT_"] = re.sub(
            r"本报告版权归和讯财经传播研究所所有,未经书面授权允许,不得复制转载。\|.*",
            "", data["CONTENT_"])
        re_data["NOTICE_TIME_"] = notice_date
        re_data["STATUS_"] = "1"
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        re_data["TITLE_"] = data["TITLE_"]
        re_data["URL_"] = data["URL_"]
        re_data["DEALTIME_"] = data["DEALTIME_"]
        return re_data

    def _clean(self, data, index):
        """Clean one document; the first four fetched get HOME_PAGE_ = "Y".

        Returns None when data_shuffle() rejects the document, so the caller
        can count it as bad data instead of tripping over a TypeError.
        """
        re_data = self.data_shuffle(data=data)
        if re_data and index < 4:
            re_data["HOME_PAGE_"] = "Y"
        return re_data

    def _next_record(self, cursor):
        """Return the next document from cursor, or None once it is exhausted.

        A MongoDB server-selection timeout is logged and retried once after a
        three second pause; a second timeout propagates to the caller.
        """
        try:
            return next(cursor)
        except StopIteration:
            return None
        except pymongo.errors.ServerSelectionTimeoutError as e:
            self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
            time.sleep(3)
        # Retry outside the handler so that exhaustion on the retry ends the
        # run cleanly instead of escaping as an unhandled StopIteration.
        try:
            return next(cursor)
        except StopIteration:
            return None

    def _handle_record(self, data, index):
        """Clean one document, upsert it into HBase and flag it in MongoDB."""
        self.data_id = data["_id"]
        if self.success_count % 100 == 0:
            self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
        print(data["_id"])

        try:
            re_data = self._clean(data, index)
        except Exception as e:
            self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
            return
        if not re_data:
            self.bad_count += 1
            return

        try:
            success_count = self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=re_data)
        except jaydebeapi.DatabaseError as e:
            self.logger.info("错误 id: {}, 错误信息 {}".format(
                self.data_id, e))
            return
        # Count the insert right away so that a failure while flagging the
        # source document below does not hide a successful upsert.
        if success_count > 0:
            self.success_count += success_count
        if self.success_count % 10 == 0:
            self.logger.info("HBase 插入成功 {} 条".format(
                self.success_count))

        # Flag the source document with {"d": 1}
        # (presumably "already transferred" — confirm against the reader).
        try:
            self.m_client.update_to_mongodb(collection=self.collection,
                                            data_id=self.data_id,
                                            data_dict={"d": 1})
            self.remove_count += 1
            if self.remove_count % 10 == 0:
                self.logger.info("MongoDB 更新成功, 成功条数 {}".format(
                    self.remove_count))
        except Exception as e:
            self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                self.data_id, e))

    def run(self):
        """Transfer every document of the collection and log a summary.

        Per-record failures are logged and skipped. The cursor is closed even
        when an unexpected error aborts the loop.
        """
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)
        index = 0
        try:
            while True:
                data = self._next_record(mongo_data_list)
                if data is None:
                    break
                self.find_count += 1
                self._handle_record(data, index)
                index += 1
        finally:
            mongo_data_list.close()

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()