def match_weibo_code(self, match): mongo_client = MongoClient(mongo_collection="WEIBOBASICINFO") db, collection_list = mongo_client.client_to_mongodb() collection = mongo_client.get_check_collection(db, collection_list) result = mongo_client.match_from_mongo(collection=collection, match=match, output="WEIBO_CODE_") return result
class Generate_task(threading.Thread): def __init__(self, store, queue, values=()): ''' 消费者 FIFO即 First in First Out,先进先出。Queue提供了一个基本的FIFO容器,使用方法很简单,maxsize是个整数,指明了队列中能存放的数据个数的上限。一旦达到上限,插入会导致阻塞,直到队列中的数据被消费掉。如果maxsize小于或者等于0,队列大小没有限制 可自定义任务函数, :param store: :param queue: 队列对象 :param values: 用于切片的值 int ''' threading.Thread.__init__(self) self.queue = queue self.store = store self.data = [] self.values = values self.result = '' collection_name = '' # MongoDB connection self.m_client = MongoClient(mongo_collection=collection_name) self.db, self.collection_list = self.m_client.client_to_mongodb() # MongoDB old connection # spider_data_old 的表连接是遍历查询出来的,,所以需要手动建立 self.old_client = MongoClient(mongo_collection=collection_name) self.old_client.mongo_db = "spider_data_old" def rask(self, *args, **kwargs): pass def get_result(self, result): ''' 线程你不好追踪返回值, 所有座位属性返回 :param result: :return: ''' self.result = result def run(self): try: self.queue.get() # 获取任务 # 写入函数 print('This is store %s' % self.store) result = self.rask() self.get_result(result) except Exception as e: print(e) finally: self.queue.task_done() # 通知任务结束
def run(self): # 创建 MongoDB 对象 m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # 遍历 ENTITY_CODE_ 列表 # self.code_list = self.code_list[14:] for entity_code in self.code_list: self.count = 0 hash_list = list() status = False mongo_data_list = self.get_data_from_mongo(m_client=m_client, collection=collection, entity_code=entity_code) if mongo_data_list: self.logger.warning("{} 查取成功".format(entity_code)) self.logger.warning("当前共有{}条".format(mongo_data_list.count())) status = True else: self.logger.warning("{} 无数据".format(entity_code)) if status: for data in mongo_data_list: if "ADDR_" in data: hash_m = hashlib.md5() hash_m.update(data["ADDR_"].encode("utf-8")) hash_title = hash_m.hexdigest() if hash_title in hash_list: self.count += 1 else: hash_list.append(hash_title) else: if "CONTENT_" in data: hash_m = hashlib.md5() hash_m.update(data["CONTENT_"].encode("utf-8")) hash_title = hash_m.hexdigest() if hash_title in hash_list: self.count += 1 else: hash_list.append(hash_title) self.logger.warning("重复数据{}条".format(self.count)) # 关闭连接 m_client.client_close() self.logger.handlers.clear()
def run(): shuffle_list = list() count = 0 # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="CommonBidding") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection(db=db, collection_list=collection_list) m_client.mongo_db = "spider_data" m_client.mongo_entity_code = "500000CQGGZY" try: mongo_data_list = m_client.search_title_from_mongodb(collection) except pymongo.errors.ServerSelectionTimeoutError: print("正在重新连接") time.sleep(1) mongo_data_list = m_client.search_title_from_mongodb(collection) for data in mongo_data_list: data_list = data_shuffle(data)
def run(self): count = 0 # # 创建 Phoenix 对象-注意表格名字 p_client = PhoenixHbase(table_name="FUND") p_client.verify_list = self.verify_list # # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="JSFUND_CCBDATA") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) #查询省市区的编码列表 # script = GenericScript(entity_code="ICBCFUND", entity_type="JSFUND_CCBDATA") # province_list, city_list, area_list, dir_area_list = script.area_from_mysql() list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS") list_TYPE = self.dict_from_mysql("FUND_TYPE") # # 删除表 # p_client.drop_table_phoenix(connection=connection) # # 基金表创建语句 # sql = ('create table "FUND" ("ID_" varchar primary key,"C"."ENTITY_CODE_" varchar,"C"."AREA_CODE_" varchar,' # '"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,"C"."UNIT_CODE_" varchar,"C"."PERIOD_CODE_" varchar,"C"."REMARK_" varchar,' # '"C"."CREATE_TIME_" varchar,"C"."UPDATE_TIME_" varchar,"C"."STATUS_" varchar,"C"."CODE_" varchar,"C"."NAME_" varchar,' # '"C"."FUND_OLD_VALUE_" varchar,"C"."TOTAL_OLD_VALUE_" varchar,"C"."FUND_NEW_VALUE_" varchar,"C"."TOTAL_NEW_VALUE_" varchar,' # '"C"."INVEST_PERIOD_" varchar,"C"."DAILY_RATE_" varchar,"C"."YEAR_REWARD_" varchar,"C"."SUBS_STATUS_" varchar,' # '"C"."ATONEM_STATUS_" varchar,"C"."TYPE_" varchar,"C"."NEWEST_VALUE_" varchar,"C"."TOTAL_VALUE_" varchar,' # '"C"."POPULARITY_" varchar,"C"."RATING_" varchar,"C"."ENTITY_NAME_" varchar,"C"."OLD_VALUE_" varchar,' # '"C"."UNIT_VALUE_" varchar,"C"."SCALE_" varchar,"C"."ESTABLISH_DATE_" varchar,"C"."RISK_LEVEL_" varchar,' # '"C"."BASE_INFO_" varchar,"C"."YIELD_" varchar,"C"."INVEST_" varchar,"C"."MONTH_RATE_" varchar,' # '"C"."QUARTER_RATE_" varchar,"C"."HALF_YEAR_RATE_" varchar,"C"."URL_" varchar,"C"."HISTORY_RATE_" varchar,' # '"C"."FUND_STATUS_" varchar,"C"."COMPANY_" varchar,"C"."SUBS_STATUS_CODE_" varchar,"C"."TYPE_CODE_" varchar)IMMUTABLE_ROWS = true') # # # 创建表 # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 遍历 ENTITY_CODE_ 列表 # self.code_list = ["ABCORGANIZE"] for entity_code in self.code_list: status = False module_name = __import__(entity_code) self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code)) self.remove_id_list = [] self.copy_mongo_data_list = [] self.branch_code_list = [] try: mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code) except pymongo.errors.ServerSelectionTimeoutError: sleep(1) mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 self.find_count = mongo_data_list.count() for data in mongo_data_list: data_id = data["_id"] copy_data = {} self.remove_id_list.append(data_id) try: del data["_id"] copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) # re_data = module_name.data_shuffle(data_list,province_list, city_list, area_list,list_SUBS_STATUS,list_TYPE) re_data = module_name.data_shuffle( data, list_SUBS_STATUS, list_TYPE) # re_data = module_name.data_shuffle(data_list) if not re_data: self.bad_count += 1 continue except Exception as e: # except jpype._jexception.SQLExceptionPyRaisable as e: # except org.apache.phoenix.exception.BatchUpdateExecution as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) continue if isinstance(re_data, list): for list_data in re_data: # try: # area_data = self.shuffle_for_area(list_data) # except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) # self.logger.warning("_id:{} 获取经纬度失败, {}".format(data_id, e)) # continue # except ValueError: # pass # phoenix_HBase 插入数据 if list_data: try: count += 1 print(count) # print(list_data) success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=list_data) # 导出csv # pd.DataFrame(area_data).to_csv("E:\\NEWS_CLEAN_\\" + module_name+ ".csv") once_count += success_count self.success_count += success_count self.logger.info( "HBase 插入成功, 成功条数 {} 条".format( success_count)) if self.success_count % 50 == 0: update_count = m_client.update_to_mongodb( collection=collection, data_id=self.remove_id_list, data_dict={"d": 1}) self.remove_count += update_count self.logger.info("MongoDB 更新成功") except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue elif isinstance(re_data, dict): # try: # area_data = self.shuffle_for_area(re_data) # except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) # self.logger.warning("_id: {}获取经纬度失败, {}".format(data_id, e)) # continue # phoenix_HBase 插入数据 if re_data: try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=re_data) once_count += success_count self.success_count += success_count if self.success_count % 100 == 0: self.logger.info( "HBase 插入成功, 成功条数 {} 条".format( self.success_count)) # 添加 {d:1} if self.success_count % 50 == 0: update_count = m_client.update_to_mongodb( collection=collection, data_id=self.remove_id_list, data_dict={"d": 1}) self.remove_count += update_count self.logger.info("MongoDB 更新成功") except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if once_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count)) else: continue # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class WechatScript(object): def __init__(self, entity_type="WECHAT"): """ 初始化参数 :param entity_type: WECHAT """ self.entity_type = entity_type self.logger = Logger().logger # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name=self.entity_type) # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="WECHAT") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # # 创建 MongoDB spider_data_old 数据库对象 # self.old_client = MongoClient(mongo_collection="WECHAT") # # 本地测试 # self.old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60, # connectTimeoutMS=60, connect=False) # self.old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = self.old_client.client_to_mongodb() # self.collection_old = db_old["WECHAT"] # 创建 MySQL 对象 self.mysql_client = GenericScript(entity_code=None, entity_type=None) self.remove_id_list = list() self.copy_mongo_data_list = list() self.verify_list = [ "ID_", "ENTITY_CODE_", "URL_", "AREA_CODE_", "BANK_CODE_", "BANK_NAME_", "UNIT_CODE_", "PERIOD_CODE_", "CONTENT_", "CONTENT_TYPE_", "REMARK_", "CREATE_TIME_", "UPDATE_TIME_", "TITLE_", "ENTITY_NAME_", "DEALTIME_", "DATETIME_", "STATUS_", "WECHAT_NAME_", "WECHAT_ID_" ] # BANK_NAME_ 字典 self.name_dict = { "ICBC": "中国工商银行", "ABC": "中国农业银行", "BOC": "中国银行", "CCB": "中国建设银行", "BOCOM": "交通银行", "PSBC": "中国邮政储蓄银行", "CZB": "浙商银行", "CBHB": "渤海银行", "ECITIC": "中信银行", "CEB": "中国光大银行", "HXB": "华夏银行", "CMBC": "中国民生银行", "CMB": "招商银行", "CIB": "兴业银行", "CGB": "广发银行", "PAB": "平安银行", "SPDB": "浦发银行", "EBCL": "恒丰银行" } self.headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", "Accept-Language": "zh-CN,zh;q=0.9", "Host": "weixin.sogou.com", "Referer": "http://weixin.sogou.com/" } self.url = "http://weixin.sogou.com/weixin?type=1&query={}&ie=utf8&s_from=input&_sug_=y&_sug_type_=" self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def check_name(self, wechat_id): check_dict = dict() with open("wechat_id_name.txt", "r", encoding="utf-8") as rf: read_data = rf.read() if read_data: read_data = read_data.replace("\'", "\"") read_data = read_data.replace(": None", ": \"None\"") # print(read_data) check_dict = json.loads(read_data) else: wechat_name = self.req_for_name(wechat_id) check_dict[wechat_id] = wechat_name with open("wechat_id_name.txt", "w", encoding="utf-8") as wf: wf.write(str(check_dict)) return check_dict[wechat_id] if wechat_id in check_dict: return check_dict[wechat_id] else: wechat_name = self.req_for_name(wechat_id) check_dict[wechat_id] = wechat_name with open("wechat_id_name.txt", "w", encoding="utf-8") as wf: wf.write(str(check_dict)) return check_dict[wechat_id] def req_for_name(self, wechat_id): url = self.url.format(wechat_id) # response = WanDou().http_client(url=url, param=self.headers) resp1 = requests.get( url= r"http://h.wandouip.com/get/ip-list?pack=853&num=1&xy=1&type=2&lb=\r\n&mr=1&" ) resp2 = resp1.json()["data"][0] # print(resp2) # resp1.close() time.sleep(2) try: response = requests.get( url=url, headers=self.headers, proxies={"http": "{}:{}".format(resp2["ip"], resp2["port"])}) except Exception as e: print(1, e) self.logger.info("error ip: {}".format(resp2)) time.sleep(5) return self.req_for_name(wechat_id) html = HTML(response.content.decode()) # response.close() name = html.xpath('//p[@class="tit"]/a/text()') if name: # print(name) self.error_count = 0 return name[0] else: self.error_count += 1 if self.error_count == 5: self.logger.info("wetchat id error: \"{}\"".format(wechat_id)) return "None" else: time.sleep(2) self.req_for_name(wechat_id) # if response is None: # self.logger.info("ip_prox error") # return self.req_for_name(wechat_id) # if isinstance(response, str): # html = HTML(response) # name = html.xpath('//p[@class="tit"]/a/text()') # if name: # print(name) # return name[0] # else: # self.logger.info("ip_prox error2") # return self.req_for_name(wechat_id) # else: # self.logger.info("ip_prox error change") # return self.req_for_name(wechat_id) def data_shuffle(self, data, province_list, city_list, area_list): """ 数据清洗 :param data: :param province_list: :param city_list: :param area_list: :return: re_data or None """ # BANK_CODE_正则匹配规则 pattern = re.compile( r'ICBC|ABC|BOCOM|CCB|BOC|PSBC|CZB|CBHB|ECITIC|CEB|HXB|CMBC|CMB|CIB|CGB|PAB|SPDB|EBCL' ) re_data = dict() if data["TITLE_"]: # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" 通用列族字段 re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] # re_data["URL_"] = "" prov_c = None prov_n = None city_c = None city_n = None area_c = None area_n = None bank_n = None bank_c = pattern.match(data["ENTITY_CODE_"]) if bank_c: re_data["BANK_CODE_"] = bank_c.group() else: return None # 正则去除银行名称,方便匹配地区编码 bank_n = re.sub( r"{}银?行?|微信|[总分支]行".format( self.name_dict[re_data["BANK_CODE_"]][:-2]), "", data["ENTITY_NAME_"]) re_data["BANK_NAME_"] = self.name_dict[re_data["BANK_CODE_"]] re_data["PERIOD_CODE_"] = data["PERIOD_CODE_"].replace("-", "") re_data["NOTICE_TIME_"] = data["PERIOD_CODE_"] re_data["STATUS_"] = "1" re_data["CONTENT_"] = data["CONTENT_"] re_data["REMARK_"] = "" for area in area_list: if area["NAME_"] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] if area_c: pass else: for prov in province_list: if prov["NAME_"] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n, "") break elif prov["NAME_"][:-1] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:-1], "") break elif prov["NAME_"][:4] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:4], "") break elif prov["NAME_"][:3] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:3], "") break elif prov["NAME_"][:2] in bank_n: prov_c = prov["CODE_"] prov_n = prov["NAME_"] bank_n = bank_n.replace(prov_n[:2], "") break for city in city_list: if len(city["NAME_"]) == 1: continue if prov_c: if city["CODE_"][:2] == prov_c[:2]: if city["NAME_"] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n, "") break elif city["NAME_"][:-1] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:-1], "") break elif city["NAME_"][:4] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:4], "") break elif city["NAME_"][:3] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:3], "") break elif city["NAME_"][:2] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:2], "") break else: if city["NAME_"] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n, "") break elif city["NAME_"][:-1] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:-1], "") break elif city["NAME_"][:4] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:4], "") break elif city["NAME_"][:3] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:3], "") break elif city["NAME_"][:2] in bank_n: city_c = city["CODE_"] city_n = city["NAME_"] bank_n = bank_n.replace(city_n[:2], "") break for area in area_list: if city_c: if area["CODE_"][:2] == city_c[:2]: if area["NAME_"] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:-1] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:4] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:3] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:2] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif prov_c: if area["CODE_"][:2] == prov_c[:2]: if area["NAME_"] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:-1] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:4] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:3] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:2] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break else: if area["NAME_"][:-1] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:4] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:3] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break elif area["NAME_"][:2] in bank_n: area_c = area["CODE_"] area_n = area["NAME_"] break # 特殊情况 星子县现为庐山市 喻家山位于武汉洪山区 if "星子县" in data["ENTITY_NAME_"]: area_c = "360483" area_n = "庐山市" elif "喻家山" in data["ENTITY_NAME_"]: area_c = "420111" area_n = "洪山区" elif "江南西" in data["ENTITY_NAME_"]: area_c = "440105" area_n = "海珠区" elif "两路口" in data["ENTITY_NAME_"]: area_c = "500103" area_n = "渝中区" elif "大兴安岭" in data["ENTITY_NAME_"]: area_c = "232700" area_n = "大兴安岭地区" elif "张家港" in data["ENTITY_NAME_"]: area_c = "320582" area_n = "张家港市" elif "兴业银行新阳支行" in data["ENTITY_NAME_"]: area_c = "230102" area_n = "道里区" if area_c: pass elif (not area_c) and city_c: area_c = city_c area_n = city_n elif (not area_c) and (not city_c) and prov_c: area_c = prov_c area_n = prov_n # 总行地区处理 elif (not area_c) and (not city_c) and (not prov_c): if re_data["BANK_CODE_"] == "ICBC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "ABC": area_c = "110101" area_n = "东城区" elif re_data["BANK_CODE_"] == "BOCOM": area_c = "310115" area_n = "浦东新区" elif re_data["BANK_CODE_"] == "CCB": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "BOC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "PSBC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "CZB": area_c = "330103" area_n = "下城区" elif re_data["BANK_CODE_"] == "CBHB": area_c = "120103" area_n = "河西区" elif re_data["BANK_CODE_"] == "ECITIC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "CEB": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "HXB": area_c = "110101" area_n = "东城区" elif re_data["BANK_CODE_"] == "CMBC": area_c = "110102" area_n = "西城区" elif re_data["BANK_CODE_"] == "CMB": area_c = "440304" area_n = "福田区" elif re_data["BANK_CODE_"] == "CIB": area_c = "350102" area_n = "鼓楼区" elif re_data["BANK_CODE_"] == "CGB": area_c = "440104" area_n = "越秀区" elif re_data["BANK_CODE_"] == "PAB": area_c = "440303" area_n = "罗湖区" elif re_data["BANK_CODE_"] == "SPDB": area_c = "310101" area_n = "黄浦区" elif re_data["BANK_CODE_"] == "EBCL": area_c = "370602" area_n = "芝罘区" re_data["AREA_CODE_"] = area_c if area_c: re_data["UNIT_CODE_"] = re_data[ "BANK_CODE_"] + "_" + area_c[:4] + "00" if ("b" in data["BANK_NAME_"]) or ("B" in data["BANK_NAME_"]): return None if "DATETIME_" not in data: time_array = time.localtime(int(float(data["DEALTIME_"]))) value_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) re_data["CREATE_TIME_"] = value_time else: re_data["CREATE_TIME_"] = data["DATETIME_"] # data["UPDATE_TIME_"] = "" re_data["TITLE_"] = data["TITLE_"] re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"] re_data["WECHAT_ID_"] = data["WECHAT_"].replace(" ", "") re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["DEALTIME_"] = str(data["DEALTIME_"]) # print(area_c, area_n, data["ENTITY_NAME_"]) return re_data else: return None def delete_data_from_mongo(self): """ 从 MongoDB 删除数据 :return: delete_count """ try: remove_count = self.m_client.remove_from_mongo( collection=self.collection, remove_id_list=self.remove_id_list) return remove_count except pymongo.errors.ServerSelectionTimeoutError: mongo_data_list = self.m_client.remove_from_mongo( collection=self.collection, remove_id_list=self.remove_id_list) return mongo_data_list except Exception as e: self.logger.info(e) return None except KeyError as e: self.logger.info(e) return None def upsert_and_delete(self, mongo_data_list, province_list, city_list, area_list): """ 插入和删除 :param mongo_data_list: :param province_list: :param city_list: :param area_list: :return: """ for i in range(1000000): status = False self.data_id = "" success_count = 0 try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) # print(data["_id"]) # self.remove_id_list.append(self.data_id) # del data["_id"] # copy_data = deepcopy(data) # self.copy_mongo_data_list.append(copy_data) # 清洗数据 try: re_data = self.data_shuffle(data=data, province_list=province_list, city_list=city_list, area_list=area_list) except Exception as e: # self.remove_id_list.remove(self.data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # 获取公众号名称 # try: # print(re_data["WECHAT_ID_"]) re_data["WECHAT_NAME_"] = self.check_name( re_data["WECHAT_ID_"]) # re_data["WECHAT_NAME_"] = self.req_for_name(re_data["WECHAT_ID_"]) # print(re_data["WECHAT_ID_"]) # print(re_data["WECHAT_NAME_"]) # except Exception as e: # 向 HBase 插入数据 try: count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) success_count += count except jaydebeapi.DatabaseError as e: # self.logger.info("error: {}".format(e)) # self.remove_id_list.remove(self.data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # # Phoenix 连接关闭 # p_client.close_client_phoenix(connection=connection) # time.sleep(10) # # 连接 Phoenix # connection = p_client.connect_to_phoenix() # # 向 HBase 插入数据 # count = p_client.upsert_to_phoenix_by_one(connection=connection, data=re_data) # success_count += count # try: # # 添加 {d:1} # update_count = self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, # data_dict={"d": 1}) # self.remove_count += update_count # # self.logger.info("MongoDB 更新成功") # if self.remove_count % 10 == 0: # self.logger.info("MongoDB 更新成功, 成功条数 {} 条".format("10")) # except Exception as e: # # self.remove_id_list.remove(data_id) # # self.copy_mongo_data_list.remove(copy_data) # self.logger.warning("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e)) # continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 # self.remove_id_list.remove(self.data_id) # self.copy_mongo_data_list.remove(copy_data) continue # # 删除数据 # if status: # delete_count = self.delete_data_from_mongo() # self.remove_count += delete_count # self.logger.info("MongoDB 删除成功") # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # # 将数据插入 spider_data_old 中 # if status: # try: # self.old_client.mongo_db = "spider_data_old" # insert_count = self.old_client.all_to_mongodb(collection=self.collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except pymongo.errors.ServerSelectionTimeoutError as e: # time.sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = self.old_client.all_to_mongodb(collection=self.collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except Exception as e: # self.logger.info(e) def main(self): """ :return: """ # # 删除表 # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # 建表语句 # table_sql = ('create table "WECHAT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar,' # '"C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar,' # '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, ' # '"T"."CONTENT_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."TITLE_" varchar,' # '"C"."WECHAT_ID_" varchar, "C"."WECHAT_NAME_" varchar, "C"."ENTITY_NAME_" varchar,' # '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar, "C"."PRAISES_" varchar,' # '"C"."READ_NUM_" varchar, "C"."REPLIES_" varchar, "C"."RELAYS_" varchar,' # '"C"."NOTICE_TIME_" varchar, "C"."IMPROTANCE_" varchar) IMMUTABLE_ROWS = true') # # # 创建表 # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) # f_id = "5c1267258d7fee59f7d089f8" # gte 10M # f_id = "5c1271a28d7fee66df0fdd83" # gte 10M # f_id = "5c127e7b9bb3df7412b53b04" # gte 10M # f_id = "5c1330d28d7fee4d9c87d6e1" # gte 10M # f_id = "5c1330ed9bb3df2de33bb746" # gte 10M # f_id = "5c13490a8d7fee79f1d9e87f" # gte 10M # f_id = "5c1350ee8d7fee2d29b601ef" # gte 10M # f_id = "5c1351c79bb3df0e23ee68c1" # gte 10M # f_id = "5c13547d9bb3df06d41997d5" # gte 10M # f_id = "5c1354849bb3df202508ee3e" # gte 10M # f_id = "5c1354bd8d7fee44b881b11a" # gte 10M # f_id = "5c1354e89bb3df1b2a6ef59c" # gte 10M # f_id = "5c1355139bb3df197beb11c0" # gte 10M # f_id = "5c1355328d7fee2f0997a3ac" # gte 10M # f_id = "5c13558e8d7fee50ea04bd0a" # gte 10M # f_id = "5c135a5f8d7fee5bf7db91b8" # gte 10M # f_id = "5c135b0c8d7fee697fa5bd80" # gte 10M # f_id = "5c135bd59bb3df4d7aa66cad" # gte 10M # f_id = "5c135bdb9bb3df454c0157a3" # gte 10M # f_id = "5c135bfc8d7fee73c8f84567" # gte 10M # f_id = "5c135c119bb3df48aeb8fe63" # gte 10M # f_id = "5c135dfe9bb3df4d7aa66cc2" # gte 10M # f_id = "5c13602d8d7fee7f7a48c485" # gte 10M # f_id = "5c1361858d7fee223825f805" # gte 10M # f_id = "5c1361d68d7fee561806fc4d" # gte 10M # f_id = "5c1362068d7fee223825f808" # gte 10M # f_id = "5c1362159bb3df26bba60a05" # gte 10M # f_id = "5c1366248d7fee6741adb5be" # gte 10M # f_id = "5c1366418d7fee673f6c95cb" # gte 10M # f_id = "5c1367099bb3df5a0e013c4d" # gte 10M # f_id = "5c13686d8d7fee76ac78735b" # gte 10M # f_id = "5c1368788d7fee6fcb24daa3" # gte 10M # f_id = "5c1369438d7fee63412b04ff" # gte 10M # f_id = "5c13697b9bb3df60429b5d31" # gte 10M # f_id = "5c1389468d7fee6a94c413c3" # gte 10M # f_id = "5c1389c29bb3df75adc8861a" # gte 10M # f_id = "5c138b039bb3df75adc88620" # gte 10M # f_id = "5c138e3d9bb3df074c4ec0b3" # gte 10M # f_id = "5c138e4d8d7fee06a4f8fd59" # gte 10M # f_id = "5c1391318d7fee168749a96e" # gte 10M # f_id = "5c25a4f19bb3df51eba386b8" # gte 10M # f_id = "5c2601ef9bb3df7d42fe2084" # gte 10M # f_id = "5c2608099bb3df24f5db4527" # gte 10M # f_id = "5c2608be9bb3df2d58d08e32" # gte 10M # f_id = "5c260d2b9bb3df3c084d2a83" # gte 10M # f_id = "5c2615868d7fee2771bb3914" # gte 10M # f_id = "5c261d528d7fee3c1383db85" # gte 10M # f_id = "5c26340e8d7fee66d784fe8a" # gte 10M # f_id = "5c263b818d7fee630f0d3ac4" # gte 10M # f_id = "5c263ee28d7fee04ddc62e31" # gte 10M # f_id = "5c263f269bb3df0d29d1e1e5" # gte 10M # f_id = "5c2766718d7fee2aa36fa166" # gte 10M # f_id = "5c2b79ef8d7fee3025e02575" # gte 10M # f_id = "5c2b854a9bb3df27dc669d5a" # gte 10M # f_id = "5c2e00078d7fee1b60443cf3" # gte 10M # f_id = "5c2f69028d7fee62d31a72db" # gte 10M # f_id = "5c36a7948d7fee18d9333327" # gte 10M # f_id = "5c36b9ff9bb3df332dfebe39" # gte 10M # f_id = "5c3754579bb3df02b680150b" # gte 10M # f_id = "5c375c969bb3df6afd18e22d" # gte 10M # f_id = "5c38a1e59bb3df6b2ff2f269" # gte 10M # f_id = "5c394e058d7fee6a2582d1d3" # gte 10M # f_id = "5c3c983e9bb3df21ddf94a92" # gte 10M # f_id = "5c3ca38a9bb3df60bca07833" # gte 10M f_id = "5c3c983e9bb3df21ddf94a92" # f_id = "" self.data_id = f_id province_list, city_list, area_list, dir_area_list = self.mysql_client.area_from_mysql( ) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection, data_id=self.data_id) self.find_count += mongo_data_list.count() try: self.upsert_and_delete(mongo_data_list=mongo_data_list, province_list=province_list, city_list=city_list, area_list=area_list) except jaydebeapi.DatabaseError: self.logger.info("error id is: {}".format(self.data_id)) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection, data_id=self.data_id) self.upsert_and_delete(mongo_data_list=mongo_data_list, province_list=province_list, city_list=city_list, area_list=area_list) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
def run(self): # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST") p_client.verify_list = self.verify_list # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection(db=db, collection_list=collection_list) # 创建 MongoDB spider_data_old 数据库对象 # old_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST") # 本地测试 # old_client.client = pymongo.MongoClient(host="localhost", port=27017, # serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False) # old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = old_client.client_to_mongodb() # collection_old = db_old["FINPRODUCT_FINASSIST"] # # 删除表 # p_client.drop_table_phoenix(connection=connection) # # 表创建语句 # sql = ('create table "FINPRODUCT_FINASSIST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,' # '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, ' # '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "C"."STATUS_" varchar,' # '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."TIME_LIMIT_" varchar,' # '"C"."YIELD_RATE_" varchar, "C"."BREAKEVEN_" varchar, "C"."START_FUNDS_" varchar,' # '"C"."INVEST_PERIOD_" varchar, "C"."SALE_DISTRICT_" varchar, "C"."SALE_START_" varchar,' # '"C"."SALE_END_" varchar, "C"."RISK_LEVEL_" varchar, "C"."REDEMING_MODE_" varchar,' # '"C"."PRIVATE_BANK_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."CURRENCY_TYPE_" varchar, "C"."INCREASE_UNIT_" varchar,' # '"C"."YIELD_START_DATE_" varchar, "C"."YIELD_END_DATE_" varchar, "C"."YIELD_TYPE_" varchar,' # '"C"."TARGET_" varchar, "C"."PRODUCT_TYPE_" varchar, "C"."YIELD_STATMENT_" varchar,' # '"C"."INVEST_RANGE_" varchar, "C"."PRE_STOP_" varchar, "C"."RASE_PLAN_" varchar,' # '"C"."PURCHASE_" varchar, "T"."CONTENT_" varchar, "C"."IMAGE_" varchar) IMMUTABLE_ROWS = true') # # # 创建表 # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 增加列 # p_client.add_column_phoenix(connection=connection, column="IMAGE_") for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]: # for entity in self.entity_list: status = False module_name = __import__(entity) self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity)) self.remove_id_list = [] self.copy_mongo_data_list = [] # find_id = "5c3f118f8d7fee068da6ef53" find_id = None try: if entity == "JSFIN_CCBDATA": m_client.mongo_collection = "JSFIN_CCBDATA" collection = m_client.get_check_collection(db=db, collection_list=collection_list) mongo_data_list = module_name.ScriptCCB.get_data_from_mongo(self=self, m_client=m_client, collection=collection, data_id=None) else: m_client.mongo_collection = "FINPRODUCT_FINASSIST" collection = m_client.get_check_collection(db=db, collection_list=collection_list) mongo_data_list = self.get_data_from_mongo(m_client=m_client, collection=collection, entity_code=entity, data_id=find_id) except pymongo.errors.ServerSelectionTimeoutError: sleep(1) if entity == "JSFIN_CCBDATA": m_client.mongo_collection = "JSFIN_CCBDATA" mongo_data_list = module_name.ScriptCCB.get_data_from_mongo(self=self, m_client=m_client, collection=collection, data_id=None) else: m_client.mongo_collection = "FINPRODUCT_FINASSIST" collection = m_client.get_check_collection(db=db, collection_list=collection_list) mongo_data_list = self.get_data_from_mongo(m_client=m_client, collection=collection, entity_code=entity, data_id=find_id) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 self.find_count += mongo_data_list.count() for data in mongo_data_list: data_id = data["_id"] copy_data = {} self.remove_id_list.append(data_id) try: del data["_id"] copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) if entity == "CHINANETFINANCIAL": re_data = module_name.data_shuffle(data=data, sales_status=self.sales_status, produc_category=self.produc_category, revenue_type=self.revenue_type, operaton_pattern=self.operaton_pattern, purchase_amount=self.purchase_amount, duration_type=self.duration_type) elif entity == "JSFIN_CCBDATA": re_data = module_name.ScriptCCB.data_shuffle(self=self, data=data) else: re_data = module_name.data_shuffle(data) if not re_data: self.bad_count += 1 continue except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format(data_id, e)) continue print(data_id) # phoenix_HBase 插入数据 if isinstance(re_data, dict): try: success_count = p_client.upsert_to_phoenix_by_one(connection=connection, data=re_data) once_count += success_count self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(data) self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e)) continue elif isinstance(re_data, list): for r_data in re_data: try: success_count = p_client.upsert_to_phoenix_by_one(connection=connection, data=r_data) once_count += success_count self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(data) self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e)) continue if once_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count)) else: continue break # # 删除数据 # if status: # delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection, # entity_code=entity, # remove_id_list=self.remove_id_list) # self.remove_count += delete_count # # self.logger.info("MongoDB 删除成功") # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # 将数据插入 spider_data_old 中 # if status: # try: # old_client.mongo_db = "spider_data_old" # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except pymongo.errors.ServerSelectionTimeoutError as e: # sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except Exception as e: # self.logger.info(e) # 关闭连接 m_client.client_close() # p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.handlers.clear()
class WeiboBasicInfoUpdate(object): def __init__(self, table_name="CHA_BRANCH_WEIBO_BASIC", collection_name="WEIBOBASICINFO"): # phoenix connection self.p_client = PhoenixHbase(table_name=table_name) self.connection = self.p_client.connect_to_phoenix() # Mongo connection self.m_client = MongoClient(entity_code="CMBCMICROBLOG", mongo_collection=collection_name) self.mongo_host = "172.22.69.35" self.mongo_port = 20000 self.m_client.client = pymongo.MongoClient(host="172.22.69.35", port=20000, serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False) self.db, self.collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=self.db, collection_list=self.collection_list) # Log self.logger = Logger().logger def get_mongo_column_dict(self, collection, column1, column2): mon_logger = Logger().logger try: mon_logger.info("开始查取数据") result = collection.aggregate([{ "$project": { "_id": 0, column1: 1, column2: 1 } }]) return result except TypeError as e: mon_logger.error( "WEIBO_CODE_ 数据查取失败,错误信息为{}, 请检查匹配规则是否正确".format(e)) raise Exception("WEIBO_CODE_ 查取失败, 错误信息为{}".format(e)) finally: self.m_client.client.close() def main(self): mongo_data_list = self.get_mongo_column_dict( collection=self.collection, column1="WEIBO_CODE_", column2="FANS_") # update to hbase result_generator = self.p_client.search_all_from_phoenix( connection=self.connection, dict_status=True) while True: try: result = result_generator.__next__() for mongo_data in mongo_data_list: if mongo_data["WEIBO_CODE_"] == result["WEIBO_CODE_"]: result["FANS_"] = mongo_data["FANS_"] break except StopIteration: break self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=result) self.connection.close()
def judge(item): if not any([num in item for num in ['长城', '长城科技', '长城信息']]): return False else: return True if __name__ == '__main__': # run() pass import pandas as pd main_mongo = MongoClient(entity_code="", mongo_collection="CommonBidding") db, collection_list = main_mongo.client_to_mongodb() collection = main_mongo.get_check_collection( db=db, collection_list=collection_list) # mon_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'] mon_list = [ '08', '09', '10', '11', '12', # '06', ] for _ in range(13): try: data_list = collection.find( {
def run(self): # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="CommonBidding") # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="CommonBidding") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 MongoDB spider_data_old 数据库对象 old_client = MongoClient(mongo_collection="CommonBidding") # 本地测试 # old_client.client = pymongo.MongoClient(host="localhost", port=27017, # serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False) old_client.mongo_db = "spider_data_old" db_old, collection_list_old = old_client.client_to_mongodb() collection_old = db_old["CommonBidding"] # 删除表 # p_client.drop_table_phoenix(connection=connection) # 招投标表创建语句 # sql = ('create table "CommonBidding" ("ID_" varchar primary key, "F"."CONTENT_" varchar,' # '"F"."NOTICE_TIME_" varchar,"F"."TITLE_" varchar,"F"."PROJECT_NAME_" varchar,' # '"F"."BID_CONTENT_" varchar, "F"."SIGN_START_TIME_" varchar, "F"."SIGN_END_TIME_" varchar,' # '"F"."OPEN_BID_TIME_" varchar, "F"."OPEN_BID_PLACE_" varchar, "F"."BID_AGENCY_" varchar,' # '"F"."APPLY_CONDITION_" varchar, "F"."SIGN_QUALIFICATION_" varchar, "F"."PROJECT_ID_" varchar,' # '"F"."WIN_CANDIDATE_" varchar, "F"."CANDIDATE_RANK_" varchar, "F"."BID_" varchar,"F"."URL_" varchar,' # '"F"."DEALTIME_" varchar, "F"."ENTITY_NAME_" varchar, "F"."ENTITY_CODE_" varchar,' # '"F"."ENTITY_STATUS_" varchar, "F"."SIGN_MATERIAL_" varchar, "F"."BID_TYPE_" varchar,' # '"F"."DATETIME_" varchar, "F"."BUDGET_PRICE_" varchar, "F"."PASS_REASON_" varchar,' # '"F"."PRESALE_CONTENT_" varchar, "F"."PRESALE_WAY_" varchar,"F"."PRESALE_START_TIME_" varchar,' # '"F"."PRESALE_END_TIME_" varchar,"F"."PRESALE_ADDR_" varchar,"F"."PRESALE_PREPARE_" varchar,' # '"F"."IMAGE_" varchar) IMMUTABLE_ROWS = true') # 创建表 # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 增加列 # p_client.add_column_phoenix(connection=connection, column="IMAGE_") # 遍历 ENTITY_CODE_ 列表 # self.file_list = ["CommonBidding_86JCW"] for f in self.file_list: status = False entity_code = f.replace(".py", "") module_name = __import__(entity_code) entity_code_mongo = entity_code.replace("CommonBidding_", "") self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code_mongo)) self.remove_id_list = [] self.copy_mongo_data_list = [] try: mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code_mongo) except pymongo.errors.ServerSelectionTimeoutError: time.sleep(1) mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code_mongo) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 try: self.find_count += mongo_data_list.count() except pymongo.errors.ServerSelectionTimeoutError: time.sleep(1) self.find_count += mongo_data_list.count() for data in mongo_data_list: data_id = data["_id"] self.remove_id_list.append(data_id) del data["_id"] # 深拷贝源数据,用于插入 spider_data 库中 copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) # 数据清洗 try: re_data = module_name.data_shuffle(data) final_data = self.shuffle_data(re_data) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) continue # phoenix_HBase 插入数据 if final_data: try: p_client.upsert_to_phoenix_by_one( connection=connection, data=final_data) once_count += 1 except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if once_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count)) # 删除数据 if status: delete_count = self.delete_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code_mongo, remove_id_list=self.remove_id_list) self.remove_count += delete_count else: self.logger.info("HBase 插入成功条数0条, 不执行删除") # 将数据插入 spider_data_old 中 if status: try: old_client.mongo_db = "spider_data_old" insert_count = old_client.all_to_mongodb( collection=collection_old, insert_list=self.copy_mongo_data_list) self.old_count += insert_count # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) except pymongo.errors.ServerSelectionTimeoutError as e: time.sleep(1) self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) insert_count = old_client.all_to_mongodb( collection=collection_old, insert_list=self.copy_mongo_data_list) self.old_count += insert_count # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) except Exception as e: self.logger.info(e) # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(p_client.count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.handlers.clear()
def run(self): # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="ORGANIZE_FINASSIST") p_client.verify_list = self.verify_list # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # # 创建 MongoDB spider_data_old 数据库对象 # old_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST") # # 本地测试 # old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60, # connectTimeoutMS=60, connect=False) # old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = old_client.client_to_mongodb() # collection_old = db_old["ORGANIZE_FINASSIST"] # 获取地区编码 province_list, city_list, area_list, dir_area_list = (GenericScript( entity_code=None, entity_type="ORGANIZE_FINASSIST").area_from_mysql()) # # 删除表 # p_client.drop_table_phoenix(connection=connection) # # 创建表 # # 网点表创建语句 # sql = ('create table "ORGANIZE_FINASSIST" ("ID_" varchar primary key, "C"."BANK_NAME_" varchar,' # '"C"."BANK_CODE_" varchar, "C"."NAME_" varchar,' # '"C"."CODE_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,' # '"C"."AREA_CODE_" varchar, "C"."UNIT_CODE_" varchar, "C"."ADDR_" varchar,' # '"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar, "C"."CITY_" varchar,' # '"C"."CITY_CODE_" varchar, "C"."DISTRICT_NAME_" varchar, "C". "DISTRICT_CODE_" varchar,' # '"C"."LAT_" varchar, "C"."LNG_" varchar, "C"."CREATE_TIME_" varchar, "C"."DEALTIME_" varchar,' # '"C"."URL_" varchar, "C"."TEL_" varchar, "C"."BUSINESS_HOURS_" varchar, "C"."STATUS_" varchar,' # '"C"."IMPORTANCE_" varchar) IMMUTABLE_ROWS = true') # # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 遍历 ENTITY_CODE_ 列表 # self.code_list = ["ABCORGANIZE"] for entity_code in self.code_list: status = False module_name = __import__(entity_code) self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code)) self.remove_id_list = [] self.copy_mongo_data_list = [] self.branch_code_list = [] # find_id = "" if entity_code == "ECITICORGANIZE": find_id = "5c3f48479bb3df1d97d762e1" else: find_id = None try: mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code, data_id=find_id) except pymongo.errors.ServerSelectionTimeoutError: sleep(1) mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code, data_id=find_id) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 self.find_count = mongo_data_list.count() for data in mongo_data_list: data_id = data["_id"] # copy_data = {} # self.remove_id_list.append(data_id) try: del data["_id"] # copy_data = deepcopy(data) # self.copy_mongo_data_list.append(copy_data) re_data = module_name.data_shuffle( data, province_list, city_list, area_list) if not re_data: self.bad_count += 1 continue except Exception as e: # except jpype._jexception.SQLExceptionPyRaisable as e: # except org.apache.phoenix.exception.BatchUpdateExecution as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.exception("清洗错误,错误 _id 为{}, {}".format( data_id, e)) continue print(data_id) if isinstance(re_data, list): for list_data in re_data: area_data = "" try: # self.logger.info("_id {}".format(data_id)) area_data = self.shuffle_for_area(list_data) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.exception( "_id:{} 获取经纬度失败, {}".format(data_id, e)) continue # except ValueError: # pass # phoenix_HBase 插入数据 if area_data: try: # print(area_data) success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=area_data) once_count += success_count self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.exception( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue elif isinstance(re_data, dict): area_data = "" try: area_data = self.shuffle_for_area(re_data) except urllib3.exceptions.NewConnectionError as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.exception("_id: {}获取经纬度失败, {}".format( data_id, e)) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.exception("_id: {}获取经纬度失败, {}".format( data_id, e)) continue # phoenix_HBase 插入数据 if area_data: try: # print(area_data) success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=area_data) once_count += success_count self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.exception( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if self.success_count % 100 == 0: self.logger.info("HBase 插入成功, 成功条数 {} 条".format( self.success_count)) # 添加 {d:1} # if self.success_count % 50 == 0: # update_count = m_client.update_to_mongodb(collection=collection, # data_id=self.remove_id_list, # data_dict={"d": 1}) # self.remove_id_list = [] # self.remove_count += update_count # self.logger.info("MongoDB 更新成功") mongo_data_list.close() # 添加 {d:1} # if self.remove_id_list: # update_count = m_client.update_to_mongodb(collection=collection, # data_id=self.remove_id_list, # data_dict={"d": 1}) # self.remove_id_list = [] # self.remove_count += update_count # self.logger.info("MongoDB 更新成功") if once_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count)) else: continue # 删除数据 # if status: # delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection, # entity_code=entity_code, # remove_id_list=self.remove_id_list) # self.remove_count += delete_count # self.logger.info("MongoDB 删除成功") # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # 将数据插入 spider_data_old 中 # if status: # try: # old_client.mongo_db = "spider_data_old" # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except pymongo.errors.ServerSelectionTimeoutError as e: # sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except Exception as e: # self.logger.info(e) # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class HexunOpinion(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="HEXUNOPINION") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="SENTIMENT") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def data_shuffle(self, data): if ":" not in data["NOTICE_TIME_"]: return None re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] # re_data["AREA_CODE_"] # re_data["UNIT_CODE_"] period_code = data["NOTICE_TIME_"][:10].replace("-", "") re_data["PERIOD_CODE_"] = period_code re_data["CONTENT_"] = re.sub(r"本报告版权归和讯财经传播研究所所有,未经书面授权允许,不得复制转载。\|.*", "", data["CONTENT_"]) re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"][:10] re_data["STATUS_"] = "1" # re_data["REMARK_"] = "" re_data["CREATE_TIME_"] = data["DATETIME_"] # re_data["UPDATE_TIME_"] re_data["TITLE_"] = data["TITLE_"] re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] # re_data["DATETIME_"] = data["DATETIME_"] return re_data def run(self): # # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # create table sql # table_sql = ('create table "SENTIMENT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."REMARK_" varchar, "C"."PERIOD_CODE_" varchar,' # '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, ' # '"C"."SOURCE_" varchar, "C"."BRIEF_" varchar, "C"."IMAGE_" varchar, "C"."IMPORTANCE_" varchar,' # '"C"."TITLE_" varchar, "C"."URL_" varchar, "C"."NOTICE_TIME_" varchar,' # '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar, "C"."HOME_PAGE_" varchar) IMMUTABLE_ROWS = true') # # # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection) for i in range(mongo_data_list.count() + 100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) print(data["_id"]) # todo remove and upsert data from mongo # shuffle data try: re_data = self.data_shuffle(data=data) if i < 4: re_data["HOME_PAGE_"] = "Y" except Exception as e: self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # add {d:1} try: self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, data_dict={"d": 1}) self.remove_count += 1 if self.remove_count % 10 == 0: self.logger.info("MongoDB 更新成功, 成功条数 {}".format( self.remove_count)) except Exception as e: self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format( self.data_id, e)) continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class Meipian(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="meipian_CCBDATA") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="MEIPIAN_CCBDATA") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def data_shuffle(self, data): re_data = dict() if data["TITLE_"]: # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" 通用列族字段 re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] # re_data["BANK_NAME_"] time_arrary = arrow.get(data["CREATE_TIME"]) period_code = time_arrary.format("YYYYMMDD") publish_time = time_arrary.format("YYYY-MM-DD HH:mm:ss") re_data["PERIOD_CODE_"] = str(period_code) re_data["PUBLISH_TIME_"] = str(publish_time) re_data["STATUS_"] = "UNPROCESSED" re_data["CONTENT_"] = data["CONTENT_"] re_data["REMARK_"] = "" # re_data["AREA_CODE_"] # re_data["UNIT_CODE_"] re_data["CREATE_TIME_"] = data["DATETIME_"] re_data["URL_"] = data["URL_"] re_data["TITLE_"] = data["TITLE_"] re_data["CONTENT_TYPE_"] = data["CONTENT_TYPE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["DEALTIME_"] = str(data["DEALTIME_"]) re_data["VISIT_COUNT_"] = data["VISIT_COUNT"] re_data["PRAISE_COUNT_"] = data["PRAISE_COUNT"] re_data["COMMENT_COUNT_"] = data["COMMENT_COUNT"] re_data["SOURCE_"] = data["SOURCE_"] return re_data else: return None def run(self): # # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # # create table sql # table_sql = ('create table "MEIPIAN_CCBDATA" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."URL_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,' # ' "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, ' # '"C"."TITLE_" varchar, "C"."CONTENT_TYPE_" varchar, "C"."ENTITY_NAME_" varchar,' # '"C"."VISIT_COUNT_" varchar, "C"."PRAISE_COUNT_" varchar, "C"."COMMENT_COUNT_" varchar,' # '"C"."DEALTIME_" varchar, "C"."SOURCE_" varchar, "C"."PUBLISH_TIME_" varchar,' # '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true') # # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) # f_id = "5c6fa1328d7fee306de9463d" # quit() # f_id = "5c6fe1ba8d7fee1d44775989" # quit() # f_id = "5c6fdb448d7fee394da6a5fb" # quit() Exception while executing batch. # f_id = "5c6fe1ba8d7fee1d44775989" f_id = "5c6fe11b9bb3df6b0ec6168b" # gt 10M mongo_data_list = self.m_client.all_from_mongodb(self.collection, data_id=f_id) for i in range(mongo_data_list.count() + 100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) print(data["_id"]) # todo remove and upsert data from mongo # shuffle data try: re_data = self.data_shuffle(data=data) except Exception as e: self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # add {d:1} try: self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, data_dict={"d": 1}) self.remove_count += 1 if self.remove_count % 10 == 0: self.logger.info("MongoDB 更新成功, 成功条数 {}".format( self.remove_count)) except Exception as e: self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format( self.data_id, e)) continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
def run(self): # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="WEIBOBASICINFO") p_client.verify_list = self.verify_list # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="WEIBOBASICINFO") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # # 创建 MongoDB spider_data_old 数据库对象 # old_client = MongoClient(mongo_collection="WEIBOBASICINFO") # # 本地测试 # old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60, # connectTimeoutMS=60, connect=False) # old_client.mongo_db = "spider_data_old" # db_old, collection_list_old = old_client.client_to_mongodb() # collection_old = db_old["ORGANIZE_FINASSIST"] # 获取地区编码 province_list, city_list, area_list, dir_area_list = (GenericScript( entity_code=None, entity_type=None).area_from_mysql()) # 删除表 p_client.drop_table_phoenix(connection=connection) # quit() # 创建表 sql = ( 'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,' '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,' '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,' '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,' '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,' '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,' '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,' '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true') p_client.create_new_table_phoenix(connection=connection, sql=sql) # 增加列 # p_client.add_column_phoenix(connection=connection, column="IMAGE_") # 遍历 ENTITY_CODE_ 列表 status = False self.logger.info("开始进行 WEIBOBASICINFO") try: mongo_data_list = m_client.all_from_mongodb(collection=collection) except pymongo.errors.ServerSelectionTimeoutError: time.sleep(1) mongo_data_list = m_client.all_from_mongodb(collection=collection) # 清洗数据并插入 HBase if mongo_data_list: self.find_count = mongo_data_list.count() for data in mongo_data_list: re_data = "" data_id = data["_id"] copy_data = {} self.remove_id_list.append(data_id) try: del data["_id"] copy_data = deepcopy(data) self.copy_mongo_data_list.append(copy_data) re_data = self.data_shuffle(data=data, province_list=province_list) if not re_data: self.bad_count += 1 continue except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) # phoenix_HBase 插入数据 try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=re_data) self.success_count += success_count # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count)) except Exception as e: self.remove_id_list.remove(data_id) self.copy_mongo_data_list.remove(copy_data) self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if self.success_count > 0: status = True self.logger.info("HBase 插入成功, 成功条数 {}".format( self.success_count)) else: quit() # # 删除数据 # if status: # delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection, # remove_id_list=self.remove_id_list) # self.remove_count += delete_count # else: # self.logger.info("HBase 插入成功条数0条, 不执行删除") # # # 将数据插入 spider_data_old 中 # if status: # try: # old_client.mongo_db = "spider_data_old" # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # except pymongo.errors.ServerSelectionTimeoutError as e: # time.sleep(1) # self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e)) # insert_count = old_client.all_to_mongodb(collection=collection_old, # insert_list=self.copy_mongo_data_list) # self.old_count += insert_count # # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count)) # except Exception as e: # self.logger.info(e) # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class JsInsuranceCcbData(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="JSINSURANCE_CCBDATA") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 MySQL 对象 __mysql_config = { "host": MYSQL_HOST_25, "port": MYSQL_PORT_25, "database": MYSQL_DATABASE_25, "user": MYSQL_USER_25, "password": MYSQL_PASSWORD_25, "table": MYSQL_TABLE_25 } __mysql_client = MysqlClient(**__mysql_config) __mysql_connection = __mysql_client.client_to_mysql() self.type = __mysql_client.search_area_code( sql= "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'TYPE\'", connection=__mysql_connection) __mysql_client.close_client(connection=__mysql_connection) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="INSURANCE") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" self.a = list() def data_shuffle(self, data): if data["ENTITY_CODE_"] == "PAINSURANCE": return None elif data["ENTITY_CODE_"] == "BJBINSURANCE": data["CONTET_"] = data["CONTET_"].replace("|主险2:", "主险2:") first_shuffle = data["CONTET_"].split("|") data_list = list() company_dict = dict() index_list = list() for first in first_shuffle: if first[-2:] == "公司": company_index = first_shuffle.index(first) company_dict[company_index] = first index_list.append(company_index) else: continue for key in index_list: # print(index_list) j = key + 1 for i in range(100): if index_list.index(key) == len(index_list) - 1: if j == len(first_shuffle) - 1: break else: if j == index_list[index_list.index(key) + 1]: break data_dict = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(first_shuffle[j].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" data_dict["ID_"] = row_key data_dict["ENTITY_CODE_"] = data["ENTITY_CODE_"] data_dict["ENTITY_NAME_"] = data["ENTITY_NAME_"].replace( "模板", "产品") data_dict["BANK_CODE_"] = "BJB" data_dict["BANK_NAME_"] = "北京银行" data_dict["PERIOD_CODE_"] = data["DATETIME_"][:10].replace( "-", "") data_dict["URL_"] = data["URL_"] data_dict["PRODUCT_NAME_"] = first_shuffle[j] j += 1 # data_dict["TYPE_"] = first_shuffle[j] data_dict["TYPE_"] = "" data_dict["TYPE_CODE_"] = "" for i in self.type: if i["ITEM_LABEL_"][:-1] in first_shuffle[j]: data_dict["TYPE_"] = data_dict["TYPE_"] + i[ "ITEM_LABEL_"] + "|" data_dict["TYPE_CODE_"] = data_dict[ "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|" data_dict["TYPE_"] = data_dict["TYPE_"][:-1] data_dict["TYPE_CODE_"] = data_dict["TYPE_CODE_"][:-1] j += 1 # data_dict["RISK_LEVEL_"] = first_shuffle[j] j += 1 data_dict["PAY_METHOD_"] = first_shuffle[j] j += 1 # data_dict["INSURANCE_DATE_"] = first_shuffle[j] j += 1 # data_dict["TOUZIZHE_TYPE_"] = first_shuffle[j] j += 1 data_dict["COM_NAME_"] = company_dict[key] # data_dict["CONSIGNMENT_"] = "代销" # if "CONTENT_" in data: # data_dict["CONTENT_"] = data["CONTENT_"] data_dict["DEALTIME_"] = data["DEALTIME_"] data_dict["CREATE_TIME_"] = data["DATETIME_"] data_dict["STATUS_"] = "1" # print(data_dict) data_list.append(data_dict) return data_list elif data["ENTITY_CODE_"] == "CIBINSURANCE": data_list = list() insurance_name = re.findall(r".*?计划", data["PRODUCT_NAME_"]) for name in insurance_name: re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(name.encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" re_data["ID_"] = row_key re_data["PRODUCT_NAME_"] = name re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["BANK_CODE_"] = "CIB" re_data["BANK_NAME_"] = "兴业银行" re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace( "-", "") re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] re_data["CREATE_TIME_"] = data["DATETIME_"] re_data["STATUS_"] = "1" re_data["TYPE_"] = "" re_data["TYPE_CODE_"] = "" for i in self.type: if i["ITEM_LABEL_"][:-1] in name: re_data["TYPE_"] = re_data["TYPE_"] + i[ "ITEM_LABEL_"] + "|" re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"] + i[ "ITEM_VALUE_"] + "|" re_data["TYPE_"] = re_data["TYPE_"][:-1] re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1] data_list.append(re_data) return data_list else: if "INSURANCE_NAME_" not in data and ("PRODUCT_NAME_" not in data): return None else: if "INSURANCE_NAME_" in data: # # 承保年龄 # if ("INSURANCE_AGE_" not in data) or (not data["INSURANCE_AGE_"]): # age = re.findall(r"(\d*)周岁", data["INSURANCE_NAME_"]) # if age: # data["INSURANCE_AGE_"] = age[0] # 保障期限 # if ("INSURANCE_DATE_" not in data) or (not data["INSURANCE_DATE_"]): # limit = re.findall(r"保(终身)|保(\d*年)|(\d*年)期", data["INSURANCE_NAME_"]) # if limit: # for l in limit[0]: # if l: # data["INSURANCE_DATE_"] = l # break re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["INSURANCE_NAME_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace( "INSURANCE", "") re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace( "保险产品", "") if "INSURANCE_NAME_" in data: re_data["PRODUCT_NAME_"] = data["INSURANCE_NAME_"] if ("INSURANCE_AGE_" in data) or ("AGE_" in data): re_data["AGE_"] = data["INSURANCE_AGE_"] if "TYPE_" in data: re_data["TYPE_"] = "" re_data["TYPE_CODE_"] = "" if data["TYPE_"] == "财险": re_data["TYPE_"] = "财产险" re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE" else: for i in self.type: if i["ITEM_LABEL_"][:-1] in data["TYPE_"]: re_data["TYPE_"] = re_data["TYPE_"] + i[ "ITEM_LABEL_"] + "|" re_data["TYPE_CODE_"] = re_data[ "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|" re_data["TYPE_"] = re_data["TYPE_"][:-1] re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1] else: re_data["TYPE_"] = "" re_data["TYPE_CODE_"] = "" for i in self.type: if i["ITEM_LABEL_"][:-1] in data["ENTITY_NAME_"]: re_data["TYPE_"] = re_data["TYPE_"] + i[ "ITEM_LABEL_"] + "|" re_data["TYPE_CODE_"] = re_data[ "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|" re_data["TYPE_"] = re_data["TYPE_"][:-1] re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1] # if "INSURANCE_DATE_" in data: # re_data["INSURANCE_DATE_"] = data["INSURANCE_DATE_"] if "INSURANCE_DETAIL_" in data: re_data["PRODUCT_DETAIL_"] = data["INSURANCE_DETAIL_"] if "COMPANY_NAME_" in data: re_data["COM_NAME_"] = data["COMPANY_NAME_"] if "LIMIT_NUMBER_" in data: re_data["BUY_LIMIT_"] = data["LIMIT_NUMBER_"] # re_data["AREA_CODE_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace( "-", "") if "CONTENT_" in data: re_data["CONTENT_"] = data["CONTENT_"] # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"] re_data["STATUS_"] = "1" # re_data["REMARK_"] = "" re_data["CREATE_TIME_"] = data["DATETIME_"] # re_data["UPDATE_TIME_"] # re_data["TITLE_"] = data["TITLE_"] re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] # re_data["DATETIME_"] = data["DATETIME_"] return re_data elif "PRODUCT_NAME_" in data: re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["PRODUCT_NAME_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] # re_data["BANK_CODE_"] = data["ENTITY_CODE_"] # re_data["BANK_NAME_"] = data["ENTITY_NAME_"] if "PRODUCT_NAME_" in data: re_data["PRODUCT_NAME_"] = data["PRODUCT_NAME_"] if "FEATURE_NAME_" in data: re_data["FEATURE_NAME_"] = data["FEATURE_NAME_"] if "TYPE_" in data: re_data["TYPE_"] = "" re_data["TYPE_CODE_"] = "" if data["TYPE_"] == "财险": re_data["TYPE_"] = "财产险" re_data["TYPE_CODE_"] = "PROPERTY_INSURANCE" elif data["TYPE_"] == "100种疾病保障": re_data["TYPE_"] = "健康险" re_data["TYPE_CODE_"] = "HEALTH_INSURANCE" else: for i in self.type: if i["ITEM_LABEL_"][:-1] in data["TYPE_"]: re_data["TYPE_"] = re_data["TYPE_"] + i[ "ITEM_LABEL_"] + "|" re_data["TYPE_CODE_"] = re_data[ "TYPE_CODE_"] + i["ITEM_VALUE_"] + "|" re_data["TYPE_"] = re_data["TYPE_"][:-1] re_data["TYPE_CODE_"] = re_data["TYPE_CODE_"][:-1] if "POLICY_DUTY_" in data: re_data["POLICY_DUTY_"] = data["POLICY_DUTY_"] if "PRODUCT_CASE_" in data: re_data["PRODUCT_CASE_"] = data["PRODUCT_CASE_"] if "BUY_LIMIT_" in data: re_data["BUY_LIMIT_"] = data["BUY_LIMIT_"] if "ENSURE_PRICE_" in data: re_data["ENSURE_PRICE_"] = data["ENSURE_PRICE_"] # re_data["AREA_CODE_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace( "-", "") if "PRODUCT_PRICE_" in data: re_data["PRODUCT_PRICE_"] = data["PRODUCT_PRICE_"] if "PRODUCT_ID_" in data: re_data["PRODUCT_ID_"] = data["PRODUCT_ID_"] if "PRODUCT_CLAUSE_" in data: re_data["PRODUCT_CLAUSE_"] = data["PRODUCT_CLAUSE_"] if "GENDER_" in data: re_data["GENDER_"] = data["GENDER_"] if "AGE_" in data: re_data["AGE_"] = data["AGE_"] if "COM_NAME_" in data: re_data["COM_NAME_"] = data["COM_NAME_"] if "PAY_METHOD_" in data: re_data["PAY_METHOD_"] = data["PAY_METHOD_"] if "PROBLEM_" in data: re_data["PROBLEM_"] = data["PROBLEM_"] if "CLAIM_" in data: re_data["CLAIM_"] = data["CLAIM_"] if "COMMENT_" in data: re_data["COMMENT_"] = data["COMMENT_"] if "ENSURE_CONTENT_" in data: re_data["ENSURE_CONTENT_"] = data["ENSURE_CONTENT_"] if "INSURE_INFO_" in data: re_data["INSURE_INFO_"] = data["INSURE_INFO_"] if "RATE_INFO_" in data: re_data["RATE_INFO_"] = data["RATE_INFO_"] if "SALE_SERVICE_" in data: re_data["SALE_SERVICE_"] = data["SALE_SERVICE_"] # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"] re_data["STATUS_"] = "1" # re_data["REMARK_"] = "" re_data["CREATE_TIME_"] = data["DATETIME_"] # re_data["UPDATE_TIME_"] # re_data["TITLE_"] = data["TITLE_"] re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] # re_data["DATETIME_"] = data["DATETIME_"] return re_data def run(self): # # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # # create table sql # table_sql = ('create table "INSURANCE" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,' # ' "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, ' # '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,' # '"C"."TYPE_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C".PRODUCT_CLAUSE_ varchar,' # '"C"."SOURCE_" varchar, "C"."PRODUCT_NAME_" varchar, "C"."FEATURE_NAME_" varchar,' # '"C"."POLICY_DUTY_" varchar, "C"."PRODUCT_CASE_" varchar, "C"."BUY_LIMIT_" varchar,' # '"C"."ENSURE_PRICE_" varchar, "C"."PRODUCT_PRICE_" varchar, "C"."PRODUCT_ID_" varchar,' # '"C"."GENDER_" varchar, "C"."AGE_" varchar, "C"."COM_NAME_" varchar, "C"."TYPE_CODE_" varchar,' # '"C"."PAY_METHOD_" varchar, "C"."PRODUCT_DETAIL_" varchar, "C"."PROBLEM_" varchar,' # '"C"."CLAIM_" varchar, "C"."COMMENT_" varchar, "C"."STATUS_" varchar,' # '"C"."ENSURE_CONTENT_" varchar, "C"."INSURE_INFO_" varchar, "C"."RATE_INFO_" varchar,' # '"C"."SALE_SERVICE_" varchar) IMMUTABLE_ROWS = true') # # # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection) for i in range(mongo_data_list.count() + 100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) # print(data["_id"]) # todo remove and upsert data from mongo # shuffle data try: re_data = self.data_shuffle(data=data) except Exception as e: self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: if isinstance(re_data, dict): # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue elif isinstance(re_data, list): for r_d in re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=r_d) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # # add {d:1} # try: # self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, # data_dict={"d": 1}) # self.remove_count += 1 # if self.remove_count % 10 == 0: # self.logger.info("MongoDB 更新成功, 成功条数 {}".format(self.remove_count)) # except Exception as e: # self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e)) # continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class Entrust(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="JSENTRUST_CCBDATA") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="ENTRUST") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def data_shuffle(self, data): re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["NAME_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] re_data["CREATE_TIME_"] = data["DATETIME_"] re_data["STATUS_"] = "1" re_data["DEALTIME_"] = data["DEALTIME_"] re_data["URL_"] = data["URL_"] if data["ENTITY_CODE_"] == "CHINATRC": # "C" # re_data["AREA_CODE_"] # re_data["BANK_CODE_"] # re_data["BANK_NAME_"] # re_data["UNIT_CODE_"] pub_date = eval(data["PUB_DATE_"]) date = str(pub_date["time"])[:-3] t = arrow.get(int(date)) publish_date = str(t)[:10] period_code = publish_date.replace("-", "") re_data["PERIOD_CODE_"] = period_code # re_data["REMARK_"] # re_data["UPDATE_TIME_"] re_data["CODE_"] = data["CODE_"] re_data["NAME_"] = data["NAME_"] re_data["ISSUER_"] = data["ISSUER_"] re_data["FUNCTION_"] = data["FUNCTION_"] pro_date = eval(data["PRO_START_"]) pro_date = str(pro_date["time"])[:-3] p_t = arrow.get(int(pro_date)) product_date = str(p_t)[:10] re_data["PRO_START_"] = product_date re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"] re_data["RUN_MODE_"] = data["RUN_MODE_"] re_data["INDUSTRY_"] = data["INDUSTRY_"] re_data["PUB_DATE_"] = publish_date # re_data["SCALE_"] = data[""] # re_data["MONTH_"] # re_data["YIELD_RATE_"] # re_data["START_FUNDS_"] # re_data["PURPOSE_"] # re_data["ESTAB_ANNOUNCEMENT_"] # re_data["ENTRUST_STATUS_"] # # re_data["DISTRIBU_MODE_"] # re_data["INVEST_AREA_"] # re_data["TERM_TYPE_"] = data["TERM_TYPE_"] # re_data["INVEST_DIRECTION_"] # re_data["INVEST_MODE_"] = data["INVEST_MODE_"] # re_data["CURRENCY_"] # re_data["MANAGE_TYPE_"] # re_data["SALE_TARGET_"] # re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"] # re_data["ISSUER_AREA_"] # re_data["RESERVE_INFO_"] # re_data["TRUSTEESHIP_BANK_"] # re_data["OTHER_INFO_"] # re_data["OTHER_INFO_"] elif data["ENTITY_CODE_"] == "TRUSTHEXUN": # "C" # re_data["AREA_CODE_"] # re_data["BANK_CODE_"] # re_data["BANK_NAME_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "") # re_data["REMARK_"] # re_data["UPDATE_TIME_"] # re_data["CODE_"] = data["CODE_"] re_data["NAME_"] = data["NAME_"] re_data["ISSUER_"] = data["ISSUER_"] # re_data["FUNCTION_"] = data["FUNCTION_"] # re_data["PRO_START_"] re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace( "至月", "") # re_data["RUN_MODE_"] = data["RUN_MODE_"] re_data["INDUSTRY_"] = data["INDUSTRY_"] re_data["PUB_DATE_"] = data["PUB_DATE_"] re_data["SCALE_"] = data["SCALE_"] # re_data["MONTH_"] re_data["YIELD_RATE_"] = data["YIELD_RATE_"] re_data["START_FUNDS_"] = data["START_FUNDS_"] # re_data["PURPOSE_"] # re_data["ESTAB_ANNOUNCEMENT_"] # re_data["ENTRUST_STATUS_"] # # re_data["DISTRIBU_MODE_"] # re_data["INVEST_AREA_"] # re_data["TERM_TYPE_"] = data["TERM_TYPE_"] # re_data["INVEST_DIRECTION_"] re_data["INVEST_MODE_"] = data["INVEST_MODE_"] re_data["CURRENCY_"] = data["CURRENCY_"] re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"] re_data["SALE_TARGET_"] = data["SALE_TARGET_"] re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"] re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"] re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"] # re_data["TRUSTEESHIP_BANK_"] re_data["OTHER_INFO_"] = data["OTHER_INFO_"] elif data["ENTITY_CODE_"] == "YANGLEE": # "C" # re_data["AREA_CODE_"] # re_data["BANK_CODE_"] # re_data["BANK_NAME_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "") # # re_data["REMARK_"] # # re_data["UPDATE_TIME_"] # re_data["CODE_"] = data["CODE_"] re_data["NAME_"] = data["NAME_"] re_data["ISSUER_"] = data["ISSUER_"] # re_data["FUNCTION_"] = data["FUNCTION_"] # # re_data["PRO_START_"] re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"] # re_data["RUN_MODE_"] = data["RUN_MODE_"] re_data["INDUSTRY_"] = data["INDUSTRY_"] re_data["PUB_DATE_"] = data["PUB_DATE_"] # re_data["SCALE_"] = data["SCALE_"] # # re_data["MONTH_"] re_data["YIELD_RATE_"] = data["YIELD_RATE_"] re_data["START_FUNDS_"] = data["START_FUNDS_"] # # re_data["PURPOSE_"] # # re_data["ESTAB_ANNOUNCEMENT_"] re_data["ENTRUST_STATUS_"] = data["STATUS_"] # re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"] # # re_data["INVEST_AREA_"] re_data["TERM_TYPE_"] = data["TERM_TYPE_"] # # re_data["INVEST_DIRECTION_"] # re_data["INVEST_MODE_"] = data["INVEST_MODE_"] # re_data["CURRENCY_"] = data["CURRENCY_"] # re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"] # re_data["SALE_TARGET_"] = data["SALE_TARGET_"] # re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"] re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"] # re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"] re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"] re_data["OTHER_INFO_"] = data["OTHER_INFO_"] elif data["ENTITY_CODE_"] == "TRUSTONE": # "C" # re_data["AREA_CODE_"] # re_data["BANK_CODE_"] # re_data["BANK_NAME_"] # re_data["UNIT_CODE_"] re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "") # # re_data["REMARK_"] # # re_data["UPDATE_TIME_"] # re_data["CODE_"] = data["CODE_"] re_data["NAME_"] = data["NAME_"] re_data["ISSUER_"] = data["ISSUER_"] # re_data["FUNCTION_"] = data["FUNCTION_"] # # re_data["PRO_START_"] # re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"] # re_data["RUN_MODE_"] = data["RUN_MODE_"] # re_data["INDUSTRY_"] = data["INDUSTRY_"] re_data["PUB_DATE_"] = data["PUB_DATE_"] re_data["SCALE_"] = data["SCALE_"] # # re_data["MONTH_"] re_data["YIELD_RATE_"] = data["YIELD_RATE_"] re_data["START_FUNDS_"] = data["START_FUNDS_"] # # re_data["PURPOSE_"] # # re_data["ESTAB_ANNOUNCEMENT_"] # re_data["ENTRUST_STATUS_"] = data["STATUS_"] # # re_data["DISTRIBU_MODE_"] = data["DISTRIBU_MODE_"] re_data["INVEST_AREA_"] = data["INVEST_AREA_"] re_data["TERM_TYPE_"] = data["TERM_TYPE_"] re_data["INVEST_DIRECTION_"] = data["INVEST_DIRECTION_"] re_data["INVEST_MODE_"] = data["INVEST_MODE_"] # re_data["CURRENCY_"] = data["CURRENCY_"] # re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"] # re_data["SALE_TARGET_"] = data["SALE_TARGET_"] re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"] # re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"] re_data["RESERVE_INFO_"] = re.sub(r"</?\w*>", "", data["RESERVE_INFO_"]) # re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"] # re_data["OTHER_INFO_"] = data["OTHER_INFO_"] return re_data def run(self): # # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # # create table sql # table_sql = ('create table "ENTRUST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."CREATE_TIME_" varchar, "C"."STATUS_" varchar,' # '"C"."DEALTIME_" varchar, "C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."FUNCTION_" varchar,' # '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar,' # '"C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, "C"."UPDATE_TIME_" varchar,' # '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."ISSUER_" varchar, "C"."PRO_START_" varchar,' # '"C"."INVEST_PERIOD_" varchar,"C"."RUN_MODE_" varchar, "C"."INDUSTRY_" varchar,' # '"C"."PUB_DATE_" varchar, "C"."SCALE_" varchar, "C"."MONTH_" varchar, "C"."YIELD_RATE_" varchar,' # '"C"."START_FUNDS_" varchar, "C"."PURPOSE_" varchar, "C"."ESTAB_ANNOUNCEMENT_" varchar,' # '"C"."ENTRUST_STATUS_" varchar, "C"."DISTRIBU_MODE_" varchar, "C"."INVEST_AREA_" varchar,' # '"C"."TERM_TYPE_" varchar, "C"."INVEST_DIRECTION_" varchar, "C"."INVEST_MODE_" varchar,' # '"C"."CURRENCY_" varchar, "C"."MANAGE_TYPE_" varchar, "C"."SALE_TARGET_" varchar,' # '"C"."PROFIT_TYPE_" varchar, "C"."ISSUER_AREA_" varchar, "C"."RESERVE_INFO_" varchar,' # '"C"."TRUSTEESHIP_BANK_" varchar, "C"."OTHER_INFO_" varchar) IMMUTABLE_ROWS = true') # # # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection, data_id="5c67307d9bb3df76b4229f79") for i in range(mongo_data_list.count() + 100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) print(data["_id"]) # todo remove and upsert data from mongo # shuffle data # try: re_data = self.data_shuffle(data=data) # except Exception as e: # self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) # continue if re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # # add {d:1} # try: # self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, # data_dict={"d": 1}) # self.remove_count += 1 # if self.remove_count % 10 == 0: # self.logger.info("MongoDB 更新成功, 成功条数 {}".format(self.remove_count)) # except Exception as e: # self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format(self.data_id, e)) # continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class Trend(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="TREND") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="CHA_BRANCH_MARKET_ACT") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" def data_shuffle(self, data): re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # 分行 copy_result = dict() copy_result["ID_"] = row_key copy_result["ENTITY_CODE_"] = data["ENTITY_CODE_"] copy_result["ENTITY_NAME_"] = data["ENTITY_NAME_"] copy_result["URL_"] = data["URL_"] # copy_result["PROVINCE_CODE_"] = result[""] # copy_result["PROVINCE_NAME_"] = result[""] # copy_result["CITY_CODE_"] = result[""] # copy_result["CITY_NAME_"] = result[""] # copy_result["AREA_CODE_"] = result[""] # copy_result["AREA_NAME_"] = result[""] # copy_result["LAT_"] = result[""] # copy_result["LNG_"] = result[""] copy_result["APP_VERSION_"] = "BRANCH" copy_result["BANK_CODE_"] = data["ENTITY_CODE_"].replace( "PRIVATEINFO", "") copy_result["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "") # copy_result["UNIT_CODE_"] = result["UNIT_CODE_"] # copy_result["UNIT_NAME_"] = result[""] copy_result["PERIOD_CODE_"] = data["NOTICE_TIME_"].replace("-", "") # copy_result["REMARK_"] = result[""] time_array = time.localtime() create_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) copy_result["CREATE_TIME_"] = create_time copy_result["SPIDER_TIME_"] = data["DATETIME_"] # copy_result["MODIFIED_TIME_"] = result[""] copy_result["CREATE_BY_ID_"] = "P0131857" copy_result["CREATE_BY_NAME_"] = "钟楷文" # copy_result["MODIFIED_BY_ID_"] = result[""] # copy_result["MODIFIED_BY_NAME_"] = result[""] copy_result["M_STATUS_"] = "0" copy_result["DELETE_STATUS_"] = "0" copy_result["DATA_STATUS_"] = "uncheck" # copy_result["TAGS_"] = result[""] source = re.findall(r"(https?://.*?)/", data["URL_"]) copy_result["SOURCE_"] = source[0] copy_result["SOURCE_NAME_"] = data["ENTITY_NAME_"] # copy_result["SOURCE_TYPE_"] = result[""] # copy_result["HOT_"] = result[""] # copy_result["IMPORTANCE_"] = result[""] copy_result["ACT_NAME_"] = data["TITLE_"] # copy_result["IMAGES_"] = data[""] # copy_result["TARGET_"] = data[""] # copy_result["BRIEF_"] = data[""] copy_result["DETAILS_"] = data["CONTENT_"] # copy_result["RULE_"] = data[""] # copy_result["START_TIME_"] = data[""] # copy_result["END_TIME_"] = data[""] # copy_result["ACT_TYPE1_"] = data[""] # copy_result["ACT_TYPE2_"] = data[""] # copy_result["ACT_TYPE3_"] = data[""] copy_result["PUBLISH_TIME_"] = data["NOTICE_TIME_"] # copy_result["READS_"] = data[""] # copy_result["LIKES_"] = data[""] # copy_result["COMMENTS_"] = data[""] # copy_result["JOINS_"] = data[""] # copy_result["RELAYS_"] = data[""] # copy_result["SOURCE_ID_"] = data[""] # copy_result["HTML_"] = data[""] # copy_result["SOURCE_OWN_NAME_"] = data[""] # copy_result["SOURCE_OWN_ID_"] = data[""] return copy_result # "C" # re_data["ID_"] = row_key # re_data["TYPE_"] = random.choice( # ["税务法律", "子女教育", "健康医养", "财富管理", "生活娱乐", "旅游出行", "艺术/艺术品", "节日庆贺", "其他"]) # re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] # re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] # re_data["BANK_CODE_"] = data["ENTITY_CODE_"].replace("PRIVATEINFO", "") # re_data["BANK_NAME_"] = data["ENTITY_NAME_"].replace("私行动态", "") # # re_data["AREA_CODE_"] # # re_data["UNIT_CODE_"] # period_code = data["NOTICE_TIME_"].replace("-", "") # re_data["PERIOD_CODE_"] = period_code # re_data["CONTENT_"] = data["CONTENT_"] # re_data["NOTICE_TIME_"] = data["NOTICE_TIME_"] # re_data["STATUS_"] = "1" # # re_data["REMARK_"] = "" # re_data["CREATE_TIME_"] = data["DATETIME_"] # # re_data["UPDATE_TIME_"] # re_data["TITLE_"] = data["TITLE_"] # re_data["URL_"] = data["URL_"] # re_data["DEALTIME_"] = data["DEALTIME_"] # # re_data["DATETIME_"] = data["DATETIME_"] # # re_data["SOURCE_TYPE_"] # # return re_data def run(self): # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # quit() # add colum # self.p_client.add_column_phoenix(connection=self.connection, column="SOURCE_TYPE_") # quit() # create table sql # table_sql = ('create table "MARKETING_ACT" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."TITLE_" varchar,"C"."NOTICE_TIME_" varchar,' # '"T"."CONTENT_" varchar,"C"."OBJ_" varchar, "C"."ATENDANCE_" varchar, "C"."PERIOD_CODE_" varchar,' # '"C"."IMAGES_" varchar, "C"."RESULTS_" varchar,"C"."PLACE_" varchar, "C"."TYPE_" varchar,' # '"C"."READ_NUM_" varchar, "C"."CONTENT_NUM_" varchar, "C"."COMMENT_CONTENT_" varchar, ' # '"C"."FORWARD_NUM_" varchar, "C"."COLLECTION_NUM_" varchar, "C"."PRAISE_NUM_" varchar,' # '"C"."BANK_NAME_" varchar, "C"."STATUS_" varchar, "C"."REMARK_" varchar, "C"."SOURCE_ID_" varchar,' # '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,"C"."SOURCE_" varchar,' # '"C"."URL_" varchar, "C"."BANK_CODE_" varchar, "C"."DEALTIME_" varchar, ' # '"C"."SOURCE_TYPE_" varchar, "C"."IMPROTANCE_" varchar) IMMUTABLE_ROWS = true') # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection) # for i in range(mongo_data_list.count() + 100): for i in range(100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) print(data["_id"]) # todo remove and upsert data from mongo # shuffle data try: re_data = self.data_shuffle(data=data) except Exception as e: self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # add {d:1} try: self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, data_dict={"d": 1}) self.remove_count += 1 if self.remove_count % 10 == 0: self.logger.info("MongoDB 更新成功, 成功条数 {}".format( self.remove_count)) except Exception as e: self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format( self.data_id, e)) continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
def run(self): # # 创建 Phoenix 对象 p_client = PhoenixHbase(table_name="NEWS_FINASSIST") p_client.verify_list = self.verify_list # # 连接 Phoenix connection = p_client.connect_to_phoenix() # 创建 MongoDB 查询数据库对象 m_client = MongoClient(mongo_collection="NEWS_FINASSIST") db, collection_list = m_client.client_to_mongodb() collection = m_client.get_check_collection( db=db, collection_list=collection_list) # # 删除表 # p_client.drop_table_phoenix(connection=connection) # # # 表创建语句 # sql = ('create table "NEWS_FINASSIST" ("ID_" varchar primary key, "T"."CONTENT_" varchar, ' # '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."TITLE_" varchar, "C"."BRIEF_" varchar, ' # '"C"."PUBLISH_TIME_" varchar, "C"."KEYWORDS_" varchar, "C"."URL_" varchar, "C"."DATA_SOURCE_" varchar,' # '"C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,' # '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,' # '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "F"."STATUS_" varchar)' # 'IMMUTABLE_ROWS = true') # # 创建表 # p_client.create_new_table_phoenix(connection=connection, sql=sql) # 遍历 ENTITY_CODE_ 列表 for entity_code in self.code_list: status = False module_name = __import__(entity_code) self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code)) # self.remove_id_list = [] # self.copy_mongo_data_list = [] # self.branch_code_list = [] if entity_code == "CAIJINGNEWS": find_id = "5c6bfa508d7fee512a4ca68f" else: find_id = "" # find_id = "" try: mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code, find_id=find_id) except pymongo.errors.ServerSelectionTimeoutError: sleep(1) mongo_data_list = self.get_data_from_mongo( m_client=m_client, collection=collection, entity_code=entity_code, find_id=find_id) # 清洗数据并插入 HBase if mongo_data_list: once_count = 0 self.find_count = mongo_data_list.count() for i in range(1000000): try: data = mongo_data_list.__next__() except pymongo.errors.ServerSelectionTimeoutError: continue except StopIteration: break # for data in mongo_data_list: data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info( "running on data_id: {}".format(data_id)) # print(data_id) # copy_data = {} # self.remove_id_list.append(data_id) try: del data["_id"] # copy_data = deepcopy(data) # self.copy_mongo_data_list.append(copy_data) data_list = [data] re_data = module_name.data_shuffle(data_list) if not re_data: self.bad_count += 1 # self.remove_id_list.remove(data_id) continue except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.warning("清洗错误,错误 _id 为{}, {}".format( data_id, e)) continue if isinstance(re_data, list): for list_data in re_data: # phoenix_HBase 插入数据 if list_data: try: if entity_code != "CNINFONEWS": ai_data = self.get_brief_from_ai( data=list_data) else: ai_data = list_data # print(ai_data["CONTENT_"]) except Exception as e: self.logger.info("AI 调取失败, 错误信息", e) ai_data = re_data try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=ai_data) once_count += success_count self.success_count += success_count if self.success_count % 10 == 0: self.logger.info( "HBase 插入成功, 成功条数{}条".format( once_count)) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue try: # 添加 {d:1} update_count = m_client.update_to_mongodb( collection=collection, data_id=data_id, data_dict={"d": 1}) self.remove_count += update_count # self.logger.info("MongoDB 更新成功") if self.remove_count % 10 == 0: self.logger.info( "MongoDB 更新成功, 成功条数 {} 条".format( "10")) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "MongoDB 更新 _id 为 {} 的数据失败, {}".format( data_id, e)) continue elif isinstance(re_data, dict): # phoenix_HBase 插入数据 if re_data: try: success_count = p_client.upsert_to_phoenix_by_one( connection=connection, data=re_data) once_count += success_count self.success_count += success_count self.logger.info( "HBase 插入成功, 成功条数 {} 条".format( success_count)) except Exception as e: # self.remove_id_list.remove(data_id) # self.copy_mongo_data_list.remove(copy_data) self.logger.warning( "HBase 插入 _id 为 {} 的数据失败, {}".format( data_id, e)) continue if once_count > 0: status = True self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format( entity_code, once_count)) mongo_data_list.close() else: continue # 关闭连接 m_client.client_close() p_client.close_client_phoenix(connection=connection) self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class BaiduSearch(object): def __init__(self): # 创建 MongoDB 对象 self.m_client = MongoClient(mongo_collection="BAIDU_SEARCH") db, collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=db, collection_list=collection_list) # 创建 Phoenix 对象 self.p_client = PhoenixHbase(table_name="BAIDU_SEARCH") # 连接 Phoenix self.connection = self.p_client.connect_to_phoenix() self.logger = Logger().logger self.find_count = 0 self.success_count = 0 self.remove_count = 0 self.old_count = 0 self.bad_count = 0 self.error_count = 0 self.data_id = "" # BANK_NAME_ 字典 交通银行 BOCOM 改为 COMM 中信银行 ECITIC 改为 CITIC 增加 平安银行 北京银行 上海银行 self.name_dict = { "ICBC": "中国工商银行", "ABC": "中国农业银行", "BOC": "中国银行", "CCB": "中国建设银行", "COMM": "交通银行", "PSBC": "中国邮政储蓄银行", "CZB": "浙商银行", "CBHB": "渤海银行", "CITIC": "中信银行", "CEB": "中国光大银行", "HXB": "华夏银行", "CMBC": "中国民生银行", "CMB": "招商银行", "CIB": "兴业银行", "CGB": "广发银行", "PAB": "平安银行", "SPDB": "浦发银行", "EBCL": "恒丰银行", "PINGAN": "平安银行", "LTD": "中国光大银行", "BEIJING": "北京银行", "BOSC": "上海银行" } # TYPE_ 列表 self.type_list = [ "Market", "Activity", "GoodStart", "MidSeason", "PrivateBank", "Recommendation" ] def data_shuffle(self, data): re_data = dict() # HBase row_key hash_m = hashlib.md5() hash_m.update(data["TITLE_"].encode("utf-8")) hash_title = hash_m.hexdigest() row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title) # "C" re_data["ID_"] = row_key re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"] # re_data["AREA_CODE_"] for bank_c in self.name_dict: if bank_c in data["ENTITY_CODE_"]: re_data["BANK_CODE_"] = bank_c break if "BANK_CODE_" in re_data: re_data["BANK_NAME_"] = self.name_dict[re_data["BANK_CODE_"]] else: print(data["ENTITY_CODE_"]) # re_data["UNIT_CODE_"] # re_data["PERIOD_CODE_"] = "" re_data["CONTENT_"] = data["CONTENT_"] re_data["STATUS_"] = "UNPROCESSED" # re_data["REMARK_"] = "" re_data["CREATE_TIME_"] = data["DATETIME_"] # re_data["UPDATE_TIME_"] for type in self.type_list: if type in data["ENTITY_CODE_"]: re_data["TYPE_"] = type break re_data["TITLE_"] = data["TITLE_"] re_data["URL_"] = data["URL_"] re_data["DEALTIME_"] = data["DEALTIME_"] # re_data["DATETIME_"] = data["DATETIME_"] re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"] return re_data def run(self): # # delete table # self.p_client.drop_table_phoenix(connection=self.connection) # # quit() # # # create table sql # table_sql = ('create table "BAIDU_SEARCH" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,' # '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,"C"."REMARK_" varchar,' # ' "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "T"."CONTENT_" varchar, ' # '"C"."TITLE_" varchar, "C"."URL_" varchar, "C"."ENTITY_NAME_" varchar,"C"."TYPE_" varchar,' # '"C"."DEALTIME_" varchar, "C"."STATUS_" varchar) IMMUTABLE_ROWS = true') # # # create table # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) mongo_data_list = self.m_client.all_from_mongodb( collection=self.collection) for i in range(mongo_data_list.count() + 100): try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError as e: self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e)) time.sleep(3) data = mongo_data_list.__next__() self.data_id = data["_id"] if self.success_count % 100 == 0: self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id)) print(data["_id"]) # todo remove and upsert data from mongo # shuffle data try: re_data = self.data_shuffle(data=data) except Exception as e: self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id)) continue if re_data: # upsert data to HBase try: success_count = self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) except jaydebeapi.DatabaseError as e: self.logger.info("错误 id: {}, 错误信息 {}".format( self.data_id, e)) continue # add {d:1} try: self.m_client.update_to_mongodb(collection=self.collection, data_id=self.data_id, data_dict={"d": 1}) self.remove_count += 1 if self.remove_count % 10 == 0: self.logger.info("MongoDB 更新成功, 成功条数 {}".format( self.remove_count)) except Exception as e: self.logger.info("MongoDB 更新 _id 为 {} 的数据失败, {}".format( self.data_id, e)) continue if success_count > 0: status = True self.success_count += success_count if self.success_count % 10 == 0: self.logger.info("HBase 插入成功 {} 条".format( self.success_count)) else: self.bad_count += 1 continue mongo_data_list.close() self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count)) self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count)) self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count)) self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count)) self.logger.info("本次坏数据共 {} 条".format(self.bad_count)) self.logger.handlers.clear()
class MapBarTransfer(object): def __init__(self, table_name="CHA_BRANCH_MAPBAR", collection_name="mapbar"): # phoenix connection self.p_client = PhoenixHbase(table_name=table_name) self.connection = self.p_client.connect_to_phoenix() # MongoDB connection self.m_client = MongoClient(mongo_collection=collection_name, entity_code="MAPBAR_DEATAIL_BJ") self.m_client.mongo_host = "172.22.69.35" self.m_client.mongo_port = 20000 self.m_client.client = pymongo.MongoClient(host="172.22.69.35", port=20000, serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False) self.db, self.collection_list = self.m_client.client_to_mongodb() self.collection = self.m_client.get_check_collection( db=self.db, collection_list=self.collection_list) # Log self.logger = Logger().logger # count self.count = 0 def main(self): # # 创建表 # table_sql = (f'create table "{self.p_client.table_name}" ("ID_" varchar primary key,' # '"C"."BTYPE_" varchar, "C"."TYPE_" varchar, "C"."NAME_" varchar, "C"."UPDATETIME_" varchar,' # '"C"."ADDRESS_" varchar, "C"."POINAME_" varchar, "C"."PHONE_" varchar, "C"."BUSSTOP_" varchar,' # '"C"."BUS_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."LAT_" varchar, "C"."LNG_" varchar' # ') IMMUTABLE_ROWS = true') # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) # 获取数据 # mongo_data_list = self.m_client.all_from_mongodb(collection=self.collection) mongo_data_list = self.m_client.search_from_mongodb( collection=self.collection, field_name="DEALTIME_", field_value={"$gt": "1555136656.0579224"}, data_id="5cb65fac9bb3df61a09c6625") count = 0 while True: # 取一条处理 try: data = mongo_data_list.__next__() except StopIteration: break except pymongo.errors.ServerSelectionTimeoutError: time.sleep(3) data = mongo_data_list.__next__() # 清洗 try: data["PHONE_"] = data["PHONE_"].replace("无,", "") u_time_list = re.findall(r"(\d{4}年\d{1,2}月\d{1,2})日", data["UPDATETIME_"]) if u_time_list: u_ = u_time_list[0].replace("年", "-") u_ = u_.replace("月", "-") u_l = u_.split("-") if len(u_l[1]) == 1: u_l[1] = "0" + u_l[1] if len(u_l[2]) == 1: u_l[2] = "0" + u_l[2] data["UPDATETIME_"] = "-".join(u_l) except Exception as e: self.logger.exception(f"数据清洗出错, _id: {data['_id']}, error {e}") continue # 获取经纬度 try: if data["ADDRESS_"]: data["ADDRESS_"] = "".join(data["ADDRESS_"].split("|")[1:]) location_result = get_lat_lng(address=data["ADDRESS_"]) if location_result["status"] == 0: data["LNG_"] = str( location_result["result"]["location"]["lng"]) data["LAT_"] = str( location_result["result"]["location"]["lat"]) else: self.logger.warning(f"_id: {data['_id']} 获取经纬度失败") else: continue except Exception as e: self.logger.exception( f"_id: {data['_id']} 获取经纬度失败, error: {e}") continue # upsert to HBase try: re_data = self.__check_lat(data=data) # 向 HBase 中插入一条 self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=re_data) count += 1 if count % 100 == 0: self.logger.info( f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}") except Exception as e: self.logger.exception( f"HBase 插入失败, _id: {data['_id']}, error: {e}") continue # 关闭 MongoDB cursor mongo_data_list.close() self.logger.info( f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条" ) def check_lat(self): # # 删除表 # self.p_client.drop_table_phoenix(connection=self.connection, table_name="CHA_BRANCH_MAPBAR") # # table_sql = (f'create table "CHA_BRANCH_MAPBAR" ("ID_" varchar primary key,' # '"C"."BTYPE_" varchar, "C"."TYPE_" varchar, "C"."NAME_" varchar, "C"."UPDATETIME_" varchar,' # '"C"."ADDRESS_" varchar, "C"."POINAME_" varchar, "C"."PHONE_" varchar, "C"."BUSSTOP_" varchar,' # '"C"."BUS_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,' # '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."LAT_" varchar, "C"."LNG_" varchar,' # '"C"."CHECK_LAT_" varchar, "C"."CHECK_LNG_" varchar' # ') IMMUTABLE_ROWS = true') # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql) self.p_client.table_name = "FANSILE" data_cursor = self.p_client.search_all_from_phoenix( connection=self.connection, dict_status=True) self.p_client.table_name = "CHA_BRANCH_MAPBAR" while True: try: data = data_cursor.__next__() # del data["('C', 'CHECK_LNG_')"] # if not data["LAT_"]: # self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data) # continue # if 30.7083860773 < float(data["LAT_"]) < 31.8739003864: # pass # else: # self.logger.warning(f"错误 _id: {data['ID_']}, 经纬度: {data['LAT_']},{data['LNG_']}") # data["CHECK_LAT_"] = data["LAT_"] # data["CHECK_LNG_"] = data["LNG_"] # data["LAT_"] = "" # data["LNG_"] = "" # # self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data) # continue # if 120.8778122800 < float(data["LNG_"]) < 122.1248433443: # self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data) # continue # else: # self.logger.warning(f"错误 _id: {data['ID_']}, 经纬度: {data['LAT_']},{data['LNG_']}") # data["CHECK_LAT_"] = data["LAT_"] # data["CHECK_LNG_"] = data["LNG_"] # data["LAT_"] = "" # data["LNG_"] = "" # self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data) # continue self.p_client.upsert_to_phoenix_by_one( connection=self.connection, data=data) self.count += 1 if self.count % 100 == 0: self.logger.info( f"HBase 插入成功, _id: {data['_id']}, 成功条数 {self.count} 条") except StopIteration: break def __check_lat(self, data): if "LAT_" not in data: return data # 上海 # if 30.7083860773 < float(data["LAT_"]) < 31.8739003864: # 北京 if 39.4498800000 < float(data["LAT_"]) < 41.1684980000: pass else: self.logger.warning( f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}") data["CHECK_LAT_"] = data["LAT_"] data["CHECK_LNG_"] = data["LNG_"] data["LAT_"] = "" data["LNG_"] = "" return data # 上海 # if 120.8778122800 < float(data["LNG_"]) < 122.1248433443: # 北京 if 115.4534230000 < float(data["LNG_"]) < 117.5461160000: return data else: self.logger.warning( f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}") data["CHECK_LAT_"] = data["LAT_"] data["CHECK_LNG_"] = data["LNG_"] data["LAT_"] = "" data["LNG_"] = "" return data