def work(bbd_type):
    """qyxx worker loop: pull crawled HTML from the source queue, parse it
    with the type-specific handler, and persist results.

    :param bbd_type: business type name; selects the queue and the
                     ``<Type>Handler`` parser class.
    Runs forever; sleeps 10 seconds whenever the queue is empty.
    Raises: re-raises any exception from a processing iteration after
    archiving the offending record to the error table.
    """
    conf_file = "DBConfig.ini"
    src_db_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }
    des_db_dict = {
        'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'data_db', 'port')),
    }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type
    bbd_table = "qyxx_data"
    bbd_src_table = "qyxx_html"

    module_name = bbd_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name, package_name="Parser",
                                        pinyin=bbd_type.lower())

    normal_table = bbd_type + "_data"
    err_table = normal_table + "_error"
    # html_normal_table = bbd_type+"_src"

    # Destination DB stores parsed data; failed records go to the error
    # table on the source-side DB.
    des_db_inst = DBManager.getInstance(des_db_dict["type"], bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])
    err_db_inst = DBManager.getInstance(src_db_dict["type"], err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    fetch = Fetcher(queue_name, "qyxx",
                    get_db_dict=src_db_dict, save_db_dict=des_db_dict)  # debug

    while True:
        # BUGFIX: pre-bind these so the except handler never raises a
        # NameError (masking the real exception) when the failure happens
        # before fetch/parse assigned them.
        source_dict = None
        res_dict = None
        try:
            source_dict = fetch.get()
            if source_dict:
                # Copy seed info into the parsed record.
                # BUGFIX: default to {} so seed_dict is always bound even
                # when neither key is present; "in" replaces the Py2-only
                # has_key().
                seed_dict = {}
                if "bbd_seed" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if "BBD_SEED" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING, seed_dict))

                # fetch.backup()  # 避免进程异常退出引起的数据丢失
                res_dict = UniField.cloneNeedColumns(source_dict)
                res_dict.update(seed_dict)
                log.info("start to a new seed %s", seed_dict)

                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    # Success: save the parsed record, then archive the raw
                    # HTML alongside it.
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                    log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC,
                                      seed_dict))
                else:
                    # Parse failure: keep the raw HTML together with the
                    # parser output for later inspection.
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
                    log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO,
                                      seed_dict))
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            # Archive only when a record was actually fetched.
            if source_dict is not None:
                source_dict["data"] = res_dict
                err_db_inst.save(source_dict)
            # BUGFIX: bare raise preserves the original traceback instead
            # of wrapping it in a new Exception.
            raise
def work(self, pro_type):
    """qyxx worker loop (debug variant): pull crawled HTML, parse it with
    the type-specific handler, and save parsed/error records to a debug
    sink.

    :param pro_type: process/business type name; selects the queue and the
                     ``<Type>Handler`` parser class.
    Runs forever; sleeps 10 seconds whenever the queue is empty.
    Raises: re-raises any exception from a processing iteration.
    """
    conf_file = "DBConfig.ini"
    src_db_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }
    des_db_dict = {
        'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'data_db', 'port')),
    }

    from CommonLib.Logging import Logging
    log = Logging(name=pro_type)
    log.info("Process begin")

    pro_type = pro_type.lower()
    queue_name = pro_type
    module_name = pro_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name, package_name="Parser",
                                        pinyin=pro_type.lower())

    normal_table = pro_type + "_data"
    err_table = normal_table + "_error"

    # BUGFIX: db_inst and db_debug were used below while their
    # instantiation lines were commented out, so the first parsed record
    # crashed with NameError. Restored from the original commented-out
    # lines.
    db_inst = DBManager.getInstance(des_db_dict["type"], normal_table,
                                    host=des_db_dict["host"],
                                    port=des_db_dict["port"])  # 存 解析后数据
    debug_normal_table = "new_" + pro_type.lower() + "_data"
    # NOTE(review): hard-coded debug Mongo endpoint kept from the original
    # commented line (数据存本地,用于调试) — confirm before production use.
    db_debug = DBManager.getInstance("mongo", debug_normal_table,
                                     host="spider7", port=27037)

    fetch = Fetcher(queue_name, "qyxx",
                    get_db_dict=src_db_dict, save_db_dict=des_db_dict)  # debug

    while True:
        try:
            source_dict = fetch.get()
            if source_dict:
                # Copy seed info into the parsed record.
                # BUGFIX: default to {} so seed_dict is always bound; "in"
                # replaces the Py2-only has_key().
                seed_dict = {}
                if "bbd_seed" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if "BBD_SEED" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING, seed_dict))

                # fetch.backup()  # 避免进程异常退出引起的数据丢失
                res_dict = UniField.cloneNeedColumns(source_dict)
                log.info("start to a new seed %s", source_dict)

                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    db_inst.changeTable(normal_table)
                    res_dict = UniField.unifyParseResult(res_dict)
                    db_debug.save(res_dict)  # debug sink instead of db_inst
                    log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC,
                                      seed_dict))
                else:
                    db_inst.changeTable(err_table)
                    res_dict["html"] = source_dict
                    db_debug.save(res_dict)  # debug sink instead of db_inst
                    log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO,
                                      seed_dict))
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", pro_type)
                time.sleep(10)
        except Exception as e:
            # Consistency fix: log through the worker logger instead of a
            # bare print; bare raise preserves the original traceback.
            log.info(str(e))
            raise
def work(bbd_type):
    """xgxx worker loop: pull crawled pages from the source queue, parse
    them with the type-specific handler, and persist results.

    For ``bbd_type == 'jyyc'`` the worker cycles through one queue per
    province, draining each before moving on, then sleeps an hour and
    starts the next round. For every other type it drains a single queue,
    sleeping 10 seconds when it is empty.

    :param bbd_type: business type name; selects the queue(s) and the
                     ``<Type>Handler`` parser class.
    Raises: re-raises any exception from a processing iteration after
    archiving the offending record to the error table.
    """
    conf_file = "DBConfig.ini"
    src_db_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }
    des_db_dict = {
        'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'data_db', 'port')),
    }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type
    module_name = bbd_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name, package_name="xgxx",
                                        pinyin=bbd_type.lower())

    bbd_table = bbd_type + "_data"
    bbd_src_table = bbd_table + "_src"
    err_table = bbd_table + "_error"
    # html_normal_table = bbd_type+"_src"+"_nb"

    # Destination DB stores parsed data; failed records go to the error
    # table on the source-side DB.
    des_db_inst = DBManager.getInstance(des_db_dict["type"], bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])
    err_db_inst = DBManager.getInstance(src_db_dict["type"], err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    if bbd_type == 'jyyc':
        provinces = [
            'anhui', 'beijing', 'chongqing', 'fujian', 'gansu',
            'guangdong', 'guizhou', 'hainan', 'hebei', 'heilongjiang',
            'henan', 'hubei', 'hunan', 'jiangsu', 'jiangxi', 'jilin',
            'liaoning', 'neimenggu', 'ningxia', 'qinghai', 'shanghai',
            'shangxixian', 'sichuan', 'tianjin', 'xinjiang', 'xizang',
            'yunnan', 'zhejiang', 'zongju', 'shandong',
        ]
        while True:
            for province in provinces:
                jyyc_queue = 'jyyc_{}'.format(province)
                fetch = Fetcher(jyyc_queue, "xgxx",
                                get_db_dict=src_db_dict,
                                save_db_dict=des_db_dict)
                while True:
                    # BUGFIX: pre-bind so the except handler never raises a
                    # NameError (masking the real exception) when the
                    # failure happens before fetch/parse assigned them.
                    source_dict = None
                    res_dict = None
                    try:
                        source_dict = fetch.get()
                        if source_dict:
                            res_dict = UniField.cloneNeedColumns(source_dict)
                            res_dict = handler.parse(source_dict, res_dict,
                                                     province)
                            if res_dict["status"] == 0:
                                res_dict = UniField.unifyParseResult(
                                    res_dict, bbd_table=bbd_table)
                                des_db_inst.changeTable(bbd_table)
                                des_db_inst.save(res_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_table, str(des_db_inst.size()))
                                des_db_inst.changeTable(bbd_src_table)
                                des_db_inst.save(source_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_src_table,
                                         str(des_db_inst.size()))
                            else:
                                source_dict["data"] = res_dict
                                err_db_inst.save(source_dict)
                        else:
                            log.info(u"解析%s队列为空, 进入下一队列", jyyc_queue)
                            break
                    except Exception as e:
                        log.info(str(e))
                        if source_dict is not None:
                            source_dict["data"] = res_dict
                            err_db_inst.save(source_dict)
                        # BUGFIX: removed raw_input('ssss') debug leftover
                        # that blocked the unattended worker waiting for
                        # console input; bare raise keeps the traceback.
                        raise
            log.info(u'解析完一轮, 一个小时后进入下一轮')
            time.sleep(1 * 60 * 60)

    fetch = Fetcher(queue_name, "xgxx",
                    get_db_dict=src_db_dict, save_db_dict=des_db_dict)  # debug

    while True:
        # BUGFIX: pre-bind for the same NameError-in-except reason as above.
        source_dict = None
        res_dict = None
        try:
            source_dict = fetch.get()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                # log.info("start to a new seed %s",seed_dict)
                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            if source_dict is not None:
                source_dict["data"] = res_dict
                err_db_inst.save(source_dict)
            raise