def work(self, pro_type):
    """Parser worker loop for qyxx pages.

    Pops crawled html records from the source queue, parses them with the
    per-pinyin handler, and stores the parsed result (or the error record)
    into the destination store.  Runs forever; sleeps 10s when the queue
    is empty.

    :param pro_type: queue / handler name (pinyin), e.g. a province name
    """
    conf_file = "DBConfig.ini"
    # Source DB holds raw html; destination DB receives parsed data.
    src_db_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }
    des_db_dict = {
        'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'data_db', 'port')),
    }

    from CommonLib.Logging import Logging
    log = Logging(name=pro_type)
    log.info("Process begin")

    pro_type = pro_type.lower()
    queue_name = pro_type
    module_name = pro_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name, package_name="Parser",
                                        pinyin=pro_type.lower())

    normal_table = pro_type + "_data"
    err_table = normal_table + "_error"

    # BUGFIX: db_inst and db_debug were commented out but still referenced
    # below, raising NameError on the first parsed record.  Restore the
    # instantiations from the commented-out lines.
    db_inst = DBManager.getInstance(des_db_dict["type"], normal_table,
                                    host=des_db_dict["host"],
                                    port=des_db_dict["port"])  # parsed-data store
    debug_normal_table = "new_" + pro_type + "_data"
    # NOTE(review): debug sink is hard-coded to spider7/mongo (as in the
    # original commented line) — confirm before production use.
    db_debug = DBManager.getInstance("mongo", debug_normal_table,
                                     host="spider7", port=27037)

    fetch = Fetcher(queue_name, "qyxx",
                    get_db_dict=src_db_dict, save_db_dict=des_db_dict)

    while True:
        try:
            source_dict = fetch.get()
            if source_dict:
                # Copy the crawl-seed info into the parsed record.
                # BUGFIX: seed_dict was unbound when neither key existed.
                seed_dict = {}
                if "bbd_seed" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if "BBD_SEED" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING, seed_dict))

                res_dict = UniField.cloneNeedColumns(source_dict)
                log.info("start to a new seed %s", source_dict)
                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    db_inst.changeTable(normal_table)
                    res_dict = UniField.unifyParseResult(res_dict)
                    # Debug sink; switch to db_inst.save(res_dict) for production.
                    db_debug.save(res_dict)
                    log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC, seed_dict))
                else:
                    db_inst.changeTable(err_table)
                    res_dict["html"] = source_dict
                    db_debug.save(res_dict)
                    log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO, seed_dict))
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", pro_type)
                time.sleep(10)
        except Exception as e:
            # BUGFIX: was `print str(e)` + `raise Exception(e)`, which lost
            # the original traceback; log and re-raise instead.
            log.info(str(e))
            raise
def work(bbd_type):
    """Parser worker loop for qyxx annual-report (nianbao) pages.

    Pops crawled annual-report records from the `<bbd_type>_nbxx` hash
    queue, parses them with the per-pinyin Nb handler and stores parsed
    data plus original html into the destination store; failed records go
    to the error table.  Runs forever; sleeps 10s when the queue is empty.

    :param bbd_type: queue / handler name (pinyin)
    """
    conf_file = "DBConfig.ini"
    src_db_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }
    des_db_dict = {
        'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'data_db', 'port')),
    }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type
    nb_module_name = bbd_type.capitalize() + "Nb" + "Handler"
    nb_handler = ClassFactory.getClassInst(nb_module_name, package_name="Parser",
                                           pinyin=bbd_type.lower())

    bbd_table = "qyxx_data_nb"
    bbd_src_table = "qyxx_html_nb"
    normal_table = bbd_type + "_data" + "_nb"
    err_table = normal_table + "_error"

    des_db_inst = DBManager.getInstance(des_db_dict["type"], bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])  # parsed-data store
    err_db_inst = DBManager.getInstance(src_db_dict["type"], err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    fetch = Fetcher(queue_name + "_nbxx", "qyxx",
                    get_db_dict=src_db_dict, save_db_dict=des_db_dict)

    while True:
        # BUGFIX: reset per iteration so the except-handler below never
        # references unbound names or re-saves a stale record.
        source_dict = None
        res_dict = None
        try:
            source_dict = fetch.hget()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                if "year" in res_dict:
                    # Annual reports get a year-qualified _id.
                    res_dict["_id"] = UniField.updateId(res_dict['_id'],
                                                       res_dict['year'])
                res_dict = nb_handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict, bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                             bbd_table, str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                             bbd_src_table, str(des_db_inst.size()))
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            # BUGFIX: source_dict could be unbound here (e.g. exception
            # inside fetch.hget()), turning the handler itself into a
            # NameError; only save when we actually fetched a record.
            if source_dict is not None:
                source_dict["data"] = res_dict
                err_db_inst.save(source_dict)
            # Re-raise the original exception, keeping its traceback.
            raise
def work(bbd_type):
    """Parser worker loop for xgxx pages.

    For bbd_type == 'jyyc' it rotates through one queue per province,
    parsing each until empty, then sleeps an hour and starts the next
    round (that branch never returns).  For every other bbd_type it
    consumes a single queue forever, sleeping 10s when it is empty.
    Parsed data and original html go to the destination store; failures
    go to the error table.

    :param bbd_type: queue / handler name (pinyin)
    """
    conf_file = "DBConfig.ini"
    src_db_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }
    des_db_dict = {
        'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'data_db', 'port')),
    }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type
    module_name = bbd_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name, package_name="xgxx",
                                        pinyin=bbd_type.lower())

    bbd_table = bbd_type + "_data"
    bbd_src_table = bbd_table + "_src"
    err_table = bbd_table + "_error"

    des_db_inst = DBManager.getInstance(des_db_dict["type"], bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])  # parsed-data store
    err_db_inst = DBManager.getInstance(src_db_dict["type"], err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    if bbd_type == 'jyyc':
        # jyyc data is sharded into one queue per province; drain each in turn.
        while True:
            for province in [
                    'anhui', 'beijing', 'chongqing', 'fujian', 'gansu',
                    'guangdong', 'guizhou', 'hainan', 'hebei', 'heilongjiang',
                    'henan', 'hubei', 'hunan', 'jiangsu', 'jiangxi', 'jilin',
                    'liaoning', 'neimenggu', 'ningxia', 'qinghai', 'shanghai',
                    'shangxixian', 'sichuan', 'tianjin', 'xinjiang', 'xizang',
                    'yunnan', 'zhejiang', 'zongju', 'shandong']:
                jyyc_queue = 'jyyc_{}'.format(province)
                fetch = Fetcher(jyyc_queue, "xgxx",
                                get_db_dict=src_db_dict, save_db_dict=des_db_dict)
                while True:
                    # BUGFIX: reset per iteration so the except-handler never
                    # references unbound names.
                    source_dict = None
                    res_dict = None
                    try:
                        source_dict = fetch.get()
                        if source_dict:
                            res_dict = UniField.cloneNeedColumns(source_dict)
                            res_dict = handler.parse(source_dict, res_dict, province)
                            if res_dict["status"] == 0:
                                res_dict = UniField.unifyParseResult(res_dict,
                                                                     bbd_table=bbd_table)
                                des_db_inst.changeTable(bbd_table)
                                des_db_inst.save(res_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_table, str(des_db_inst.size()))
                                des_db_inst.changeTable(bbd_src_table)
                                des_db_inst.save(source_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_src_table, str(des_db_inst.size()))
                            else:
                                source_dict["data"] = res_dict
                                err_db_inst.save(source_dict)
                        else:
                            log.info(u"解析%s队列为空, 进入下一队列", jyyc_queue)
                            break
                    except Exception as e:
                        log.info(str(e))
                        # BUGFIX: removed leftover raw_input('ssss') debug
                        # prompt that blocked the process on stdin, and
                        # guarded against an unbound source_dict.
                        if source_dict is not None:
                            source_dict["data"] = res_dict
                            err_db_inst.save(source_dict)
                        raise
            log.info(u'解析完一轮, 一个小时后进入下一轮')
            time.sleep(1 * 60 * 60)

    fetch = Fetcher(queue_name, "xgxx",
                    get_db_dict=src_db_dict, save_db_dict=des_db_dict)
    while True:
        source_dict = None
        res_dict = None
        try:
            source_dict = fetch.get()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict, bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                             bbd_table, str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                             bbd_src_table, str(des_db_inst.size()))
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            # BUGFIX: guard against unbound source_dict (see above).
            if source_dict is not None:
                source_dict["data"] = res_dict
                err_db_inst.save(source_dict)
            raise
def work(bbd_type, value_list=None):
    """Crawler driver for qyxx company-info spiders.

    Instantiates the per-pinyin crawler class with `storeResult` as its
    data callback, then feeds it either the manual `value_list` keyword
    lists (debug path) or seeds popped from the seed queue forever.

    :param bbd_type: crawler module / queue name (pinyin)
    :param value_list: optional list of keyword lists for manual runs
    """
    conf_file = "DBConfig.ini"
    db_conf_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }

    def getNbxxDict(src_dict):
        # Pop every annual-report ("qynb_<year>") entry out of src_dict and
        # return them as a list of single-key dicts.  (Comprehensions instead
        # of py2-only list-returning filter/map.)
        nbxx_key_list = [k for k in list(src_dict.keys()) if k.startswith("qynb_")]
        return [{k: src_dict.pop(k)} for k in nbxx_key_list]

    def getYear(nb_dict):
        # nb_dict is a single-entry {"qynb_<year>": ...} dict.
        key = list(nb_dict.keys())[0]
        return key.split("_")[1]

    def storeResult(src_dict, company_dict=None):
        """
        Callback invoked by the crawler to store fetched data into ssdb.

        Splits annual-report entries into their own year-keyed hash rows,
        then saves the main record; anything that fails lands in the
        error table.
        """
        # BUGFIX: rowkey must exist before the except-handler below logs it,
        # otherwise the handler itself raised NameError.
        rowkey = None
        try:
            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if "rowkey" in src_dict:
                    rowkey = src_dict["rowkey"]
                    nbxx_list = getNbxxDict(src_dict)
                    nb_year_list = []  # years reported to the solr interface
                    for nb_item in nbxx_list:
                        # Split each annual report into its own row with a
                        # year-suffixed rowkey, stored in the nbxx hash.
                        year = getYear(nb_item)
                        nb_year_list.append(year)
                        nbxx_dict = UniField.cloneNeedColumns(src_dict)
                        nbxx_dict.update({"bbd_seed": bbd_seed_dict})
                        nbxx_dict.update(nb_item)
                        db_inst.changeTable(bbd_type + "_nbxx")
                        nb_rk = rowkey + "|_|" + year
                        nbxx_dict["rowkey"] = nb_rk
                        nbxx_dict["year"] = year
                        db_inst.hset(nb_rk, nbxx_dict)
                        log.info(u"存储 %s 年年报 成功,rowkey 为 [ %s ]", year, nb_rk)
                    zch = src_dict["rowkey_dict"]["company_zch"]
                    company_name = src_dict["rowkey_dict"]["company_name"]
                    log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict))
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                    log.info(u" ,rowkey 为 [ %s ]", rowkey)
                    # Notify the nbxx solr/api layer about the stored years.
                    NbxxApiControler().nbUpdate(company_name=company_name,
                                                pinyin=bbd_type, zch=zch,
                                                years_list=nb_year_list)
                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)
            log.info(u"存储抓取网页原文 失败,rowkey 为 [ %s ]", rowkey)

    def crawlerKeyWordList(keyword_list):
        """
        Try each keyword in turn; stop at the first success.  If the last
        keyword also fails, persist the seed (with its failure status) back
        to ssdb.
        """
        try:
            keyword_num = len(keyword_list)
            for keyword in keyword_list:
                keyword_num -= 1  # decrement first so the last failure hits the else-branch
                seed_status = inst.crawl(keyword)
                if seed_status.access_type == SeedAccessType.OK:
                    log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict))
                    log.info(u"种子抓取成功:)")
                    break
                elif keyword_num > 0:
                    # More keywords left to try.
                    log.info(u"种子抓取失败,关键字 [%s]", keyword)
                    continue
                else:
                    # Last keyword failed: record the failed seed.
                    seed.update(status=seed_status.access_type)
                    log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict))
                    log.info(u"种子抓取失败,存储到队列,种子状态为 %s", str(seed_status))
                    seed.save()
        except Exception as e:
            log.info(str(e))
            raise Exception(u"种子抓取过程中遇到异常")

    ##########################################################################
    # BUGFIX: seed is only created on the queue-driven path; pre-bind it so
    # the outer except-handler can safely test it on the value_list path.
    seed = None
    try:
        from CommonLib.Logging import Logging
        log = Logging(name=bbd_type)
        log.info("Process begin for %s,logger=%s", bbd_type, str(log))

        module_name = "Crawler" + bbd_type.capitalize()
        bbd_type = bbd_type.lower()
        inst = ClassFactory.getClassInst(module_name, package_name="qyxx_all",
                                         pinyin=bbd_type,
                                         callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance(db_conf_dict["type"], bbd_type,
                                        host=db_conf_dict["host"],
                                        port=db_conf_dict["port"])
        bbd_seed_dict = {}
        if value_list:
            # Manual/debug run: crawl the supplied keyword lists once.
            for keywd_list in value_list:
                crawlerKeyWordList(keywd_list)
        else:
            # Normal run: pull seeds from the seed queue forever.
            seed = Seed(bbd_type)
            while True:
                seed.get()
                bbd_seed_dict = seed.getDict()
                log.info("starting a new seed %s",
                         get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict))
                if seed.url_status:
                    # Prefer crawling by url when the seed has one.
                    seed_status = inst.crawlUrl(seed.url, seed.name)
                    if seed_status.access_type == SeedAccessType.OK:
                        log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict))
                    else:
                        # Url crawl failed: fall back to the keyword list.
                        log.info(" Url get company info failed [%s]", bbd_type)
                        crawlerKeyWordList(seed.values)
                else:
                    crawlerKeyWordList(seed.values)
    except Exception as e:
        log.info(str(e))
        if seed is not None:  # BUGFIX: seed was unbound on the value_list path
            seed.save()
        raise Exception(e)
def work(bbd_type, need_seed=True, value_list=None):
    """Outer control loop for xgxx crawlers:

    1. instantiate the crawler class,
    2. open the DB connections,
    3. fetch seeds,
    4. store data returned by the crawler (via the storeResult callback),
    5. store seeds that failed to crawl.

    :param bbd_type: crawler queue name; also selects the crawler module
    :param need_seed: when False, run the crawler once without seeds
    :param value_list: seed keyword lists for manual/debug runs
    """
    conf_file = "DBConfig.ini"
    db_conf_dict = {
        'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'html_db', 'port')),
    }
    seed_db_dict = {
        'type': confGetterFunc(conf_file, 'seed_db', 'type').lower(),
        'host': confGetterFunc(conf_file, 'seed_db', 'host').lower(),
        'port': int(confGetterFunc(conf_file, 'seed_db', 'port')),
    }

    def storeResult(src_dict, company_dict=None):
        """
        Callback invoked by the crawler to store fetched data into ssdb.
        Successful records go to `table_name` (if given) or the bbd_type
        table; failures go to the error table.
        """
        # BUGFIX: removed dead `queue_name = src_dict["bbd"]` — it checked
        # key "bbd_tmp_queue" but read key "bbd" (KeyError risk) and the
        # value was never used.
        try:
            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if "rowkey" in src_dict:
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    if 'table_name' in src_dict:
                        # Record carries its own destination table.
                        db_inst.changeTable(src_dict.get('table_name'))
                    else:
                        db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)

    def crawlerKeyWordList(keyword_list):
        """
        Try each keyword in turn; stop at the first success.  If every
        keyword fails, record the failure.

        BUGFIX: the counter was decremented *after* the `keyword_num > 0`
        check, so the final else-branch was unreachable and a fully-failed
        seed was never recorded.  Decrement first, matching the qyxx
        variant of this helper.
        """
        keyword_num = len(keyword_list)
        for keyword in keyword_list:
            keyword_num -= 1
            seed_status = inst.crawl(keyword)
            if seed_status.access_type == SeedAccessType.OK:
                log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict))
                break
            elif keyword_num > 0:
                # More keywords left to try.
                log.info("Use Key word [%s] get company failed", keyword)
                continue
            else:
                # Last keyword failed.  NOTE(review): the original called
                # seed.update()/seed.save() here, but no `seed` object is
                # ever created in this function (latent NameError hidden by
                # the unreachable branch); log the failure instead — confirm
                # whether the seed should also be persisted somewhere.
                log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict))

    ##########################################################################
    try:
        from CommonLib.Logging import Logging
        log = Logging(name=bbd_type)
        log.info("Process begin for %s,logger=%s", bbd_type, str(log))

        module_name = "Crawler" + bbd_type.capitalize()
        bbd_type = bbd_type.lower()
        inst = ClassFactory.getClassInst(module_name, package_name="xgxx",
                                         pinyin=bbd_type,
                                         callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance(db_conf_dict["type"], bbd_type,
                                        host=db_conf_dict["host"],
                                        port=db_conf_dict["port"])
        if not need_seed:
            # Seedless crawlers run exactly one crawl pass.
            inst.crawl()
        else:
            bbd_seed_dict = {}
            if value_list:
                # Manual/debug run with supplied keyword lists.
                for keywd_list in value_list:
                    crawlerKeyWordList(keywd_list)
            else:
                # Normal run: pull seed dicts from the seed DB forever.
                seed_db_inst = DBManager.getInstance(seed_db_dict["type"], bbd_type,
                                                     host=seed_db_dict["host"],
                                                     port=seed_db_dict["port"])
                while True:
                    bbd_seed_dict = seed_db_inst.get()
                    seed_status = inst.crawl(bbd_seed_dict)
                    if seed_status.access_type == SeedAccessType.OK:
                        log.info(get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict))
                    else:
                        # Crawl failed: park the seed in the error queue.
                        log.info(u"种子抓取失败,存取到相应队列 [%s]", bbd_type)
                        seed_db_inst.changeTable(bbd_type + "_error")
                        seed_db_inst.save(bbd_seed_dict)
    except Exception as e:
        # Re-raise with the original traceback instead of wrapping it
        # (the original `raise Exception(e)` discarded the traceback).
        raise