示例#1
0
def work(bbd_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type
    bbd_table = "qyxx_data"
    bbd_src_table = "qyxx_html"
    module_name = bbd_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name,
                                        package_name="Parser",
                                        pinyin=bbd_type.lower())

    normal_table = bbd_type + "_data"
    err_table = normal_table + "_error"
    # html_normal_table = bbd_type+"_src"

    des_db_inst = DBManager.getInstance(des_db_dict["type"],
                                        bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])  #存 解析后数据
    err_db_inst = DBManager.getInstance(src_db_dict["type"],
                                        err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    fetch = Fetcher(queue_name,
                    "qyxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug

    while True:
        try:
            source_dict = fetch.get()
            if source_dict:
                # 拷贝 种子信息到解析后的数据里面
                if source_dict.has_key("bbd_seed"):
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if source_dict.has_key("BBD_SEED"):
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING,
                                    seed_dict)
                log.info(log_info)
                # fetch.backup() # 避免进程异常退出引起的数据丢失
                res_dict = UniField.cloneNeedColumns(source_dict)
                res_dict.update(seed_dict)
                log.info("start to a new seed %s", seed_dict)

                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC,
                                        seed_dict)
                    log.info(log_info)
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO,
                                        seed_dict)
                    log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            source_dict["data"] = res_dict
            err_db_inst.save(source_dict)
            raise Exception(e)
示例#2
0
def work(self, pro_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=pro_type)

    log.info("Process begin")

    pro_type = pro_type.lower()
    queue_name = pro_type

    module_name = pro_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name,
                                        package_name="Parser",
                                        pinyin=pro_type.lower())

    # nb_module_name = pro_type.capitalize() +"Nb" + "Handler"
    # nb_handler  = ClassFactory.getClassInst(module_name, package_name = "Parser", pinyin=pro_type.lower())

    normal_table = pro_type + "_data"
    err_table = normal_table + "_error"

    # db_inst = DBManager.getInstance(des_db_dict["type"], normal_table, host = des_db_dict["host"], port = des_db_dict["port"]) #ssdb 存 解析后数据

    # kfk_inst = DBManager.getInstance("kafka", "qyxx_html", host = "spider7", port = 9092)
    # debug_normal_table =  "new_"+pro_type.lower()+"_data"
    # db_debug = DBManager.getInstance("mongo",debug_normal_table, host = "spider7", port = 27037)# mongo, 数据存本地,用于调试

    # fetch=Fetcher(queue_name.lower() ,"qyxx") enable this if not debug

    fetch = Fetcher(queue_name,
                    "qyxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug

    while True:
        try:
            # source_dict = fetch.hget()
            source_dict = fetch.get()

            if source_dict:
                # 拷贝 种子信息到解析后的数据里面
                if source_dict.has_key("bbd_seed"):
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if source_dict.has_key("BBD_SEED"):
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING,
                                    seed_dict)
                log.info(log_info)
                # fetch.backup() # 避免进程异常退出引起的数据丢失
                res_dict = UniField.cloneNeedColumns(source_dict)
                log.info("start to a new seed %s", source_dict)

                #debug
                # db_inst.changeTable("new_"+pro_type.lower())
                # db_inst.save(source_dict);
                # rowkey=source_dict["rowkey"]
                # db_inst.hset(rowkey,source_dict)
                # db_inst.changeTable("new_"+pro_type.lower()+"_processed")
                # db_inst.save(source_dict)
                res_dict = handler.parse(source_dict, res_dict)

                if res_dict["status"] == 0:
                    db_inst.changeTable(normal_table)
                    res_dict = UniField.unifyParseResult(res_dict)

                    #for debug
                    db_debug.save(res_dict)

                    # db_inst.save(res_dict)
                    # kfk_inst.save(source_dict)
                    # print "kfk size:",kfk_inst.size()
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC,
                                        seed_dict)
                    log.info(log_info)
                else:
                    db_inst.changeTable(err_table)
                    res_dict["html"] = source_dict

                    # db_inst.save(res_dict)
                    db_debug.save(res_dict)

                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO,
                                        seed_dict)
                    log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", pro_type)
                time.sleep(10)
        except Exception as e:
            print str(e)
            raise Exception(e)
示例#3
0
def work(bbd_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type

    module_name = bbd_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name,
                                        package_name="xgxx",
                                        pinyin=bbd_type.lower())

    bbd_table = bbd_type + "_data"
    bbd_src_table = bbd_table + "_src"
    err_table = bbd_table + "_error"
    # html_normal_table = bbd_type+"_src"+"_nb"

    des_db_inst = DBManager.getInstance(des_db_dict["type"],
                                        bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])  # 存 解析后数据
    err_db_inst = DBManager.getInstance(src_db_dict["type"],
                                        err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    if bbd_type == 'jyyc':
        while True:
            for province in [
                    'anhui', 'beijing', 'chongqing', 'fujian', 'gansu',
                    'guangdong', 'guizhou', 'hainan', 'hebei', 'heilongjiang',
                    'henan', 'hubei', 'hunan', 'jiangsu', 'jiangxi', 'jilin',
                    'liaoning', 'neimenggu', 'ningxia', 'qinghai', 'shanghai',
                    'shangxixian', 'sichuan', 'tianjin', 'xinjiang', 'xizang',
                    'yunnan', 'zhejiang', 'zongju', 'shandong'
            ]:
                jyyc_queue = 'jyyc_{}'.format(province)
                fetch = Fetcher(jyyc_queue,
                                "xgxx",
                                get_db_dict=src_db_dict,
                                save_db_dict=des_db_dict)
                while True:
                    try:
                        source_dict = fetch.get()
                        if source_dict:
                            res_dict = UniField.cloneNeedColumns(source_dict)
                            res_dict = handler.parse(source_dict, res_dict,
                                                     province)
                            if res_dict["status"] == 0:
                                res_dict = UniField.unifyParseResult(
                                    res_dict, bbd_table=bbd_table)
                                des_db_inst.changeTable(bbd_table)
                                des_db_inst.save(res_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_table, str(des_db_inst.size()))
                                des_db_inst.changeTable(bbd_src_table)
                                des_db_inst.save(source_dict)
                                log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ",
                                         bbd_src_table,
                                         str(des_db_inst.size()))
                            else:
                                source_dict["data"] = res_dict
                                err_db_inst.save(source_dict)
                        else:
                            log.info(u"解析%s队列为空, 进入下一队列", jyyc_queue)
                            break
                    except Exception as e:
                        log.info(str(e))
                        source_dict["data"] = res_dict
                        err_db_inst.save(source_dict)
                        raw_input('ssss')
                        raise Exception(e)
            log.info(u'解析完一轮, 一个小时后进入下一轮')
            time.sleep(1 * 60 * 60)

    fetch = Fetcher(queue_name,
                    "xgxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug
    while True:
        try:
            source_dict = fetch.get()
            if source_dict:
                res_dict = UniField.cloneNeedColumns(source_dict)
                # log.info("start to a new seed %s",seed_dict)
                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            source_dict["data"] = res_dict
            err_db_inst.save(source_dict)
            raise Exception(e)