Example #1
 def crawlerKeyWordList(keyword_list):
     """
     Crawl with keywords one at a time: if the first keyword fails, try the next; if the last one also fails, record the seed info to SSDB.
     :param keyword_list:
     :return:
     """
     keyword_num = len(keyword_list)
     for keyword in keyword_list:
         keyword_num -= 1  # decrement first so the last failure reaches the else branch
         seed_status = inst.crawl(keyword)
         if seed_status.access_type == SeedAccessType.OK:  # success: log the result
             # log.info("End seed with keyword %s", keyword)
             log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                 seed.getDict())
             log.info(log_info)
             break
         elif keyword_num > 0:  # failed, but keywords remain: try the next one
             # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, seed.getDict())
             log.info("Keyword [%s] failed to fetch the company", keyword)
             continue
         else:  # the last keyword also failed: persist the seed with its error status
             seed.update(status=seed_status.access_type)
             log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO,
                                 seed.getDict())
             log.info(log_info)
             seed.save()
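The variants of this function differ mainly in where keyword_num is decremented; decrementing at the top of the loop is what makes the final else branch reachable on the last keyword. A minimal self-contained sketch of that pattern, with stub objects standing in for the real SeedAccessType, inst, and seed (all names below are hypothetical):

    # Stub stand-ins for the classes used in the examples.
    class SeedAccessType(object):
        OK, ERROR = 0, 1

    class StubStatus(object):
        def __init__(self, access_type):
            self.access_type = access_type

    def crawl(keyword):
        # Pretend every keyword fails, to exercise the fallback path.
        return StubStatus(SeedAccessType.ERROR)

    def crawl_keyword_list(keyword_list):
        keyword_num = len(keyword_list)
        for keyword in keyword_list:
            keyword_num -= 1  # decrement before the check
            status = crawl(keyword)
            if status.access_type == SeedAccessType.OK:
                print("success with [%s]" % keyword)
                break
            elif keyword_num > 0:
                print("failed with [%s], trying the next keyword" % keyword)
            else:
                print("all keywords failed; recording the seed")

    crawl_keyword_list(["keyword-a", "keyword-b"])  # both fail -> seed recorded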
Example #2
    def crawlerKeyWordList(keyword_list):
        """
        Crawl with keywords one at a time: if the first keyword fails, try the next; if the last one also fails, record the seed info to SSDB.
        :param keyword_list:
        :return:
        """
        try:
            keyword_num = len(keyword_list)
            for keyword in keyword_list:
                keyword_num -= 1  # decrement first so the last failure reaches the else branch
                seed_status = inst.crawl(keyword)
                if seed_status.access_type == SeedAccessType.OK:  # success: log the result
                    # log.info("End seed with keyword %s", keyword)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                        bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"Seed crawled successfully :)")
                    break
                elif keyword_num > 0:  # failed, but keywords remain: try the next one
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                    log.info(u"Seed crawl failed for keyword [%s]", keyword)
                    continue
                else:
                    seed.update(status=seed_status.access_type)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO,
                                        bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"种子抓取失败,存储到队列,种子状态为 %s", str(seed_status))
                    seed.save()
        except Exception as e:
            log.info(str(e))
            raise Exception(u"Exception encountered while crawling the seed")
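Assuming the enclosing work() has already created inst and bbd_seed_dict, the function would be driven with a list of name variants for a single company. A hypothetical call (all values made up):

    crawlerKeyWordList([u"Acme Trading Co., Ltd.",    # full registered name
                        u"Acme Trading",              # short name
                        u"91310000MA1FL000XX"])       # registration number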
Example #3
    def storeResult(src_dict, company_dict=None):
        """
        Callback invoked by the crawler; stores the data to SSDB.
        :param src_dict:
        :param company_dict:
        :return:
        """
        try:
            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if "rowkey" in src_dict:
                    rowkey = src_dict["rowkey"]

                    nbxx_list = getNbxxDict(src_dict)
                    nb_year_list = []  # years to send to the Solr API
                    for nb_item in nbxx_list:
                        # split each annual report into its own entry with a distinct rowkey, stored in the hash
                        year = getYear(nb_item)
                        nb_year_list.append(year)
                        nbxx_dict = UniField.cloneNeedColumns(src_dict)
                        nbxx_dict.update({"bbd_seed": bbd_seed_dict})
                        nbxx_dict.update(nb_item)
                        db_inst.changeTable(bbd_type + "_nbxx")
                        nb_rk = rowkey + "|_|" + year
                        nbxx_dict["rowkey"] = nb_rk
                        nbxx_dict["year"] = year
                        db_inst.hset(nb_rk, nbxx_dict)
                        log.info(u"存储 %s 年年报 成功,rowkey 为 [ %s ]", year, nb_rk)
                    zch = src_dict["rowkey_dict"]["company_zch"]
                    company_name = src_dict["rowkey_dict"]["company_name"]

                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING,
                                        bbd_seed_dict)
                    log.info(log_info)
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                    log.info(u" ,rowkey 为 [ %s ]", rowkey)
                    NbxxApiControler().nbUpdate(company_name=company_name,
                                                pinyin=bbd_type,
                                                zch=zch,
                                                years_list=nb_year_list)

                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)

            log.info(u"存储抓取网页原文 失败,rowkey 为 [ %s ]", rowkey)
Example #4
def work(self, pro_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=pro_type)

    log.info("Process begin")

    pro_type = pro_type.lower()
    queue_name = pro_type

    module_name = pro_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name,
                                        package_name="Parser",
                                        pinyin=pro_type.lower())

    # nb_module_name = pro_type.capitalize() +"Nb" + "Handler"
    # nb_handler  = ClassFactory.getClassInst(module_name, package_name = "Parser", pinyin=pro_type.lower())

    normal_table = pro_type + "_data"
    err_table = normal_table + "_error"

    db_inst = DBManager.getInstance(des_db_dict["type"],
                                    normal_table,
                                    host=des_db_dict["host"],
                                    port=des_db_dict["port"])  # SSDB: stores the parsed data

    # kfk_inst = DBManager.getInstance("kafka", "qyxx_html", host = "spider7", port = 9092)
    debug_normal_table = "new_" + pro_type.lower() + "_data"
    db_debug = DBManager.getInstance("mongo",
                                     debug_normal_table,
                                     host="spider7",
                                     port=27037)  # MongoDB: data stored locally, for debugging

    # fetch = Fetcher(queue_name.lower(), "qyxx")  # enable this when not debugging

    fetch = Fetcher(queue_name,
                    "qyxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug

    while True:
        try:
            # source_dict = fetch.hget()
            source_dict = fetch.get()

            if source_dict:
                # copy the seed info into the parsed data
                seed_dict = {}
                if "bbd_seed" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if "BBD_SEED" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING,
                                    seed_dict)
                log.info(log_info)
                # fetch.backup()  # guard against data loss if the process exits abnormally
                res_dict = UniField.cloneNeedColumns(source_dict)
                log.info("starting a new seed %s", source_dict)

                #debug
                # db_inst.changeTable("new_"+pro_type.lower())
                # db_inst.save(source_dict);
                # rowkey=source_dict["rowkey"]
                # db_inst.hset(rowkey,source_dict)
                # db_inst.changeTable("new_"+pro_type.lower()+"_processed")
                # db_inst.save(source_dict)
                res_dict = handler.parse(source_dict, res_dict)

                if res_dict["status"] == 0:
                    db_inst.changeTable(normal_table)
                    res_dict = UniField.unifyParseResult(res_dict)

                    #for debug
                    db_debug.save(res_dict)

                    # db_inst.save(res_dict)
                    # kfk_inst.save(source_dict)
                    # print "kfk size:",kfk_inst.size()
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC,
                                        seed_dict)
                    log.info(log_info)
                else:
                    db_inst.changeTable(err_table)
                    res_dict["html"] = source_dict

                    # db_inst.save(res_dict)
                    db_debug.save(res_dict)

                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO,
                                        seed_dict)
                    log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", pro_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            raise Exception(e)
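The dicts at the top of this example read type/host/port triples from DBConfig.ini through confGetterFunc. The section and option names come from the code; the values below are placeholders, and the stand-in reader is only a sketch of what confGetterFunc presumably does with the standard library:

    # DBConfig.ini (placeholder values):
    #
    #   [html_db]
    #   type = ssdb
    #   host = spider7
    #   port = 8888
    #
    #   [data_db]
    #   type = ssdb
    #   host = spider7
    #   port = 8889

    # Minimal stand-in for confGetterFunc (ConfigParser is the Python 2
    # module name; it is configparser in Python 3):
    import ConfigParser

    def conf_getter(conf_file, section, option):
        parser = ConfigParser.ConfigParser()
        parser.read(conf_file)
        return parser.get(section, option)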
Example #5
def work(bbd_type, value_list=None):
    conf_file = "DBConfig.ini"
    db_conf_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }

    def getNbxxDict(src_dict):
        nbxx_key_list = filter(lambda x: x.startswith("qynb_"),
                               src_dict.keys())
        nbxx_list = map(lambda x: {x: src_dict.pop(x)}, nbxx_key_list)
        return nbxx_list

    def getYear(nb_dict):
        key = nb_dict.keys()[0]
        year = key.split("_")[1]
        return year

    def storeResult(src_dict, company_dict=None):
        """
        Callback invoked by the crawler; stores the data to SSDB.
        :param src_dict:
        :param company_dict:
        :return:
        """
        try:
            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if "rowkey" in src_dict:
                    rowkey = src_dict["rowkey"]

                    nbxx_list = getNbxxDict(src_dict)
                    nb_year_list = []  # years to send to the Solr API
                    for nb_item in nbxx_list:
                        # split each annual report into its own entry with a distinct rowkey, stored in the hash
                        year = getYear(nb_item)
                        nb_year_list.append(year)
                        nbxx_dict = UniField.cloneNeedColumns(src_dict)
                        nbxx_dict.update({"bbd_seed": bbd_seed_dict})
                        nbxx_dict.update(nb_item)
                        db_inst.changeTable(bbd_type + "_nbxx")
                        nb_rk = rowkey + "|_|" + year
                        nbxx_dict["rowkey"] = nb_rk
                        nbxx_dict["year"] = year
                        db_inst.hset(nb_rk, nbxx_dict)
                        log.info(u"存储 %s 年年报 成功,rowkey 为 [ %s ]", year, nb_rk)
                    zch = src_dict["rowkey_dict"]["company_zch"]
                    company_name = src_dict["rowkey_dict"]["company_name"]

                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING,
                                        bbd_seed_dict)
                    log.info(log_info)
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                    log.info(u" ,rowkey 为 [ %s ]", rowkey)
                    NbxxApiControler().nbUpdate(company_name=company_name,
                                                pinyin=bbd_type,
                                                zch=zch,
                                                years_list=nb_year_list)

                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)

            log.info(u"存储抓取网页原文 失败,rowkey 为 [ %s ]", rowkey)

    def crawlerKeyWordList(keyword_list):
        """
        Crawl with keywords one at a time: if the first keyword fails, try the next; if the last one also fails, record the seed info to SSDB.
        :param keyword_list:
        :return:
        """
        try:
            keyword_num = len(keyword_list)
            for keyword in keyword_list:
                keyword_num -= 1  # decrement first so the last failure reaches the else branch
                seed_status = inst.crawl(keyword)
                if seed_status.access_type == SeedAccessType.OK:  # success: log the result
                    # log.info("End seed with keyword %s", keyword)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                        bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"Seed crawled successfully :)")
                    break
                elif keyword_num > 0:  # failed, but keywords remain: try the next one
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                    log.info(u"Seed crawl failed for keyword [%s]", keyword)
                    continue
                else:
                    seed.update(status=seed_status.access_type)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO,
                                        bbd_seed_dict)
                    log.info(log_info)
                    log.info(u"种子抓取失败,存储到队列,种子状态为 %s", str(seed_status))
                    seed.save()
        except Exception as e:
            log.info(str(e))
            raise Exception(u"Exception encountered while crawling the seed")

    ##################################################################################################################################
    try:
        from CommonLib.Logging import Logging
        log = Logging(name=bbd_type)
        log.info("Process begin for %s,logger=%s", bbd_type, str(log))

        module_name = "Crawler" + bbd_type.capitalize()
        bbd_type = bbd_type.lower()
        inst = ClassFactory.getClassInst(module_name,
                                         package_name="qyxx_all",
                                         pinyin=bbd_type,
                                         callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance(db_conf_dict["type"],
                                        bbd_type,
                                        host=db_conf_dict["host"],
                                        port=db_conf_dict["port"])
        bbd_seed_dict = {}
        if value_list:
            for keywd_list in value_list:
                crawlerKeyWordList(keywd_list)
        else:
            seed = Seed(bbd_type)

            while True:
                seed.get()
                bbd_seed_dict = seed.getDict()
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict)
                log.info("starting a new seed %s", log_info)
                if seed.url_status:
                    seed_status = inst.crawlUrl(seed.url, seed.name)
                    if seed_status.access_type == SeedAccessType.OK:  # success: log the result
                        log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC,
                                            bbd_seed_dict)
                        log.info(log_info)
                    else:  # the URL crawl failed: fall back to the keyword list
                        log.info("URL fetch of company info failed [%s]",
                                 bbd_type)
                        keyword_list = seed.values
                        crawlerKeyWordList(keyword_list)
                else:
                    keyword_list = seed.values
                    crawlerKeyWordList(keyword_list)
    except Exception as e:
        log.info(str(e))
        seed.save()
        raise Exception(e)
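getNbxxDict and getYear above lean on Python 2 semantics: filter and map return lists, dict.keys() is an indexable list, and src_dict.pop removes the annual-report entries from the source dict as a side effect. A quick demonstration with made-up data:

    src = {"rowkey": "r1", "qynb_2013": {"a": 1}, "qynb_2014": {"b": 2}}
    nbxx_key_list = filter(lambda x: x.startswith("qynb_"), src.keys())
    nbxx_list = map(lambda x: {x: src.pop(x)}, nbxx_key_list)
    # nbxx_list -> [{"qynb_2013": {"a": 1}}, {"qynb_2014": {"b": 2}}]  (order may vary)
    # src      -> {"rowkey": "r1"}  (annual-report keys have been popped out)
    for nb_dict in nbxx_list:
        print(nb_dict.keys()[0].split("_")[1])  # "2013", "2014" -- what getYear returns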
Example #6
def work(bbd_type):
    conf_file = "DBConfig.ini"
    src_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    des_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'data_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'data_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'data_db', 'port'))
        }

    from CommonLib.Logging import Logging
    log = Logging(name=bbd_type)
    log.info("Process begin")

    bbd_type = bbd_type.lower()
    queue_name = bbd_type
    bbd_table = "qyxx_data"
    bbd_src_table = "qyxx_html"
    module_name = bbd_type.capitalize() + "Handler"
    handler = ClassFactory.getClassInst(module_name,
                                        package_name="Parser",
                                        pinyin=bbd_type.lower())

    normal_table = bbd_type + "_data"
    err_table = normal_table + "_error"
    # html_normal_table = bbd_type+"_src"

    des_db_inst = DBManager.getInstance(des_db_dict["type"],
                                        bbd_table,
                                        host=des_db_dict["host"],
                                        port=des_db_dict["port"])  #存 解析后数据
    err_db_inst = DBManager.getInstance(src_db_dict["type"],
                                        err_table,
                                        host=src_db_dict["host"],
                                        port=src_db_dict["port"])

    fetch = Fetcher(queue_name,
                    "qyxx",
                    get_db_dict=src_db_dict,
                    save_db_dict=des_db_dict)  # debug

    source_dict = None
    res_dict = None
    while True:
        try:
            source_dict = fetch.get()
            if source_dict:
                # copy the seed info into the parsed data
                seed_dict = {}
                if "bbd_seed" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["bbd_seed"]}
                if "BBD_SEED" in source_dict:
                    seed_dict = {"bbd_seed": source_dict["BBD_SEED"]}
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ING,
                                    seed_dict)
                log.info(log_info)
                # fetch.backup()  # guard against data loss if the process exits abnormally
                res_dict = UniField.cloneNeedColumns(source_dict)
                res_dict.update(seed_dict)
                log.info("start to a new seed %s", seed_dict)

                res_dict = handler.parse(source_dict, res_dict)
                if res_dict["status"] == 0:
                    res_dict = UniField.unifyParseResult(res_dict,
                                                         bbd_table=bbd_table)
                    des_db_inst.changeTable(bbd_table)
                    des_db_inst.save(res_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_table,
                             str(des_db_inst.size()))
                    des_db_inst.changeTable(bbd_src_table)
                    des_db_inst.save(source_dict)
                    log.info(u"插入数据到 [%s] 成功, 队列大小为: %s ", bbd_src_table,
                             str(des_db_inst.size()))
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_SUC,
                                        seed_dict)
                    log.info(log_info)
                else:
                    source_dict["data"] = res_dict
                    err_db_inst.save(source_dict)
                    log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_PARSE_ERO,
                                        seed_dict)
                    log.info(log_info)
            else:
                log.info(u"解析%s队列为空, 等待10秒重试", bbd_type)
                time.sleep(10)
        except Exception as e:
            log.info(str(e))
            if source_dict is not None:  # guard: fetch.get() may have raised before source_dict was set
                source_dict["data"] = res_dict
                err_db_inst.save(source_dict)
            raise Exception(e)
Example #7
def work(bbd_type, need_seed=True, value_list=None):
    """
    爬虫外部控制主函数,包括一下功能:
    1. 初始化爬虫类
    2. 初始化DB连接
    3. 获取种子
    4. 存储爬虫返回的数据
    5. 存储爬取异常的种子信息
    :param bbd_type: 爬虫存储的队列名,也会关联到爬虫模块名,注意*****
    :param value_list: 爬虫种子信息,手动调试使用
    :return:
    """
    conf_file = "DBConfig.ini"
    db_conf_dict = \
        {
            'type': confGetterFunc(conf_file, 'html_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'html_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'html_db', 'port'))
        }
    seed_db_dict = \
        {
            'type': confGetterFunc(conf_file, 'seed_db', 'type').lower(),
            'host': confGetterFunc(conf_file, 'seed_db', 'host').lower(),
            'port': int(confGetterFunc(conf_file, 'seed_db', 'port'))
        }

    def storeResult(src_dict, company_dict=None):
        """
        Callback invoked by the crawler; stores the data to SSDB.
        :param src_dict:
        :param company_dict:
        :return:
        """
        try:
            if "bbd_tmp_queue" in src_dict:
                queue_name = src_dict["bbd"]

            if src_dict["status"] == 0:
                src_dict = UniField.unifyRequestResult(src_dict, bbd_type)
                if "rowkey" in src_dict.keys():
                    src_dict.update({"bbd_seed": bbd_seed_dict})
                    if 'table_name' in src_dict:
                        table_name = src_dict.get('table_name')
                        db_inst.changeTable(table_name)
                    else:
                        db_inst.changeTable(bbd_type)
                    db_inst.save(src_dict)
                else:
                    raise Exception("No rowkey")
            else:
                db_inst.changeTable(bbd_type + "_error")
                db_inst.save(src_dict)
        except Exception as e:
            log.info(str(e))
            db_inst.changeTable(bbd_type + "_error")
            db_inst.save(src_dict)

    def crawlerKeyWordList(keyword_list):
        """
        Crawl with keywords one at a time: if the first keyword fails, try the next; if the last one also fails, record the seed info to SSDB.
        :param keyword_list:
        :return:
        """
        keyword_num = len(keyword_list)
        for keyword in keyword_list:
            keyword_num -= 1  # decrement first so the last failure reaches the else branch
            seed_status = inst.crawl(keyword)
            if seed_status.access_type == SeedAccessType.OK:  # success: log the result
                # log.info("End seed with keyword %s", keyword)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict)
                log.info(log_info)
                break
            elif keyword_num > 0:  # failed, but keywords remain: try the next one
                # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                log.info("Keyword [%s] failed to fetch the company", keyword)
                continue
            else:  # the last keyword also failed: persist the seed with its error status
                seed.update(status=seed_status.access_type)
                log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ERO, bbd_seed_dict)
                log.info(log_info)
                seed.save()

    ##################################################################################################################################
    try:
        from CommonLib.Logging import Logging
        log = Logging(name=bbd_type)
        log.info("Process begin for %s,logger=%s", bbd_type, str(log))

        module_name = "Crawler" + bbd_type.capitalize()
        bbd_type = bbd_type.lower()
        inst = ClassFactory.getClassInst(module_name,
                                         package_name="xgxx",
                                         pinyin=bbd_type,
                                         callbackFromOuterControl=storeResult)
        db_inst = DBManager.getInstance(db_conf_dict["type"],
                                        bbd_type,
                                        host=db_conf_dict["host"],
                                        port=db_conf_dict["port"])
        if not need_seed:
            inst.crawl()
        else:
            bbd_seed_dict = {}
            if value_list:
                for keywd_list in value_list:
                    crawlerKeyWordList(keywd_list)
            else:
                seed_db_inst = DBManager.getInstance(seed_db_dict["type"],
                                                     bbd_type,
                                                     host=seed_db_dict["host"],
                                                     port=seed_db_dict["port"])
                while True:
                    bbd_seed_dict = seed_db_inst.get()
                    # log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_ING, bbd_seed_dict)
                    # log.info("start to a new seed %s",log_info)
                    seed_status = inst.crawl(bbd_seed_dict)
                    if seed_status.access_type == SeedAccessType.OK:  # success: log the result
                        log_info = get_logs(STATE.BBD_SEED_IS_CRAWL_SUC, bbd_seed_dict)
                        log.info(log_info)
                    else:  # the crawl failed: save the seed to its error queue
                        log.info(u"Seed crawl failed; saving to queue [%s]", bbd_type)
                        seed_db_inst.changeTable(bbd_type + "_error")
                        seed_db_inst.save(bbd_seed_dict)

    except Exception as e:
        raise Exception(e)
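A hedged sketch of how this entry point might be invoked for manual debugging; the module/queue name and keyword values below are hypothetical:

    # Debug run: bypass the seed queue and feed keyword lists directly.
    # Each inner list holds the fallback keywords for one seed.
    work("acme", value_list=[[u"Acme Trading Co., Ltd.",
                              u"91310000MA1FL000XX"]])

    # Crawlers that need no seed at all are simply started:
    work("acme", need_seed=False)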