示例#1
0
    def addInitUrlFromCheck(hcfg, redis_cfg, getRow, ts=0):
        """Continuously drain ``*:start_urls:check`` Redis lists and re-queue
        URLs that are not yet present in HBase.

        hcfg      -- HBase connection config passed to HappyBaseHelper.
        redis_cfg -- Redis connection config (dict; its ``host`` key is logged).
        getRow    -- callable invoked as ``getRow(url=url)``; returns an HBase
                     row key for the URL, or a falsy value when none applies.
        ts        -- seconds to sleep after each processed key.

        Runs forever (``while 1``); never returns.
        """
        import XX.DB.HappyBaseHelper as HaB

        conn_redis = RedisHelper.get_redis_connect_by_cfg(redis_cfg)
        # TODO:
        conn_hbase = HaB.HappyBaseHelper.get_connection_by_cfg(hcfg)
        # pool = HaB.HappyBaseHelper.getPoolByCfg(hcfg)
        while 1:
            keys = conn_redis.keys("*:start_urls:check")
            if not keys:
                # NOTE(review): this branch assumes print_from_head(ts=ts)
                # sleeps; otherwise the empty-keys case busy-loops — confirm.
                BF.print_from_head("No More Check IU in " +
                                   str(redis_cfg["host"]),
                                   ts=ts)
                continue
            for key in keys:
                # lpop may return None if the list is drained concurrently;
                # json.loads(None) would then raise — TODO confirm acceptable.
                jd = json.loads(conn_redis.lpop(key))
                url = jd["url"]
                if url:
                    # table = HaB.HappyBaseHelper.getTable("crawl_" + jd["project"], pool=pool)
                    table = HaB.HappyBaseHelper.get_table("crawl_" +
                                                          jd["project"],
                                                          conn=conn_hbase)
                    # Does the row already exist in HBase?
                    row = getRow(url=url)
                    if row:
                        # NOTE(review): `table` is built above but never passed
                        # to get_row — looks like it should participate in this
                        # lookup; verify against HappyBaseHelper's API.
                        exists = HaB.HappyBaseHelper.get_row(row)
                        if not exists:
                            # key[:-6] strips the ":check" suffix, yielding the
                            # real start_urls list to push the URL back onto.
                            res = conn_redis.lpush(key[:-6], url)
                            print("Add new IU res \t\t" + str(res))
                        else:
                            print("Already Crawled!\t\t" + url)
                    else:
                        print("==== No row key", jd)
                time.sleep(ts)
示例#2
0
文件: PipeLine.py 项目: billsteve/XX
 def process_item(self, item, spider):
     """Serialize *item* to JSON, publish it to the Kafka topic named after
     the spider, and return the item so the pipeline chain continues."""
     kafka_producer = self.client.topics[spider.name].get_producer()
     # Normalize the scraped item before serialization.
     item = chtml.parse_dict(item)
     payload = json.dumps(item, ensure_ascii=False)
     kafka_producer.produce(payload)
     bf.print_from_head(spider.name + "\tAdd kafka")
     return item
示例#3
0
    def process_response(self, request, response, spider):
        """Record non-OK responses in a per-spider Redis set and throttle the
        crawl by sleeping one second per accumulated error URL."""
        expected_status = self.settings.get("STATUS_CODE", 200)
        if response.status != expected_status:
            add_res = self.conn_redis.sadd(spider.name +
                                           ":start_urls:not200", response.url)
            logger.info("=== Add not 200 set res is \t" + str(add_res))

        # Back off proportionally to the number of error URLs recorded so far.
        err_count = self.conn_redis.scard(spider.name + ":start_urls:not200")
        for i in range(int(err_count)):
            BF.print_from_head("Has " + str(err_count) +
                               " error url Please wait" + "." * i)
            time.sleep(1)
        return response
示例#4
0
def re_add_not200(redis_cfg=None, ts=10):
    """Forever move URLs out of ``*not200*`` Redis sets back onto their
    start_urls lists so failed requests are retried.

    redis_cfg -- Redis connection config; defaults to ``RC.ali2_cfg(db=0)``,
                 now resolved lazily per call instead of once at import time
                 (the old signature default ran the config builder — and any
                 side effects it has — the moment this module was imported).
    ts        -- seconds to sleep after each key and after an empty poll.

    Runs indefinitely; never returns.
    """
    if redis_cfg is None:
        redis_cfg = RC.ali2_cfg(db=0)
    conn_redis = RedisHelper.get_redis_connect_by_cfg(redis_cfg)
    while True:
        keys = conn_redis.keys("*not200*")
        if not keys:
            BF.print_from_head("No More not 200 Spider in " + str(redis_cfg["host"]), ts=ts)
            continue
        for key in keys:
            url = conn_redis.spop(key)
            if url:
                # Global de-dup set: only the first pop of a URL re-queues it.
                if conn_redis.sadd("s_not_200_urls", url):
                    # key[:-7] strips the ":not200" suffix, recovering the
                    # original start_urls list name to push back onto.
                    logger.info("Readd url res is\t" + str(conn_redis.lpush(key[:-7], url)) + "\tkey is\t" + key[:-7] + "\t url is \t" + url)
                else:
                    print("Retry already!")
            else:
                logger.info("No url in set \t" + str(key))
            time.sleep(ts)
示例#5
0
def cache_file_2_hbase(root_path, hb_cfg, table_name, pro_num=0):
    """Load pickled responses cached under *root_path* and persist them into
    the ``crawl_<table_name>`` HBase table, skipping rows already stored.

    root_path  -- directory tree enumerated via FH.FileHelper.get_file_list.
    hb_cfg     -- keyword arguments for happybase.Connection.
    table_name -- suffix of the target table (``crawl_`` is prepended).
    pro_num    -- index into cc.WORDS16; shards files across worker processes
                  by the first character of the file name.
    """
    conn_hbase = happybase.Connection(**hb_cfg)
    table = conn_hbase.table("crawl_" + table_name)
    for fp, fn in FH.FileHelper.get_file_list(root_path):
        # TODO: switch WORDS16 to string mode
        if not fn.startswith(cc.WORDS16[pro_num]):
            continue
        # assumes cache layout puts the spider name 4 path levels up — TODO confirm
        spider = fp.split(os.sep)[-4]
        # Close the cache file deterministically; the original leaked the
        # handle by passing open(...) directly into pickle.load.
        with open(os.path.join(fp, fn), "rb") as cache_file:
            response = pickle.load(cache_file)
        row = spider + "_" + Enc.Encrypt.md5(response.url)
        if table.row(row):
            BF.print_from_head("Exists\t" + row)
            continue
        data = {
            "source:url": str(response.url),
            "source:status_code": str(response.status),
            "source:html": str(response.text),
            "source:type": "html",
            "source:size": str(len(response.text)),
            "source:encoding": response.encoding
        }
        table.put(row, data)
        logger.info(row)
示例#6
0
    def add_table_column2redis(pro_num,
                               *args,
                               column=None,
                               url_fun=None,
                               process_num=10,
                               fn="",
                               spider="",
                               module_name="",
                               class_name=None,
                               r_cfg=None,
                               m_cfg=None,
                               service=True,
                               from_id=None,
                               limit=3000,
                               **kwargs):
        """Stream one column of a SQLAlchemy model into a Redis start_urls
        queue, sharded across workers by ``pro_num`` / ``process_num``.

        pro_num      -- this worker's shard index (also scales back-off sleeps).
        column       -- model attribute whose value is (or yields) the URL.
        url_fun      -- optional transform applied to the column value.
        process_num  -- total number of worker shards.
        fn/spider    -- identifiers used in the Redis progress key / queue name.
        module_name/class_name -- dotted module and class of the model to load.
        r_cfg/m_cfg  -- Redis and MySQL (SQLAlchemy) connection configs.
        service      -- when True, idle-wait on exhaustion; when False, return.
        from_id      -- resume id; when None it is restored from Redis.
        limit        -- queue length above which pushing pauses.
        kwargs       -- ``del_q`` (truthy: clear the queue first), ``suffix``
                        (queue name suffix, default ":start_urls"), ``bf``
                        (truthy: de-duplicate URLs with a Bloom filter).

        Runs until the table is exhausted (service=False) or forever.
        """
        session = sa.SqlAlchemyHelper.get_session_by_cfg(m_cfg)
        conn_redis = ur.RedisHelper.get_redis_connect_by_cfg(r_cfg)
        queue_key = spider + kwargs.get("suffix", ":start_urls")
        progress_key = ("kid_" + str(fn) + "_" + class_name + "_" +
                        str(pro_num) + "_from_id")
        if kwargs.get("del_q"):
            conn_redis.delete(queue_key)
        if from_id is None:
            from_id = conn_redis.get(progress_key)
            from_id = from_id if from_id else 0
            logger.info("From id is \t" + str(from_id))

        # Hoisted loop invariants: the original re-imported/re-resolved the
        # model class on every while-iteration and rebuilt the Bloom filter
        # for every single URL.
        model_class = getattr(importlib.import_module(module_name), class_name)
        bloom = (BloomFilter.BloomFilter(conn_redis, key=spider)
                 if kwargs.get("bf") else None)

        while 1:
            if conn_redis.llen(queue_key) > limit:
                BF.print_from_head("===Too much\t" + class_name + "\t")
                time.sleep(2 * (pro_num + 1))
                continue
            infos = model_class.getByFromIdAndMod(from_id,
                                                  process_num,
                                                  pro_num,
                                                  session,
                                                  limit=10)
            if infos:
                for info in infos:
                    raw_value = info.__dict__.get(column)
                    url = url_fun(raw_value) if url_fun else raw_value
                    if url:
                        url = url.strip()
                        if bloom is not None and bloom.is_exists(url):
                            BF.print_no_end("-")
                        else:
                            # Single push path (the original duplicated it for
                            # the bf / non-bf branches).
                            res = conn_redis.lpush(queue_key, url)
                            logger.info(str((spider, res, info.id, url)))
                            if bloom is not None:
                                bloom.add(url)

                    from_id = info.id
                    conn_redis.set(progress_key, from_id)
            else:
                if service:
                    BF.print_from_head("No More\t" + class_name + "\t")
                    time.sleep(2 * (pro_num + 1))
                    # Refresh the session so long-idle polls see new rows.
                    session.commit()
                else:
                    return