def addInitUrlFromCheck(hcfg, redis_cfg, getRow, ts=0):
    """Continuously drain ``*:start_urls:check`` redis queues and re-add
    uncrawled URLs to the corresponding ``*:start_urls`` queue.

    For each JSON payload popped from a check queue, the URL is looked up
    in HBase (via ``getRow``/``get_row``); if no row exists yet, the URL is
    pushed back onto the real start-urls queue (key minus the ``:check``
    suffix). Runs forever.

    :param hcfg: HBase connection config passed to HappyBaseHelper.
    :param redis_cfg: redis connection config dict (must contain "host").
    :param getRow: callable mapping ``url=...`` to an HBase row key.
    :param ts: seconds to sleep between full passes over the queues.
    """
    import XX.DB.HappyBaseHelper as HaB
    conn_redis = RedisHelper.get_redis_connect_by_cfg(redis_cfg)
    # TODO:
    conn_hbase = HaB.HappyBaseHelper.get_connection_by_cfg(hcfg)
    # pool = HaB.HappyBaseHelper.getPoolByCfg(hcfg)
    while 1:
        keys = conn_redis.keys("*:start_urls:check")
        if not keys:
            BF.print_from_head("No More Check IU in " + str(redis_cfg["host"]), ts=ts)
            continue
        for key in keys:
            # lpop returns None when the list is (concurrently) empty;
            # the original passed that straight to json.loads and crashed.
            raw = conn_redis.lpop(key)
            if not raw:
                continue
            jd = json.loads(raw)
            url = jd["url"]
            if url:
                # table = HaB.HappyBaseHelper.getTable("crawl_" + jd["project"], pool=pool)
                # NOTE(review): `table` is created but never used below —
                # presumably get_row() was meant to receive it; confirm API.
                table = HaB.HappyBaseHelper.get_table("crawl_" + jd["project"], conn=conn_hbase)
                # Does the row already exist in HBase?
                row = getRow(url=url)
                if row:
                    exists = HaB.HappyBaseHelper.get_row(row)
                    if not exists:
                        # key[:-6] strips the ":check" suffix -> real queue.
                        res = conn_redis.lpush(key[:-6], url)
                        print("Add new IU res \t\t" + str(res))
                    else:
                        print("Already Crawled!\t\t" + url)
                else:
                    print("==== No row key", jd)
        time.sleep(ts)
def process_item(self, item, spider):
    """Serialize *item* to JSON and publish it to the Kafka topic named
    after the spider; returns the (parsed) item for downstream pipelines."""
    # One topic per spider name; the client is held on the pipeline instance.
    kafka_producer = self.client.topics[spider.name].get_producer()

    # Normalize/clean the scraped dict before serialization.
    item = chtml.parse_dict(item)
    payload = json.dumps(item, ensure_ascii=False)

    kafka_producer.produce(payload)
    bf.print_from_head(spider.name + "\tAdd kafka")
    return item
def process_response(self, request, response, spider):
    """Downloader-middleware hook: record non-OK responses in a redis set
    and back off proportionally to the number of failures seen so far."""
    expected_status = self.settings.get("STATUS_CODE", 200)
    if response.status == expected_status:
        return response

    # Remember the failing URL in the spider's "not200" set and log the
    # sadd result (1 = newly added, 0 = already present).
    add_result = self.conn_redis.sadd(
        spider.name + ":start_urls:not200", response.url)
    logger.info("=== Add not 200 set res is \t" + str(add_result))

    # Throttle: sleep one second per accumulated error URL.
    err_count = self.conn_redis.scard(spider.name + ":start_urls:not200")
    for i in range(int(err_count)):
        BF.print_from_head(
            "Has " + str(err_count) + " error url Please wait" + "." * i)
        time.sleep(1)
    return response
def re_add_not200(redis_cfg=None, ts=10):
    """Forever move URLs out of ``*not200*`` retry sets back onto their
    start-urls queues, retrying each URL at most once.

    A URL is re-queued only if it can be added to the global
    ``s_not_200_urls`` dedup set; otherwise it has already been retried.

    :param redis_cfg: redis connection config dict; defaults to
        ``RC.ali2_cfg(db=0)`` resolved at call time. (The original code
        evaluated this call in the parameter default, i.e. once at module
        import — a function-call default is evaluated at definition time.)
    :param ts: seconds to sleep between passes.
    """
    if redis_cfg is None:
        redis_cfg = RC.ali2_cfg(db=0)
    conn_redis = RedisHelper.get_redis_connect_by_cfg(redis_cfg)
    while 1:
        keys = conn_redis.keys("*not200*")
        if not keys:
            BF.print_from_head("No More not 200 Spider in " + str(redis_cfg["host"]), ts=ts)
            continue
        for key in keys:
            url = conn_redis.spop(key)
            if url:
                # sadd == 1 -> first retry for this URL; 0 -> already retried.
                if conn_redis.sadd("s_not_200_urls", url):
                    # key[:-7] strips the ":not200" suffix -> original queue.
                    logger.info("Readd url res is\t" + str(conn_redis.lpush(key[:-7], url)) +
                                "\tkey is\t" + key[:-7] + "\t url is \t" + url)
                else:
                    print("Retry already!")
            else:
                logger.info("No url in set \t" + str(key))
        time.sleep(ts)
def cache_file_2_hbase(root_path, hb_cfg, table_name, pro_num=0):
    """Load pickled response objects from a local cache tree and store them
    into the HBase table ``crawl_<table_name>``, skipping rows that already
    exist.

    Files are sharded across workers by first filename character
    (``cc.WORDS16[pro_num]``); the spider name is taken from the 4th path
    component from the end. Row key is ``<spider>_<md5(url)>``.

    :param root_path: root directory of the pickle cache.
    :param hb_cfg: kwargs for ``happybase.Connection``.
    :param table_name: logical table name (prefixed with ``crawl_``).
    :param pro_num: worker shard index into ``cc.WORDS16``.
    """
    conn_hbase = happybase.Connection(**hb_cfg)
    table = conn_hbase.table("crawl_" + table_name)
    for fp, fn in FH.FileHelper.get_file_list(root_path):
        # TODO: switch WORDS16 sharding to string mode
        if not fn.startswith(cc.WORDS16[pro_num]):
            continue
        spider = fp.split(os.sep)[-4]
        # `with` ensures the cache file handle is closed (the original
        # leaked it). NOTE(review): pickle.load on files is unsafe if the
        # cache dir can contain untrusted data — confirm provenance.
        with open(os.path.join(fp, fn), "rb") as f:
            response = pickle.load(f)
        row = spider + "_" + Enc.Encrypt.md5(response.url)
        if table.row(row):
            BF.print_from_head("Exists\t" + row)
            continue
        data = {
            "source:url": str(response.url),
            "source:status_code": str(response.status),
            "source:html": str(response.text),
            "source:type": "html",
            "source:size": str(len(response.text)),
            "source:encoding": response.encoding
        }
        table.put(row, data)
        logger.info(row)
def add_table_column2redis(pro_num, *args, column=None, url_fun=None, process_num=10, fn="",
                           spider="", module_name="", class_name=None, r_cfg=None, m_cfg=None,
                           service=True, from_id=None, limit=3000, **kwargs):
    """Stream URLs from a SQL table column into a redis start-urls queue.

    Repeatedly fetches rows of ``class_name`` (loaded from ``module_name``)
    whose id is beyond a checkpoint and matches this worker's shard
    (``id % process_num == pro_num``), extracts ``column`` (optionally
    transformed by ``url_fun``), and lpushes it to
    ``spider + suffix`` (default suffix ``:start_urls``). Progress is
    checkpointed in redis under ``kid_<fn>_<class_name>_<pro_num>_from_id``.

    :param pro_num: this worker's shard index.
    :param column: model attribute name holding the URL value.
    :param url_fun: optional callable mapping the column value to a URL.
    :param process_num: total number of worker shards.
    :param from_id: explicit starting id; when None the redis checkpoint
        (or 0) is used.
    :param limit: queue backpressure threshold — producer sleeps while the
        redis list is longer than this.
    :param service: when True, idle forever waiting for new rows; when
        False, return once the table is exhausted.
    :param kwargs: supports ``del_q`` (clear the queue first), ``suffix``
        (queue suffix override) and ``bf`` (enable redis bloom-filter dedup).
    """
    session = sa.SqlAlchemyHelper.get_session_by_cfg(m_cfg)
    conn_redis = ur.RedisHelper.get_redis_connect_by_cfg(r_cfg)
    # Optionally start from an empty queue.
    if kwargs.get("del_q"):
        conn_redis.delete(spider + kwargs.get("suffix", ":start_urls"))
    if from_id is None:
        # Resume from the per-worker checkpoint stored in redis (0 if absent).
        from_id = conn_redis.get("kid_" + str(fn) + "_" + class_name + "_" + str(pro_num) + "_from_id")
        from_id = from_id if from_id else 0
    logger.info("From id is \t" + str(from_id))
    while 1:
        # Backpressure: don't outrun the consumers.
        if conn_redis.llen(spider + kwargs.get("suffix", ":start_urls")) > limit:
            BF.print_from_head("===Too much\t" + class_name + "\t")
            time.sleep(2 * (pro_num + 1))
            continue
        # Re-resolved each pass; model class provides getByFromIdAndMod.
        model_class = getattr(importlib.import_module(module_name), class_name)
        infos = model_class.getByFromIdAndMod(from_id, process_num, pro_num, session, limit=10)
        if infos:
            for info in infos:
                if url_fun:
                    url = url_fun(info.__dict__.get(column))
                else:
                    url = info.__dict__.get(column)
                if url:
                    url = url.strip()
                    if kwargs.get("bf"):
                        # Bloom-filter dedup: only push URLs not seen before.
                        bf = BloomFilter.BloomFilter(conn_redis, key=spider)
                        if bf.is_exists(url):
                            BF.print_no_end("-")
                        else:
                            res = conn_redis.lpush(
                                spider + kwargs.get("suffix", ":start_urls"), url)
                            logger.info(str((spider, res, info.id, url)))
                            bf.add(url)
                    else:
                        res = conn_redis.lpush(
                            spider + kwargs.get("suffix", ":start_urls"), url)
                        logger.info(str((spider, res, info.id, url)))
                # Advance and persist the checkpoint after every row,
                # even when the row had no usable URL.
                from_id = info.id
                conn_redis.set(
                    "kid_" + str(fn) + "_" + class_name + "_" + str(pro_num) + "_from_id", from_id)
        else:
            if service:
                # Idle mode: wait for new rows; commit to refresh the
                # session's view of the table.
                BF.print_from_head("No More\t" + class_name + "\t")
                time.sleep(2 * (pro_num + 1))
                session.commit()
            else:
                return