def process_request(self, request, spider):
    """Append every outgoing request to today's "to crawl" log file.

    Writes one tab-separated line -- timestamp, spider name, request URL --
    to <ROOT_PATH_LOG><today>_to_crawl.log (opened in append mode).
    Returns None implicitly so Scrapy keeps processing the request.

    :param request: the scrapy Request about to be sent.
    :param spider: the spider that issued the request.
    """
    # Changed ToCrawlUrl.settings -> self.settings: the attribute lookup
    # still resolves to the class attribute, matches the idiom used by the
    # sibling middleware in this file, and stays correct under subclassing.
    # NOTE(review): assumes ROOT_PATH_LOG ends with a path separator --
    # TODO confirm in settings.
    save_path = self.settings.get("ROOT_PATH_LOG") + ctime.get_today() + "_to_crawl.log"
    log_file(save_path,
             str(ctime.get_now_time()) + "\t" + spider.name + "\t" + request.url + "\n",
             method="a")
def process_request(self, request, spider):
    """Compute today's "to crawl" log path; the write itself is still TODO.

    Currently a no-op for the request pipeline: the path is built but the
    commented-out logging call below has not been enabled yet.
    """
    log_root = self.settings.get("ROOT_PATH_LOG")
    save_path = log_root + Dt.get_today() + "_to_crawl.log"
    # TODO: logging.info(save_path, str(Dt.get_now_time()) + "\t" + spider.name + "\t" + request.url + "\n", method="a")
def process_response(self, request, response, spider):
    """Append every received response to today's crawled-URL log file.

    Writes one tab-separated line -- timestamp, spider name, HTTP status,
    request URL -- to <ROOT_PATH_LOG><today>_carwled.log (append mode),
    then passes the response through unchanged.

    :param request: the Request that produced this response.
    :param response: the Response being processed.
    :param spider: the spider that issued the request.
    :return: *response*, unmodified.
    """
    # Changed CrawledUrl.settings -> self.settings: resolves to the same
    # class attribute, consistent with the sibling middleware in this file.
    # NOTE(review): "_carwled" looks like a typo for "_crawled", but the
    # same spelling is used elsewhere in this file; renaming would change
    # which file is written, so it is deliberately kept as-is.
    save_path = self.settings.get("ROOT_PATH_LOG") + ctime.get_today() + "_carwled.log"
    log_file(save_path,
             str(ctime.get_now_time()) + "\t" + spider.name + "\t"
             + str(response.status) + "\t" + request.url + "\n",
             method="a")
    return response
def process_response(self, request, response, spider):
    """Compute today's crawled-URL log path and pass the response through.

    The actual log write is still TODO (commented out below); until it is
    enabled this middleware only builds the path and returns the response.
    """
    log_root = self.settings.get("ROOT_PATH_LOG")
    save_path = log_root + Dt.get_today() + "_carwled.log"  # sic: "_carwled"
    # TODO: logging.info(save_path, str(Dt.get_now_time()) + "\t" + spider.name + "\t" + str(response.status) + "\t" + request.url + "\n", method="a")
    return response
def add_tai_yang(redis_host="127.0.0.1", db=11):
    """Keep a small pool of Taiyang proxy IPs alive in a Redis database.

    Polls the Taiyang HTTP API once per second whenever the Redis db holds
    fewer than 3 keys. Each fetched proxy is stored as an "ip:port" key
    whose TTL matches the proxy's advertised expire_time. Stops and
    returns 1 as soon as the pool is full.

    :param redis_host: Redis server host.
    :param db: Redis database index used as the proxy pool.
    :return: 1 once the pool holds 3 or more proxies.
    """
    r = redis.Redis(redis_host, db=db)
    # Bug fix: the query string contained the mojibake "®ions=" -- an
    # HTML-entity decoding of "&reg" -- restored to "&regions=".
    url = ("http://http-api.taiyangruanjian.com/getip?num=1&type=2&pro=&city=0"
           "&yys=0&port=11&pack=13604&ts=1&ys=0&cs=0&lb=1&sb=0&pb=4&mr=0&regions=")
    while True:
        time.sleep(1)
        if r.dbsize() >= 3:
            # Pool is full -- nothing left to do.
            break
        resp = Rh.RequestHelper.send_cache_request(url)
        if resp.status != 200:
            print("proxy return error" + str(resp.text))
            continue
        try:
            json_data = json.loads(resp.text)
            if json_data["code"] == 0:
                entry = json_data["data"][0]
                ip = str(entry["ip"]) + ":" + str(entry["port"])
                # TTL = seconds until the advertised expiry, +1s of slack.
                ets = int(Dt.str_to_ts(entry["expire_time"]) - time.time() + 1)
                d("OK " + entry["expire_time"] + " ====== now + " + str(ets // 60),
                  line1="===")
                r.set(ip, 0, ex=ets)
                time.sleep(1)
        except Exception:
            # Was a bare "except:", which also swallowed KeyboardInterrupt;
            # narrowed so Ctrl-C can still stop the loop.
            print("not json data" + resp.text)
            traceback.print_exc()
    return 1
def add_zhima(r):
    """Continuously top up a Zhima proxy pool in Redis (never returns).

    Once per second: if the pool (Redis dbsize) has fewer than 2 keys,
    fetch one proxy from the Zhima API and store it as an "ip:port" key
    whose TTL matches the proxy's advertised expire_time; otherwise sleep
    10s and re-check. Runs forever by design.

    :param r: an open redis.Redis connection used as the proxy pool.
    """
    # Bug fix: the query string contained the mojibake "®ions=" -- an
    # HTML-entity decoding of "&reg" -- restored to "&regions=".
    url = ("http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=&city=0&yys=0"
           "&port=1&pack=15624&ts=1&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=")
    while True:
        time.sleep(1)
        if r.dbsize() < 2:
            resp = Rh.RequestHelper.send_cache_request(url)
            if resp.status == 200:
                try:
                    json_data = json.loads(resp.text)
                    if json_data["code"] == 0:
                        entry = json_data["data"][0]
                        ip = str(entry["ip"]) + ":" + str(entry["port"])
                        # TTL = seconds until advertised expiry, +1s slack.
                        ets = int(Dt.str_to_ts(entry["expire_time"]) - time.time() + 1)
                        d("OK " + entry["expire_time"] + " ====== now + " + str(ets // 60),
                          line1="===")
                        r.set(ip, 0, ex=ets)
                        time.sleep(1)
                    else:
                        print(json_data)
                except Exception:
                    # Was a bare "except:"; narrowed so KeyboardInterrupt
                    # and SystemExit still propagate.
                    print("not json data" + resp.text)
                    traceback.print_exc()
            else:
                print("proxy return error" + str(resp.text))
        else:
            print("Enough")
            time.sleep(10)
def get_date_time(s):
    """Extract the first date-time substring from *s* and normalize it.

    Recognizes forms like "2021-3-5 08:15", "2021/3/5 08:15:30" and
    "2021年3月5日 08:15"; CJK date markers are rewritten so the result is
    always "YYYY-M-D HH:MM[:SS]".

    :param s: arbitrary text; non-string or empty input yields "".
    :return: normalized date-time string, "" for empty/non-string input,
             or None (after printing the error) when nothing matches.
    """
    if not s or not isinstance(s, str):
        return ""
    # Bug fix: the optional seconds group was capturing -- "(:\d{2})?" --
    # which made re.findall return only the group text (e.g. ":45" or "")
    # instead of the whole match. It is now non-capturing. A dead duplicate
    # pattern assignment was also removed, along with branches copied from
    # get_date() ("今天" / 4-5 char results) that this pattern can never
    # produce: every match here contains a full date plus "HH:MM".
    pattern = re.compile(
        r"\d{2,4}[-/年]\d{1,2}[-/月]\d{1,2}日? \d{2}:\d{2}(?::\d{2})?")
    result = pattern.findall(s)
    try:
        # Normalize CJK markers so the separators are always "-".
        return (result[0].replace('年', '-').replace('月', '-')
                .replace('日', '').replace('号', ''))
    except IndexError as e:  # no date-time found in *s*
        print(e)
def get_date(s):
    """Extract the first date found in *s* and normalize its separators.

    Matches "YYYY-M-D" (also with "/" or "."), "YYYY年M月D日/号", the
    literal "今天" (today), and month-day forms like "M月D日"; CJK markers
    are rewritten to "-".

    :param s: arbitrary text; non-string or empty input yields "".
    :return: normalized date string, "" for empty/non-string input, or
             None (after printing the error) when nothing matches.
    """
    if not s or not isinstance(s, str):
        return ""
    # Bug fix: the separator class was written "[-|/|.]" -- inside [...]
    # the "|" is a literal character, so dates like "2023|05|01" were
    # accidentally accepted. Narrowed to the intended "[-/.]".
    p = re.compile(
        r'\d{4}[-/.]\d{1,2}[-/.]\d{1,2}'
        r'|\d{4}年\d{1,2}月\d{1,2}[日号]'
        r'|今天'
        r'|\d{1,2}月\d{1,2}[日号]'
    )
    result = re.findall(p, s)
    try:
        print(result[0])
        result[0] = result[0].replace('年', '-').replace('月', '-').replace(
            '日', '').replace('号', '')
        if result[0] == "今天":
            # assumes DT.GetToday() returns "YYYY-MM-DD" -- TODO confirm
            return DT.GetToday()
        if len(result[0]) == 4 or len(result[0]) == 5:
            # Month-day only (e.g. "12-25"): prepend the current year.
            # NOTE(review): this produces "202312-25" with no separator
            # between year and month -- looks like a missing "-"; confirm
            # the downstream format before changing it.
            print(result[0])
            return str(DT.GetToday().split("-")[0]) + result[0]
        return result[0]
    except Exception as e:  # IndexError on no match; DT errors also land here
        print(e)
def inner(*args, **kwargs):
    """Decorator body: archive *json_data* to a per-spider daily file, then call the wrapped func.

    json_data/spider/fp are taken positionally when at least two
    positional args are present, otherwise from keywords. When json_data
    is truthy it is appended to <fp><spider>/<YYYY_MM_DD>.json before the
    wrapped function runs.
    """
    if len(args) < 2:
        json_data = kwargs.get("json_data", None)
        spider = kwargs.get("spider", None)
        fp = kwargs.get("fp", None)
    else:
        json_data = args[0]
        spider = args[1]
        # NOTE(review): fp comes from args[3], skipping args[2] -- presumably
        # args[2] is another parameter of the wrapped function, but confirm
        # against the decorated signatures (len(args) == 3 would raise here).
        fp = args[3]
    if json_data:
        today = dt.get_today().replace("-", "_")
        uf.FileHelper.mkdir(fp + spider)
        # Bug fix: the file handle was opened inline and never closed; a
        # context manager guarantees the dump is flushed and released.
        with open(fp + spider + os.sep + today + ".json", "a",
                  encoding="utf-8") as out:
            json.dump(json_data, out, ensure_ascii=False)
    return func(*args, **kwargs)  # 2
def __init__(self, *args, **kwargs):
    """Initialize a BBSItem with empty defaults for every declared field.

    Bug fix: the parent initializer was invoked as
    ``super(BBSItem, self).__init__(self, *args, **kwargs)`` -- the bound
    method already supplies ``self``, so the explicit extra ``self`` leaked
    into the parent as a stray positional argument. It has been removed.
    """
    super(BBSItem, self).__init__(*args, **kwargs)
    self['platform'] = ''
    self['keyword'] = ''
    self['insert_time'] = ctime.GetNowTime()  # stamped at construction time
    self['bbs_url'] = ''
    self['bbs_name'] = ''
    self['bbs_time'] = ''
    self['bbs_floor'] = ""
    self['bbs_info'] = {}
    self['bbs_source_url'] = ''
    self["bbs_id"] = ""
    self['spider'] = ''
    self['searchword'] = ''
    self["pkeyword"] = ""
    self["job_id"] = ""
    self["car_id"] = ""
    # Cache policy consumed downstream: 3-day TTL, file-backed.
    self["cache_info"] = {
        "ts": 86400 * 3,
        "file": True
    }
def json_2_redis(*args, **kw):
    """Push each line of a JSON-lines file onto a spider's Redis items queue.

    Keyword args:
        rcfg: Redis connection config (required; bails out if missing).
        fp: path of the JSON-lines file to load.
        spider: spider name; lines go to the "<spider>:items" list.
        ts: seconds to sleep when the queue is overloaded (default 1).
        rename: truthy -> rename the file to "<fp>1" after loading.
    """
    rcfg = kw.get("rcfg")
    if not rcfg:
        print("No rcfg" + "===" * 10)
        return
    rename = kw.get("rename", 0)
    conn_redis = dr.RedisHelper.get_redis_connect_by_cfg(rcfg)
    fp = kw.get("fp", "")
    ts = kw.get("ts", 1)
    spider = kw.get("spider")
    # Skip today's file when rename mode is on -- presumably it is still
    # being written to; TODO confirm against the producer side.
    if rename and str(fp).startswith(dt.get_today().replace("-", "_")):
        return
    # Bug fix: the file was opened inline in the for-statement and never
    # closed; the with-block guarantees the handle is released.
    with open(fp, encoding="utf-8") as src:
        for line in src:
            length = conn_redis.llen(spider + ":items")
            if length > 50000:
                # Backpressure: queue too long, pause before pushing anyway.
                bf.print_from_head(fp + "\t Too much,Please customer\t"
                                   + str(length) + "\t\t")
                time.sleep(ts)
            bf.print_blank_end(conn_redis.lpush(spider + ":items", line))
    if rename:
        uf.FileHelper.rename_file(fp, str(fp) + "1")
    print("=====File Over\t" + fp + "=====")
    conn_redis.connection_pool.disconnect()
def print_from_head(s, **kw):
    """Rewrite the current console line with a timestamp-prefixed *s*.

    The leading carriage return moves the cursor to the start of the line;
    extra keyword args are forwarded to print_no_end unchanged.
    """
    stamped = f"\r{Dt.get_now_time()}\t{str(s)}"
    print_no_end(stamped, **kw)
def printFromHead(s, **kw):
    """camelCase variant of print_from_head: rewrite the current console line.

    Prefixes *s* with a carriage return and a timestamp, then delegates to
    printNoEnd with any extra keyword args.
    """
    stamped = f"\r{dt.GetNowTime()}\t{str(s)}"
    printNoEnd(stamped, **kw)