Пример #1
0
 def process_request(self, request, spider):
     """Record the request's URL in the daily "to crawl" log file.

     The target path is the ``ROOT_PATH_LOG`` setting followed by today's
     date (from ``ctime.get_today()``) and the suffix ``_to_crawl.log``.
     The record holds the current time, the spider name and the request
     URL, separated by tabs and ended with a newline.

     Returns ``None`` implicitly.
     """
     log_root = ToCrawlUrl.settings.get("ROOT_PATH_LOG")
     save_path = log_root + ctime.get_today() + "_to_crawl.log"
     fields = [str(ctime.get_now_time()), spider.name, request.url]
     # method="a" is forwarded to log_file unchanged (presumably append
     # mode -- confirm against log_file).
     log_file(save_path, "\t".join(fields) + "\n", method="a")
Пример #2
0
 def process_request(self, request, spider):
     """Log a tab-separated record for the given request.

     The record contains the computed log path, the current time, the
     spider name and the request URL, emitted at INFO level through the
     root logger.

     The previous call passed ``method="a"`` to ``logging.info``, which
     the logging API does not accept (``TypeError`` as soon as INFO is
     enabled), and used the path as a format string with no placeholder
     for the record. Both are fixed here by using a real format string
     with lazy ``%s`` arguments.

     Returns ``None`` implicitly.
     """
     save_path = self.settings.get(
         "ROOT_PATH_LOG") + Dt.get_today() + "_to_crawl.log"
     # TODO(review): nothing is written to save_path itself; presumably
     # an append to that file was intended -- confirm and attach a file
     # handler (or a file helper) if so.
     logging.info("%s\t%s\t%s\t%s", save_path, Dt.get_now_time(),
                  spider.name, request.url)
Пример #3
0
 def process_response(self, request, response, spider):
     """Record the response status and URL in the daily crawled log file.

     The target path is the ``ROOT_PATH_LOG`` setting followed by today's
     date (from ``ctime.get_today()``) and the suffix ``_carwled.log``
     (spelling kept as is: it is part of the file name on disk).
     The record holds the current time, the spider name, the response
     status and the request URL, separated by tabs and ended with a
     newline.

     Returns the response unchanged.
     """
     log_root = CrawledUrl.settings.get("ROOT_PATH_LOG")
     save_path = log_root + ctime.get_today() + "_carwled.log"
     fields = [
         str(ctime.get_now_time()),
         spider.name,
         str(response.status),
         request.url,
     ]
     # method="a" is forwarded to log_file unchanged (presumably append
     # mode -- confirm against log_file).
     log_file(save_path, "\t".join(fields) + "\n", method="a")
     return response
Пример #4
0
 def process_response(self, request, response, spider):
     """Log a tab-separated record for the given response and return it.

     The record contains the computed log path, the current time, the
     spider name, the response status and the request URL, emitted at
     INFO level through the root logger.

     The previous call passed ``method="a"`` to ``logging.info``, which
     the logging API does not accept (``TypeError`` as soon as INFO is
     enabled), and used the path as a format string with no placeholder
     for the record. Both are fixed here by using a real format string
     with lazy ``%s`` arguments.

     Returns the response unchanged.
     """
     save_path = self.settings.get(
         "ROOT_PATH_LOG") + Dt.get_today() + "_carwled.log"
     # TODO(review): nothing is written to save_path itself; presumably
     # an append to that file was intended -- confirm and attach a file
     # handler (or a file helper) if so.
     logging.info("%s\t%s\t%s\t%s\t%s", save_path, Dt.get_now_time(),
                  spider.name, response.status, request.url)
     return response
Пример #5
0
def add_tai_yang(redis_host="127.0.0.1", db=11):
    """Top up a Redis database with proxies until it holds at least 3 keys.

    Once per second the key count of the selected Redis database is
    checked. While it is below 3, one proxy is requested from the HTTP
    API; on a ``code == 0`` answer the proxy is stored under the key
    ``"ip:port"`` with value ``0`` and a TTL derived from the reported
    ``expire_time``.

    Args:
        redis_host: Host passed to ``redis.Redis``.
        db: Redis database index.

    Returns:
        ``1`` once the database holds at least 3 keys.
    """
    r = redis.Redis(redis_host, db=db)
    url = "http://http-api.taiyangruanjian.com/getip?num=1&type" \
          "=2&pro=&city=0&yys=0&port=11&pack=13604&ts=1&ys=0&cs=0&lb=1&sb=0&pb=4&mr=0&regions="
    while True:
        time.sleep(1)
        if r.dbsize() >= 3:
            break
        resp = Rh.RequestHelper.send_cache_request(url)
        if resp.status != 200:
            print("proxy return error" + str(resp.text))
            continue
        try:
            json_data = json.loads(resp.text)
            if json_data["code"] == 0:
                entry = json_data["data"][0]
                ip = str(entry["ip"]) + ":" + str(entry["port"])
                # Seconds left until the proxy expires, plus one second.
                ets = int(
                    Dt.str_to_ts(entry["expire_time"]) - time.time() + 1)
                d("OK " + entry["expire_time"] + " ====== now + " +
                  str(ets // 60),
                  line1="===")
                r.set(ip, 0, ex=ets)
                time.sleep(1)
        except Exception:
            # Best effort: report the failure and retry on the next tick.
            # ``Exception`` (not a bare ``except``) so that Ctrl-C and
            # SystemExit can still stop the loop.
            print("not json data" + resp.text)
            traceback.print_exc()
    return 1
Пример #6
0
def add_zhima(r):
    """Keep a Redis database stocked with at least 2 proxy keys, forever.

    Once per second the key count of ``r`` is checked. While it is below
    2, one proxy is requested from the HTTP API; on a ``code == 0``
    answer the proxy is stored under the key ``"ip:port"`` with value
    ``0`` and a TTL derived from the reported ``expire_time``. When
    enough keys are present the function prints "Enough" and sleeps 10
    more seconds before checking again.

    Args:
        r: Redis client; ``dbsize()`` and ``set(key, value, ex=...)`` are
            the only methods used.

    This function never returns normally.
    """
    url = "http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=&city=0&yys=0&port=1&pack=15624&ts=1&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions="
    while True:
        time.sleep(1)
        if r.dbsize() >= 2:
            print("Enough")
            time.sleep(10)
            continue
        resp = Rh.RequestHelper.send_cache_request(url)
        if resp.status != 200:
            print("proxy return error" + str(resp.text))
            continue
        try:
            json_data = json.loads(resp.text)
            if json_data["code"] == 0:
                entry = json_data["data"][0]
                ip = str(entry["ip"]) + ":" + str(entry["port"])
                # Seconds left until the proxy expires, plus one second.
                ets = int(
                    Dt.str_to_ts(entry["expire_time"]) - time.time() + 1)
                d("OK " + entry["expire_time"] + " ====== now + " +
                  str(ets // 60),
                  line1="===")
                r.set(ip, 0, ex=ets)
                time.sleep(1)
            else:
                print(json_data)
        except Exception:
            # Best effort: report the failure and retry on the next tick.
            # ``Exception`` (not a bare ``except``) so that Ctrl-C and
            # SystemExit can still stop this endless loop.
            print("not json data" + resp.text)
            traceback.print_exc()
Пример #7
0
 def get_date_time(s):
     """Extract the first date-time found in ``s`` and normalise it.

     Recognised shape: ``<year><sep><month><sep><day>[日] HH:MM[:SS]`` where
     the separators are ``-``, ``/`` or the CJK markers 年 / 月.
     The markers 年 and 月 are replaced by ``-`` and 日 is dropped; ``/``
     separators are returned unchanged.

     Args:
         s: Text to search.

     Returns:
         ``""`` when ``s`` is empty or not a ``str``; ``None`` when no
         date-time is found; otherwise the normalised match.
     """
     if not s or not isinstance(s, str):
         return ""
     # The optional seconds part is a NON-capturing group. With a
     # capturing group, re.findall returned only that group (":SS" or "")
     # instead of the whole date-time.
     match = re.search(
         r"\d{2,4}[-/年]\d{1,2}[-/月]\d{1,2}日? \d{2}:\d{2}(?::\d{2})?", s)
     if match is None:
         return None
     return match.group(0).replace('年', '-').replace('月', '-').replace(
         '日', '')
Пример #8
0
 def get_date(s):
     """Extract the first date found in ``s`` and normalise it.

     Recognised shapes: ``YYYY-MM-DD`` (also with ``/`` or ``.``),
     ``YYYY年M月D日`` / ``YYYY年M月D号``, the word ``今天`` (today), and the
     year-less ``M月D日`` / ``M月D号``.
     The markers 年 and 月 become ``-`` and 日 / 号 are dropped; ``/`` and
     ``.`` separators are returned unchanged.

     Fixes over the previous version:
     * the year-less form is now detected from the match itself rather
       than from its length, so single-digit month and day ("1月2日")
       also get the current year;
     * the year prefix is joined with ``-`` (previously "2020" + "12-25"
       produced "202012-25");
     * debug prints are removed and a missing match no longer goes
       through an exception.

     Args:
         s: Text to search.

     Returns:
         ``""`` when ``s`` is empty or not a ``str``; ``None`` when no
         date is found; otherwise the normalised date string.
     """
     if not s or not isinstance(s, str):
         return ""
     # NOTE(review): "[-|/|.]" is a character class, so a literal "|" is
     # accepted as a separator too; left unchanged to keep matching as is.
     match = re.search(
         r'\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}|\d{4}年\d{1,2}月\d{1,2}[日号]|今天|\d{1,2}月\d{1,2}[日号]',
         s)
     if match is None:
         return None
     raw = match.group(0)
     if raw == "今天":
         return DT.GetToday()
     normalized = raw.replace('年', '-').replace('月', '-').replace(
         '日', '').replace('号', '')
     if '月' in raw and '年' not in raw:
         # Year-less match: prefix the first "-"-separated field of
         # DT.GetToday() (assumed to be the year -- confirm).
         return str(DT.GetToday().split("-")[0]) + "-" + normalized
     return normalized
Пример #9
0
 def inner(*args, **kwargs):
     """Append ``json_data`` to a per-spider daily JSON file, then call ``func``.

     ``json_data``, ``spider`` and ``fp`` are read from keyword arguments
     when fewer than two positional arguments are given, otherwise from
     positional slots 0, 1 and 3. When ``json_data`` is truthy it is
     dumped (non-ASCII kept as is) in append mode to
     ``fp + spider + os.sep + <today with "_" instead of "-"> + ".json"``.

     Returns whatever the wrapped ``func`` returns for the same arguments.
     """
     if len(args) < 2:
         json_data = kwargs.get("json_data", None)
         spider = kwargs.get("spider", None)
         fp = kwargs.get("fp", None)
     else:
         json_data = args[0]
         spider = args[1]
         # NOTE(review): slot 3 (not 2) is read, so 2 or 3 positional
         # arguments raise IndexError -- confirm against the signature of
         # the wrapped function.
         fp = args[3]
     if json_data:
         today = dt.get_today().replace("-", "_")
         uf.FileHelper.mkdir(fp + spider)
         target = fp + spider + os.sep + today + ".json"
         # Context manager so the handle is flushed and closed right away;
         # it used to be left open after json.dump.
         # NOTE(review): no separator is written between appended
         # documents -- confirm that the reader of this file expects that.
         with open(target, "a", encoding="utf-8") as out:
             json.dump(json_data, out, ensure_ascii=False)
     return func(*args, **kwargs)  # 2
Пример #10
0
 def __init__(self, *args, **kwargs):
     """Initialise the item and assign a default value to every field.

     ``insert_time`` is set to ``ctime.GetNowTime()``; ``cache_info``
     defaults to ``{"ts": 86400 * 3, "file": True}``; ``bbs_info`` to an
     empty dict; all other fields to an empty string.

     NOTE(review): the defaults are assigned after the base initialiser,
     so any of these fields supplied by the caller is overwritten --
     confirm this is intended.
     """
     # super() already binds self. Passing it again handed the instance to
     # the base class as an extra positional argument.
     super(BBSItem, self).__init__(*args, **kwargs)
     self['platform'] = ''
     self['keyword'] = ''
     self['insert_time'] = ctime.GetNowTime()
     self['bbs_url'] = ''
     self['bbs_name'] = ''
     self['bbs_time'] = ''
     self['bbs_floor'] = ""
     self['bbs_info'] = {}
     self['bbs_source_url'] = ''
     self["bbs_id"] = ""
     self['spider'] = ''
     self['searchword'] = ''
     self["pkeyword"] = ""
     self["job_id"] = ""
     self["car_id"] = ""
     self["cache_info"] = {
         "ts": 86400 * 3,
         "file": True
     }
Пример #11
0
    def json_2_redis(*args, **kw):
        """Push every line of a text file onto the Redis list ``<spider>:items``.

        Keyword arguments:
            rcfg: Redis configuration handed to
                ``dr.RedisHelper.get_redis_connect_by_cfg``; required.
            fp: Path of the UTF-8 file to read (default ``""``).
            spider: Prefix of the target list key.
            ts: Seconds to sleep before a push when the list already holds
                more than 50000 entries (default ``1``).
            rename: When truthy, files whose ``fp`` starts with today's
                date (``-`` replaced by ``_``) are skipped, and a processed
                file is renamed to ``fp + "1"``.

        Positional arguments are accepted but ignored. Returns ``None``.
        """
        rcfg = kw.get("rcfg")
        if not rcfg:
            print("No rcfg" + "===" * 10)
            return
        rename = kw.get("rename", 0)
        fp = kw.get("fp", "")
        ts = kw.get("ts", 1)
        spider = kw.get("spider")
        # Skip check happens before connecting, so the early return no
        # longer leaves an open connection behind.
        # NOTE(review): startswith() is applied to the whole of fp, so this
        # only triggers when fp begins with the date -- confirm callers
        # pass a bare file name.
        if rename and str(fp).startswith(dt.get_today().replace("-", "_")):
            return

        conn_redis = dr.RedisHelper.get_redis_connect_by_cfg(rcfg)
        try:
            # The file is closed before the rename below and on any error.
            with open(fp, encoding="utf-8") as src:
                for line in src:
                    queue_key = spider + ":items"
                    length = conn_redis.llen(queue_key)
                    if length > 50000:
                        # Back off once per line; the push still happens.
                        bf.print_from_head(fp +
                                           "\t Too much,Please customer\t" +
                                           str(length) + "\t\t")
                        time.sleep(ts)
                    bf.print_blank_end(conn_redis.lpush(queue_key, line))
            if rename:
                uf.FileHelper.rename_file(fp, str(fp) + "1")
            print("=====File Over\t" + fp + "=====")
        finally:
            # Always release the pooled connections, even on failure.
            conn_redis.connection_pool.disconnect()
Пример #12
0
def print_from_head(s, **kw):
    """Print ``s`` behind a carriage return and the current time.

    The text handed to ``print_no_end`` is ``"\\r"``, the value of
    ``Dt.get_now_time()``, a tab and ``str(s)``; extra keyword arguments
    are forwarded unchanged.
    """
    prefix = "\r" + Dt.get_now_time()
    print_no_end(prefix + "\t" + str(s), **kw)
Пример #13
0
def printFromHead(s, **kw):
    """Print ``s`` behind a carriage return and the current time.

    The text handed to ``printNoEnd`` is ``"\\r"``, the value of
    ``dt.GetNowTime()``, a tab and ``str(s)``; extra keyword arguments
    are forwarded unchanged.
    """
    prefix = "\r" + dt.GetNowTime()
    printNoEnd(prefix + "\t" + str(s), **kw)