Пример #1
0
 def process_request(self, request, spider):
     """Record an outgoing request in the "to crawl" log file.

     The log path is built from the ``ROOT_PATH_LOG`` setting, the value of
     ``ctime.get_today()`` and the ``_to_crawl.log`` suffix.  One line is
     handed to ``log_file`` with ``method="a"``: the current time, the spider
     name and the request URL, separated by tabs.

     Returns ``None`` implicitly.
     """
     log_dir = ToCrawlUrl.settings.get("ROOT_PATH_LOG")
     log_path = log_dir + ctime.get_today() + "_to_crawl.log"
     fields = [str(ctime.get_now_time()), spider.name, request.url]
     log_file(log_path, "\t".join(fields) + "\n", method="a")
Пример #2
0
 def process_request(self, request, spider):
     """Log a "to crawl" record for the outgoing request.

     The record holds the computed log path (``ROOT_PATH_LOG`` setting +
     ``Dt.get_today()`` + ``_to_crawl.log``), the current time, the spider
     name and the request URL, separated by tabs.

     Returns ``None`` implicitly.
     """
     save_path = self.settings.get(
         "ROOT_PATH_LOG") + Dt.get_today() + "_to_crawl.log"
     # ``logging.info`` only accepts the ``exc_info``/``stack_info``/
     # ``stacklevel``/``extra`` keywords, so a ``method="a"`` argument raises
     # TypeError once INFO is enabled.  The fields are passed as lazy %-args
     # against an explicit format string instead.
     # TODO: logging.info cannot target ``save_path`` on its own; attach a
     # FileHandler for it if a per-day file is still required.
     logging.info("%s\t%s\t%s\t%s", save_path, Dt.get_now_time(),
                  spider.name, request.url)
Пример #3
0
 def process_response(self, request, response, spider):
     """Record a received response in the "crawled" log file.

     The log path is built from the ``ROOT_PATH_LOG`` setting, the value of
     ``ctime.get_today()`` and the ``_carwled.log`` suffix.  One line is
     handed to ``log_file`` with ``method="a"``: the current time, the spider
     name, the response status and the request URL, separated by tabs.

     Returns the response unchanged.
     """
     log_dir = CrawledUrl.settings.get("ROOT_PATH_LOG")
     log_path = log_dir + ctime.get_today() + "_carwled.log"
     fields = [
         str(ctime.get_now_time()),
         spider.name,
         str(response.status),
         request.url,
     ]
     log_file(log_path, "\t".join(fields) + "\n", method="a")
     return response
Пример #4
0
 def process_response(self, request, response, spider):
     """Log a "crawled" record for the received response.

     The record holds the computed log path (``ROOT_PATH_LOG`` setting +
     ``Dt.get_today()`` + ``_carwled.log``), the current time, the spider
     name, the response status and the request URL, separated by tabs.

     Returns the response unchanged.
     """
     save_path = self.settings.get(
         "ROOT_PATH_LOG") + Dt.get_today() + "_carwled.log"
     # ``logging.info`` only accepts the ``exc_info``/``stack_info``/
     # ``stacklevel``/``extra`` keywords, so a ``method="a"`` argument raises
     # TypeError once INFO is enabled.  The fields are passed as lazy %-args
     # against an explicit format string instead.
     # TODO: logging.info cannot target ``save_path`` on its own; attach a
     # FileHandler for it if a per-day file is still required.
     logging.info("%s\t%s\t%s\t%s\t%s", save_path, Dt.get_now_time(),
                  spider.name, response.status, request.url)
     return response
Пример #5
0
 def get_date_time(s):
     """Extract the first date-time (date plus ``HH:MM[:SS]``) found in *s*.

     The date part may use ``-``, ``/`` or the ``年``/``月``/``日`` markers,
     e.g. ``2020-01-02 10:20``, ``2020/1/2 10:20:30`` or ``2020年1月2日 10:20``.
     In the result ``年`` and ``月`` are rewritten to ``-`` and ``日`` is
     dropped; ``/`` separators are left untouched.

     Args:
         s: Text to search.

     Returns:
         ``""`` when *s* is empty or not a ``str``, ``None`` when no
         date-time is found, otherwise the normalised date-time string.
     """
     if not s or not isinstance(s, str):
         return ""
     # The optional seconds group must be non-capturing: with a capturing
     # group ``re.findall`` yields only that group (":30" or ""), never the
     # whole match.  ``re.search`` + ``group(0)`` always gives the full text.
     pattern = r"\d{2,4}[-/年]\d{1,2}[-/月]\d{1,2}日? \d{2}:\d{2}(?::\d{2})?"
     match = re.search(pattern, s)
     if match is None:
         return None
     return match.group(0).replace('年', '-').replace('月', '-').replace(
         '日', '')
Пример #6
0
 def get_date(s):
     """Extract the first date found in *s* and normalise it.

     Recognised forms, tried in this order at each position:

     * ``YYYY-M-D`` with ``-``, ``/`` or ``.`` separators (kept as written);
     * ``YYYY年M月D日`` / ``YYYY年M月D号`` (rewritten to ``YYYY-M-D``);
     * ``今天`` (replaced by ``DT.get_today()``);
     * ``M月D日`` / ``M月D号`` (rewritten to ``<year>-M-D`` where the year is
       the text before the first ``-`` in ``DT.get_today()``).

     Args:
         s: Text to search.

     Returns:
         ``""`` when *s* is empty or not a ``str``, ``None`` when no date is
         found, otherwise the normalised date string.
     """
     if not s or not isinstance(s, str):
         return ""
     # Inside a character class ``|`` is a literal pipe, not alternation, so
     # the separators are listed plainly as ``[-/.]``.
     pattern = (r'\d{4}[-/.]\d{1,2}[-/.]\d{1,2}'
                r'|\d{4}年\d{1,2}月\d{1,2}[日号]'
                r'|今天'
                r'|\d{1,2}月\d{1,2}[日号]')
     match = re.search(pattern, s)
     if match is None:
         return None
     raw = match.group(0)
     if raw == "今天":
         return DT.get_today()
     normalised = raw.replace('年', '-').replace('月', '-').replace(
         '日', '').replace('号', '')
     if '月' in raw and '年' not in raw:
         # Month-day only: prepend the current year *with* a separator so the
         # result has the same shape as the full-date forms.
         year = str(DT.get_today().split("-")[0])
         return year + "-" + normalised
     return normalised
Пример #7
0
 def inner(*args, **kwargs):
     """Dump ``json_data`` to a per-spider JSON file, then call ``func``.

     ``json_data``, ``spider`` and ``fp`` are read from ``kwargs`` when fewer
     than two positional arguments are given, otherwise from positional
     indexes 0, 1 and 3.

     When ``json_data`` is truthy, the directory ``fp + spider`` is passed to
     ``uf.FileHelper.mkdir`` and the data is appended as JSON (non-ASCII
     characters kept) to ``fp + spider + os.sep + <today>.json``, where
     ``<today>`` is ``dt.get_today()`` with ``-`` replaced by ``_``.

     Returns:
         Whatever ``func(*args, **kwargs)`` returns.
     """
     if len(args) < 2:
         json_data = kwargs.get("json_data", None)
         spider = kwargs.get("spider", None)
         fp = kwargs.get("fp", None)
     else:
         json_data = args[0]
         spider = args[1]
         # NOTE(review): index 3 (not 2) raises IndexError when only two or
         # three positional arguments are passed -- confirm against the
         # wrapped function's signature.
         fp = args[3]
     if json_data:
         today = dt.get_today().replace("-", "_")
         uf.FileHelper.mkdir(fp + spider)
         out_path = fp + spider + os.sep + today + ".json"
         # Close the handle deterministically so the dump is flushed to disk
         # before the wrapped function runs.
         with open(out_path, "a", encoding="utf-8") as out_file:
             json.dump(json_data, out_file, ensure_ascii=False)
     return func(*args, **kwargs)
Пример #8
0
    def json_2_redis(*args, **kw):
        """Push every line of a text file onto the ``<spider>:items`` Redis list.

        Keyword arguments (positional arguments are ignored):
            rcfg: Redis configuration passed to
                ``dr.RedisHelper.get_redis_connect_by_cfg``; when falsy a
                notice is printed and nothing else happens.
            rename: When truthy, files whose name starts with today's date
                (``dt.get_today()`` with ``-`` replaced by ``_``) are skipped,
                and processed files are renamed to ``fp + "1"`` afterwards.
            fp: Path of the UTF-8 file to read (default ``""``).
            ts: Seconds to sleep when the list is longer than 50000 entries
                (default ``1``).
            spider: Prefix of the Redis list key.

        Returns:
            ``None``.
        """
        rcfg = kw.get("rcfg")
        if not rcfg:
            print("No rcfg" + "===" * 10)
            return
        rename = kw.get("rename", 0)
        fp = kw.get("fp", "")
        ts = kw.get("ts", 1)
        spider = kw.get("spider")
        # Decide whether to skip before connecting, so the early return does
        # not leave a Redis connection behind.
        if rename and str(fp).startswith(dt.get_today().replace("-", "_")):
            return

        conn_redis = dr.RedisHelper.get_redis_connect_by_cfg(rcfg)
        queue_key = spider + ":items"
        try:
            # Close the file before renaming it; an open handle can make the
            # rename fail on some platforms.
            with open(fp, encoding="utf-8") as source:
                for line in source:
                    length = conn_redis.llen(queue_key)
                    if length > 50000:
                        # Pause once, then push anyway.
                        bf.print_from_head(fp + "\t Too much,Please customer\t" +
                                           str(length) + "\t\t")
                        time.sleep(ts)
                    bf.print_blank_end(conn_redis.lpush(queue_key, line))
            if rename:
                uf.FileHelper.rename_file(fp, str(fp) + "1")
            print("=====File Over\t" + fp + "=====")
        finally:
            # Release the pool's connections even if reading or pushing failed.
            conn_redis.connection_pool.disconnect()