def process_exception(self, request, exception, spider):
    ex_class = global_object_name(exception.__class__)
    ticker = request.meta["ticker"]
    self.stats.inc_value('downloader/exception_count', spider=spider)
    self.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
    self.stats.set_value(f'downloader/my_errors/{ticker}', ex_class)
def _retry(self, request, reason, spider):
    # Increment the failure tag for this proxy in the proxy list, marking one more failure
    self.add_proxy_tag(request.meta.get('proxy', ''))
    # Check the failure count; if the proxy has failed too often, delete it
    now_proxy = request.meta.get('proxy', '')
    now_ip_port = now_proxy.split(':')[1][2:] + ':' + now_proxy.split(':')[2]
    for each in IP_PORT_LIST:
        if each != '':
            if now_ip_port == each[0] and each[1] == 8:
                self.delete_proxy(request.meta.get('proxy', False))
                self.delete_list_proxy(request.meta.get('proxy', False))
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        # Delete this proxy and remove it from the list
        # self.delete_proxy(request.meta.get('proxy', False))
        # self.delete_list_proxy(request.meta.get('proxy', False))
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        # Delete this proxy and remove it from the list
        # self.delete_proxy(request.meta.get('proxy', False))
        # self.delete_list_proxy(request.meta.get('proxy', False))
        stats.inc_value('retry/max_reached')
        logger.debug(
            "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        url = request.url
        self.server.lpush('programList:starturls', url)
        proxy = request.meta['proxy']
        self.server.srem("proxy_set", proxy)
        logger.debug(
            "rewrite %(request)s to redis because retrying failed %(retries)d times: %(reason)s, "
            "and remove the bad proxy %(proxy)s",
            {'request': request, 'retries': retries, 'reason': reason, 'proxy': proxy},
            extra={'spider': spider})
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        request.dont_filter = True
        request.priority = request.priority + self.priority_adjust
        return request
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
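# Usage sketch (not part of the snippets above): one hedged way a custom retry middleware
# such as the _retry override above could be wired into a Scrapy project's settings.py.
# The module path "myproject.middlewares.CustomRetryMiddleware" is a hypothetical placeholder;
# the stock RetryMiddleware is registered by Scrapy at priority 550.
DOWNLOADER_MIDDLEWARES = {
    # Disable the built-in retry middleware so only the custom subclass runs.
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    # Register the custom subclass at the slot the built-in one normally occupies.
    'myproject.middlewares.CustomRetryMiddleware': 550,
}

# Settings read by RetryMiddleware.__init__ (and therefore by subclasses that reuse it):
RETRY_ENABLED = True
RETRY_TIMES = 5               # becomes self.max_retry_times
RETRY_PRIORITY_ADJUST = -1    # becomes self.priority_adjust
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]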
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_proxy_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("RetryWithProxy: Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_proxy_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        _proxy = next(ips)[0]
        proxy = _proxy['http'] if 'http' in _proxy.keys() else _proxy['https']
        retryreq.meta['proxy'] = proxy
        logger.info(f"Retrying with proxy: {proxy}")
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.info("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                    {'request': request, 'retries': retries, 'reason': reason},
                    extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value(f'retry/reason_count/{reason}')
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        # Give up on the timed-out URL and record it
        with open(self.errorpath, 'a+') as fp:
            fp.write(str(request.url) + "\n")
def init_database(self, pool, mysql_config, db, table, item, taskid, spider_name):
    response, spider = self._hook(taskid, spider_name)  # there is an invisible hook here
    # Note that some older MySQL versions do not support utf8mb4,
    # so make sure you are running MySQL newer than 5.5.
    charset = mysql_config.get('charset')
    '''
    CREATE TABLE `student` (
        `s_id`   MEDIUMTEXT NULL,
        `s_name` MEDIUMTEXT NULL,
        `s_age`  MEDIUMTEXT NULL,
        `s_msg`  MEDIUMTEXT NULL
    );
    '''
    try:
        conn = pool.dbapi.connect(**mysql_config)
        cursor = conn.cursor()
        table_sql = ""
        for k, v in item.items():
            # Create the database and table; every column is stored as MEDIUMTEXT.
            # MEDIUMTEXT holds up to 16 MB, which is plenty for typical HTML text.
            table_sql += '`{}` MEDIUMTEXT NULL,'.format(str(k))
        cursor.execute('Create Database If Not Exists {} Character Set {}'.format(db, charset))
        cursor.execute('Create Table If Not Exists `{}`.`{}` ({})'.format(db, table, table_sql.strip(',')))
        conn.commit()
        cursor.close()
        conn.close()
    except Exception as e:
        traceback.print_exc()
        ex_class = global_object_name(e.__class__)
        self.stats.inc_value('create_db/exception_count', spider=spider)
        self.stats.inc_value('create_db/exception_type_count/%s' % ex_class, spider=spider)
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        # All retries failed: save the failing URL and its parameters - start
        error_request = spider.name + ":error_urls"
        self.redis_client.sadd(error_request, request.url)
        # All retries failed: save the failing URL and its parameters - end
        stats.inc_value('retry/max_reached')
        logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
def process_exception(self, request, exception, spider):
    ex_class = global_object_name(exception.__class__)
    ## count of exceptions
    self.stats.inc_value('downloader/exception_count', spider=spider)
    ## count of exceptions per exception type
    self.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        # Fetch a fresh proxy and attach it to the retried request
        proxy_ip = "http://" + getProxy()
        retryreq.meta['proxy'] = proxy_ip
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        # print(retryreq)
        # print("*" * 100)
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        with open('error.txt', 'a') as f:
            f.write(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]-{request.url}--{reason}--MyRetry\n")
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        retryreq = request.copy()
        # This is the function that was patched: the retry path recreates the request via
        # Request.copy(), which does not carry _plusmeta over, so it must be copied here
        # explicitly or things break.
        # The only real change is the single line added below.
        retryreq._plusmeta = request._plusmeta
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
def _retry(self, request, reason, spider):
    retries = request.meta.get("retry_times", 0) + 1
    stats = spider.crawler.stats
    spider.logger.debug(
        "Retrying %(request)s (failed %(retries)d times): %(reason)s",
        {"request": request, "retries": retries, "reason": reason},
        extra={"spider": spider},
    )
    retryreq = request.copy()
    retryreq.meta["retry_times"] = retries
    retryreq.dont_filter = True
    if isinstance(reason, Exception):
        reason = global_object_name(reason.__class__)
    stats.inc_value("retry/count")
    stats.inc_value("retry/reason_count/%s" % reason)
    current_folio = request.meta["state"].current_folio
    spider.logger.info(
        "error: %s on folio %s, backing off %s seconds",
        reason,
        current_folio,
        retries,
    )
    time.sleep(1 * retries)
    return retryreq
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        # ua = random.choice(user_agent_list)
        # request.headers.setdefault('User-Agent', ua)
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        # Fetch a fresh proxy and attach it to the retried request
        proxy_ip = "http://" + getProxy()
        retryreq.meta['proxy'] = proxy_ip
        # retryreq.meta['headers'] = proxy_ip
        print(proxy_ip)
        print("-" * 100)
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        time.sleep(3)
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        spider.logger.warning('{0} exceeded the retry limit; giving up'.format(request.url))
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        spider.logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        if (isinstance(reason, (TimeoutError, TCPTimedOutError, ConnectionRefusedError))
                and not spider.direct_connection):
            # retryreq = self.change_request_proxy(request, spider)
            retryreq = spider.change_request_proxy(request)
            return retryreq
        # elif isinstance(reason, HttpError) and reason.value.response.status == 429:
        #     if not spider.direct_connection:
        #         retryreq = self.change_request_proxy(request, spider)
        #         return retryreq
        #     else:
        #         sleep_time = F.rand_int((180, 600))
        #         spider.logger.info('Meet 429 code! Sleep for {} seconds...'.format(sleep_time))
        #         self.crawler.engine.pause()
        #         time.sleep(sleep_time)
        #         spider.logger.info('Wake up!')
        #         self.crawler.engine.unpause()
        #         return request
        else:
            stats.inc_value('retry/max_reached')
            spider.logger.debug(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                {'request': request, 'retries': retries, 'reason': reason},
                extra={'spider': spider})
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries < retry_times:
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        # Remove the failed proxy
        self.delete_proxy(request.meta.get('proxy', False))
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        # Set a new proxy IP
        global IP_PORT_QUEUE
        if IP_PORT_QUEUE.empty():
            IP_PORT_QUEUE = get_ip_port_queue()
        ip_port = IP_PORT_QUEUE.get()
        # Drop the current proxy
        # request.meta.pop('proxy')
        retryreq.meta['proxy'] = 'http://' + ip_port
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.debug(
            "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
def _retry(self, request, reason, spider, response):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    elif self.server is not None:
        stats.inc_value('retry/max_reached')
        logger.debug(
            "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        reason = response_status_message(response.status)
        dict_response = {'url': request.url, 'reason': reason, 'retries': retries}
        data = self.default_serialize(dict_response)
        print('*' * 10 + 'record invalid request and url is %s' % request.url + '*' * 10)
        retry_key = '%(spider)s:invailrequest' % {'spider': spider.name}
        self.server.rpush(retry_key, data)
        return response
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        # Modification 2
        # http_proxy = request.meta.get('http_proxy')
        # self.delete_proxy(http_proxy)
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        # Modification 1
        formdata = unquote(request.body.decode('utf-8'))
        http_proxy = request.meta.get('http_proxy')
        self.cant_retry_formdata_set.add(formdata + ' ' + http_proxy)
        # print(formdata + ' ' + http_proxy)
        stats.inc_value('retry/max_reached')
        logger.debug(
            "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
def insert_item(self, conn, db, table, item, taskid, spider_name):
    response, spider = self._hook(taskid, spider_name)  # there is an invisible hook here
    # Use json as a generic wrapper: the data type is preserved on write,
    # and json.loads restores it when the data is read back.
    table_sql = ""
    for k, v in item.items():
        table_sql += "'{}',".format(json.dumps(v))
    try:
        conn.execute('INSERT INTO `{}`.`{}` VALUES({})'.format(db, table, table_sql.strip(',')))
        self.stats.inc_value('item_mysql/db:{}/table:{}/count'.format(db, table), spider=spider)
    except Exception as e:
        traceback.print_exc()
        ex_class = global_object_name(e.__class__)
        self.stats.inc_value('item/exception_count', spider=spider)
        self.stats.inc_value('item/exception_type_count/%s' % ex_class, spider=spider)
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        _proxy = Proxy()
        number = random.randint(20, 50)
        proxy_id = _proxy.get_ip(server_id=number)
        proxy_id = proxy_id.decode()
        proxy = "http://" + proxy_id + ":9990"
        retryreq.meta["proxy"] = proxy
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        _proxy = Proxy()
        proxy = _proxy.get()
        proxy2 = _proxy.get()
        proxy3 = _proxy.get()
        proxy4 = _proxy.get()
        request.meta["proxy"] = "http://" + proxy4
        request.dont_filter = True
        request.priority = request.priority + self.priority_adjust
        # return self.process_request(request, spider)
        return request
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    print("request.meta.get('retry_times'):", request.meta.get('retry_times'))
    retry_times = self.max_retry_times
    print("retry_times:", type(retry_times), retry_times)
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']  # read the default retry count from here when settings do not set it
        print("default retry count (not set in settings):", type(retry_times), request.meta['max_retry_times'])
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        print("Entering the else branch: giving up and writing the failing url to redis")
        r.sadd('jiayuan_except', request.url)  # use a set to avoid duplicates
        r.save()
        stats.inc_value('retry/max_reached')
        logger.debug(
            "Gave up retrying (retry limit of 10000 reached) %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
def process_exception(self, request, exception, spider):
    if (isinstance(exception, self.EXCEPTIONS_TO_RETRY)
            and not request.meta.get('dont_retry', False)):
        if isinstance(exception, (TunnelError, defer.TimeoutError, TimeoutError)):
            if self.need_switch_proxy:
                request.meta["need_switch_proxy"] = True
        ret = self._retry(request, exception, spider)
        if ret:
            return ret
        else:
            # max retries reached: hand back a placeholder response instead of the exception
            response = HtmlResponse(url='', request=request)
            response._status = 1
            if isinstance(exception, Exception):
                reason = global_object_name(exception.__class__)
                self.logger.debug("max retries reached because of {}!".format(reason))
            return response
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    proxy = request.meta['proxy']
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        # logger.debug('Retrying proxy <%s> #%d: %s' % (proxy, retries, reason))
        retryreq = self._get_retry_request(request, retries)
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
    else:
        stats.inc_value('retry/max_reached')
        logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        # try:
        del self.proxies[proxy]
        retryreq = self._get_retry_request(request, 0)
        self.change_proxy(retryreq)
        logger.error('Removing proxy <%s>, %d proxies left' % (proxy, len(self.proxies)))
        # except (ValueError, KeyError):
        #     pass
    return retryreq
def _retry(self, request, reason, spider):
    retries = request.meta.get("retry_times", 0) + 1
    retry_times = self.max_retry_times
    if "max_retry_times" in request.meta:
        retry_times = request.meta["max_retry_times"]
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {"request": request, "retries": retries, "reason": reason},
            extra={"spider": spider},
        )
        retryreq = request.copy()
        retryreq.meta["retry_times"] = retries
        retryreq.meta["refresh_cache"] = True
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value("retry/count")
        stats.inc_value("retry/reason_count/%s" % reason)
        return retryreq
    else:
        stats.inc_value("retry/max_reached")
        logger.debug(
            "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
            {"request": request, "retries": retries, "reason": reason},
            extra={"spider": spider},
        )
def _retry(self, request, reason, spider):
    """modified logger level"""
    retries = request.meta.get('retry_times', 0) + 1
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.info(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.meta['change_proxy'] = True  # if using a proxy, change proxy
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.warning(
            "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
            {'request': request, 'retries': retries, 'reason': reason},
            extra={'spider': spider})
def retry(self, request: ScrapflyScrapyRequest, reason: Union[str, Exception], delay: Optional[int] = None):
    logger.info('==> Retrying request for reason %s' % reason)
    stats = self.crawler.stats
    retries = request.meta.get('retry_times', 0) + 1

    if retries >= self.custom_settings.get('SCRAPFLY_MAX_API_RETRIES', 5):
        return None

    retryreq = request.replace(dont_filter=True)
    retryreq.priority += 100

    if retryreq.scrape_config.cache is True:
        retryreq.scrape_config.cache_clear = True

    retryreq.meta['retry_times'] = retries

    if isinstance(reason, ScrapflyError):
        stats.inc_value(f'scrapfly/api_retry/{reason.code}')

    if isinstance(reason, Exception):
        reason = global_object_name(reason.__class__)

    logger.warning(f"Retrying {request} for x{retries - 1}: {reason}", extra={'spider': self})
    stats.inc_value('scrapfly/api_retry/count')

    if delay is None:
        deferred = Deferred()
        deferred.addCallback(self.crawler.engine.schedule, request=retryreq, spider=self)
    else:
        deferred = task.deferLater(reactor, delay, self.crawler.engine.crawl, retryreq, self)

    return deferred
def process_exception(self, request, exception, spider):
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return
    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    latency = time.time() - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})
    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})
    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    stats = spider.crawler.stats
    logger.debug(
        "Retrying %(request)s (failed %(retries)d times): %(reason)s",
        {'request': request, 'retries': retries, 'reason': reason},
        extra={'spider': spider})
    retryreq = request.copy()
    retryreq.meta['retry_times'] = retries
    proxy = self.get_random_proxy()
    retryreq.meta['proxy'] = proxy
    retryreq.dont_filter = True
    retryreq.priority = request.priority + 1
    if isinstance(reason, Exception):
        reason = global_object_name(reason.__class__)
    stats.inc_value('retry/count')
    stats.inc_value('retry/reason_count/%s' % reason)
    return retryreq
def _retry(self, request, reason, spider):
    response_retries = request.meta.get('response_retry', 0)
    exception_retries = request.meta.get('exception_retry', 0)
    print("response_retries is %s" % response_retries)
    print("exception_retries is %s" % exception_retries)
    retries = response_retries + exception_retries
    retry_times = self.max_retry_times
    if 'max_retry_times' in request.meta:
        retry_times = request.meta['max_retry_times']
    stats = spider.crawler.stats
    if retries <= retry_times:
        logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)
        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
    else:
        stats.inc_value('retry/max_reached')
        logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                     {'request': request, 'retries': retries, 'reason': reason},
                     extra={'spider': spider})
        # If exceptions account for most of the failures, the proxy IP is most likely dead
        if exception_retries > response_retries:
            # Wrap a dummy response and hand it back to the middleware closer to the engine,
            # i.e. the MiddlewareIpagentDownloaderMiddleware defined above
            response = HtmlResponse(url='retry_over_exception')
            return response
def process_exception(self, request, exception, spider):
    ex_class = global_object_name(exception.__class__)
    self.stats.inc_value('downloader/exception_count', spider=spider)
    self.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)