def process_exception(self, request, exception, spider):
    """Let the stock RetryMiddleware handle the exception, then record the failure.

    The failed request's URL is copied into ``request.meta['url']`` so that
    ``record_failed`` can look it up under the ``'url'`` meta key.
    """
    # Base middleware decides whether this request gets retried or dropped.
    outcome = RetryMiddleware.process_exception(
        self, request, exception, spider)
    # customize retry middleware by modifying this
    request.meta['url'] = request.url
    self.record_failed('failed.txt', request, exception, 'url')
    return outcome
def process_exception(self, request, exception, spider):
    """Delegate to RetryMiddleware, then persist the failed URL to disk.

    ``request.meta['url']`` is populated first so ``record_failed`` can
    extract the URL by meta key when appending to ``failed.txt``.
    """
    result = RetryMiddleware.process_exception(self, request, exception,
                                               spider)
    # customize retry middleware by modifying this
    request.meta['url'] = request.url
    self.record_failed('failed.txt', request, exception, 'url')
    return result
def _retry(self, request, reason, spider):
    """Custom retry: mark 404'd car-detail listings offline, otherwise rotate proxy.

    On a 404 from ``CarDetailSpider`` the matching ``CarInfo`` row is flagged
    offline in the DB and the retry budget is exhausted so the request is
    dropped.  For all other failures a fresh proxy (if any) is attached and
    the standard retry path is taken.
    """
    if isinstance(reason, TCPTimedOutError):
        # Shorten the noisy timeout message before it is logged downstream.
        reason.args = (u'...',)
    retries = request.meta.get('retry_times', 0)
    # A 404 on a car-detail page means the listing is gone: record that fact
    # and burn the remaining retries so the scheduler discards the request.
    if str(reason).find('404') > -1 and request.callback.im_class == CarDetailSpider:
        ci = request.cookies[FetchConstant.CarInfo]
        fs = FetchSession()
        ci_exist = fs.query(CarInfo).filter(CarInfo.seqid == ci.seqid).first()
        if ci_exist:
            try:
                ci_exist.statustype = CarInfoValueConst.offline
                ci_exist.offlinedatetime = datetime.datetime.today()
                fs.commit()
                msg = (u'[404] seqid: %s ,url not exist %s') % (ci.seqid, request.url,)
                spider.log(msg, log.INFO)
            except Exception:
                # FIX: was a bare `except:` which also swallowed
                # SystemExit/KeyboardInterrupt; roll back on any DB error.
                fs.rollback()
            finally:
                fs.close()
        request.meta['retry_times'] = self.max_retry_times
        return RetryMiddleware._retry(self, request, reason, spider)
    if retries <= self.max_retry_times - 1:
        next_proxy = get_valid_proxy.next()
        rs = request.copy()
        if next_proxy:
            proxy_str = next_proxy.build_literal()
            # dont_filter so the dupe filter does not eat the retried request.
            rs = rs.replace(dont_filter=True)
            rs.meta['proxy'] = proxy_str
            msg = (u'use to %s access %s ') % (proxy_str, rs.url)
            spider.log(msg, log.DEBUG)
        else:
            # No proxy available: fall back to direct access from our own IP.
            try:
                del rs.meta[u'proxy']
                msg = (u'use self ip asscess %s') % (rs.url)
                spider.log(msg, log.DEBUG)
            except KeyError:
                # FIX: was a bare `except:`; only a missing 'proxy' key is
                # expected here.
                pass
        return RetryMiddleware._retry(self, rs, reason, spider)
    # Retries exhausted: fall through returning None so the request is dropped.
class RetryTest(unittest.TestCase):
    """Exercise RetryMiddleware's response- and exception-driven retry paths."""

    def setUp(self):
        self.spider = BaseSpider()
        self.mw = RetryMiddleware()
        self.mw.max_retry_times = 2

    def test_priority_adjust(self):
        req = Request('http://www.scrapytest.org/503')
        rsp = Response('http://www.scrapytest.org/503', body='', status=503)
        req2 = self.mw.process_response(req, rsp, self.spider)
        # A retried request must be deprioritised relative to the original.
        assert req2.priority < req.priority

    def test_404(self):
        req = Request('http://www.scrapytest.org/404')
        rsp = Response('http://www.scrapytest.org/404', body='', status=404)
        # dont retry 404s
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_503(self):
        req = Request('http://www.scrapytest.org/503')
        rsp = Response('http://www.scrapytest.org/503', body='', status=503)
        # first retry
        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 1)
        # second retry
        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 2)
        # discard it
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_twistederrors(self):
        # Every connection-level twisted error should trigger the retry logic.
        for exc in (ServerTimeoutError, DNSLookupError, ConnectionRefusedError,
                    ConnectionDone, ConnectError, ConnectionLost):
            req = Request('http://www.scrapytest.org/%s' % exc.__name__)
            self._test_retry_exception(req, exc())

    def _test_retry_exception(self, req, exception):
        # first retry
        req = self.mw.process_exception(req, exception, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 1)
        # second retry
        req = self.mw.process_exception(req, exception, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 2)
        # discard it
        req = self.mw.process_exception(req, exception, self.spider)
        self.assertEqual(req, None)
def _retry(self, request, reason, spider):
    """Ask Tor for a fresh circuit (new exit IP) before the standard retry."""
    log.message('Changing proxy')
    controller = TorCtl.connect(passphrase="1234")
    controller.sendAndRecv('signal newnym\r\n')
    controller.close()
    # Give Tor a moment to finish building the new circuit.
    time.sleep(3)
    log.message("renewed")
    return RetryMiddleware._retry(self, request, reason, spider)
def _retry(self, request, reason, spider):
    """Rotate to the spider's next proxy before delegating the retry.

    If the retry budget still has room, fetch the next proxy from the
    spider (the proxy list travels in the request cookies) and attach it;
    otherwise fall through to the stock retry which will drop the request.
    """
    retries = request.meta.get('retry_times', 0)
    proxy = request.meta.get(u'proxy')
    if retries <= self.max_retry_times - 1:
        try:
            next_proxy = spider.get_next_proxy(request.cookies)
        except Exception:
            # FIX: the original message carried an unfilled %s placeholder
            # (no % application), so a literal '%s' was logged.
            msg = (u'there is no proxy list in cookies %s ,please check'
                   ) % (request.cookies,)
            spider.log(msg, log.WARNING)
            return RetryMiddleware._retry(self, request, reason, spider)
        if proxy:
            msg = (u'proxy %s fail, use %s for the %srd time '
                   'retry') % (proxy, next_proxy, retries)
        else:
            msg = (u'request without proxy , use %s for the %srd time '
                   'retry') % (next_proxy, retries)
        spider.log(msg, log.INFO)
        request.meta[u'proxy'] = next_proxy
    # Budget exhausted or proxy attached: let the base middleware decide.
    return RetryMiddleware._retry(self, request, reason, spider)
def _retry(self, request, reason, spider):
    """Force a new Tor identity via the local control port, then retry."""
    log.msg('Changing-proxy')
    session = telnetlib.Telnet('127.0.0.1', 9051)
    session.read_until("Escape character is '^]'.", 2)
    # Authenticate against the Tor control port before signalling.
    session.write('AUTHENTICATE "267765"\r\n')
    session.read_until("250 OK", 2)
    session.write("signal NEWNYM\r\n")
    session.read_until("250 OK", 2)
    session.write("quit\r\n")
    session.close()
    # Allow the new circuit to come up before the retried request fires.
    time.sleep(3)
    log.msg('Proxychanged')
    return RetryMiddleware._retry(self, request, reason, spider)
def _retry(self, request, reason, spider):
    """Rate-limited Tor identity rotation, then the standard retry.

    A single control connection and the timestamp of the last NEWNYM are
    kept as class attributes so every middleware instance shares them; a
    new identity is requested at most once per TOR_CHANGE_LIMIT seconds.
    """
    settings = spider.settings
    cls = RetryChangeProxyMiddleware
    if cls.conn is None:
        # Lazily open the shared Tor control connection on first use.
        cls.conn = TorCtl.connect(controlAddr=settings.get('TOR_HOST'),
                                  controlPort=settings.get('TOR_PORT'),
                                  passphrase=settings.get('TOR_PASSW'))
        cls.last = 0
        cls.timelimit = settings.get('TOR_CHANGE_LIMIT')
    # String reasons come from response-status retries; only those rotate.
    if isinstance(reason, basestring):
        now = time.time()
        elapsed = now - cls.last
        if cls.conn and elapsed > cls.timelimit:
            TorCtl.Connection.send_signal(cls.conn, "NEWNYM")
            cls.last = now
            log.msg('Proxy changed for reason %s. New last: %s'
                    % (reason, time.strftime("%H:%M:%S")), log.INFO)
    return RetryMiddleware._retry(self, request, reason, spider)
def __init__(self, settings):
    """Forward construction straight to the stock RetryMiddleware."""
    RetryMiddleware.__init__(self, settings)
def setUp(self):
    # Middleware under test, capped at two retries; plus a bare spider.
    self.mw = RetryMiddleware()
    self.mw.max_retry_times = 2
    self.spider = BaseSpider()
def _retry_proxy(self, request, reason, spider):
    """Switch to a fresh proxy (logged), then hand off to the stock retry."""
    change_proxy(log_msg=True)
    return RetryMiddleware._retry(self, request, reason, spider)
def setUp(self):
    # Build middleware and spider from a real crawler so settings apply.
    crawler = get_crawler(Spider)
    self.mw = RetryMiddleware.from_crawler(crawler)
    self.mw.max_retry_times = 2
    self.spider = crawler._create_spider('foo')
def setUp(self):
    # Named spider plus a two-retry middleware instance.
    self.mw = RetryMiddleware()
    self.mw.max_retry_times = 2
    self.spider = BaseSpider('foo')
def setUp(self):
    # Middleware built from crawler settings; standalone spider; two retries.
    crawler = get_crawler()
    self.mw = RetryMiddleware.from_crawler(crawler)
    self.mw.max_retry_times = 2
    self.spider = Spider("foo")
def _retry(self, request, exception, spider):
    """Refresh the Tor circuit on connection-refused errors, then retry."""
    refused = isinstance(exception, ConnectionRefusedError)
    if refused:
        # A refusal usually means the current exit node is blocked: rotate.
        TorManager.get_instance().refresh_circuit()
        time.sleep(3)
        log.msg('Connection refused and tor circuit refreshed')
    return RetryMiddleware._retry(self, request, exception, spider)
def process_response(self, request, response, spider):
    """Retry responses that show the site's 'busy' page; pass others through."""
    if busy(response.body_as_unicode()):
        # Busy page: force a retry, falling back to the response when the
        # retry budget is exhausted (_retry returns None in that case).
        reason = 'tora request failed'
        return self._retry(request, reason, spider) or response
    return RetryMiddleware.process_response(self, request, response, spider)
class RetryTest(unittest.TestCase):
    """Cover RetryMiddleware retries, dont_retry opt-out, and twisted errors."""

    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = RetryMiddleware()
        self.mw.max_retry_times = 2

    def test_priority_adjust(self):
        req = Request('http://www.scrapytest.org/503')
        rsp = Response('http://www.scrapytest.org/503', body='', status=503)
        req2 = self.mw.process_response(req, rsp, self.spider)
        # Retried requests must come back with lowered priority.
        assert req2.priority < req.priority

    def test_404(self):
        req = Request('http://www.scrapytest.org/404')
        rsp = Response('http://www.scrapytest.org/404', body='', status=404)
        # dont retry 404s
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_dont_retry(self):
        req = Request('http://www.scrapytest.org/503',
                      meta={'dont_retry': True})
        rsp = Response('http://www.scrapytest.org/503', body='', status=503)
        # dont_retry meta flag suppresses the retry entirely.
        r = self.mw.process_response(req, rsp, self.spider)
        assert r is rsp

    def test_dont_retry_exc(self):
        req = Request('http://www.scrapytest.org/503',
                      meta={'dont_retry': True})
        rsp = Response('http://www.scrapytest.org/503', body='', status=503)
        r = self.mw.process_exception(req, DNSLookupError(), self.spider)
        assert r is None

    def test_503(self):
        req = Request('http://www.scrapytest.org/503')
        rsp = Response('http://www.scrapytest.org/503', body='', status=503)
        # first retry
        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 1)
        # second retry
        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 2)
        # discard it
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_twistederrors(self):
        # Each connection-level twisted error must drive the retry cycle.
        for exc in (ServerTimeoutError, DNSLookupError, ConnectionRefusedError,
                    ConnectionDone, ConnectError, ConnectionLost):
            req = Request('http://www.scrapytest.org/%s' % exc.__name__)
            self._test_retry_exception(req, exc())

    def _test_retry_exception(self, req, exception):
        # first retry
        req = self.mw.process_exception(req, exception, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 1)
        # second retry
        req = self.mw.process_exception(req, exception, self.spider)
        assert isinstance(req, Request)
        self.assertEqual(req.meta['retry_times'], 2)
        # discard it
        req = self.mw.process_exception(req, exception, self.spider)
        self.assertEqual(req, None)
def process_response(self, request, response, spider):
    """Apply stock handling, but force a retry for bad 'tora' responses."""
    result = RetryMiddleware.process_response(self, request, response, spider)
    if spider.name != 'tora' or good(response):
        # Not our spider, or the page is fine: keep the stock outcome.
        return result
    # Bad tora page: retry, falling back to the response when retries run out.
    reason = 'tora request failed'
    return self._retry(request, reason, spider) or response
def setUp(self):
    # Crawler-configured middleware with a two-retry cap and a plain spider.
    crawler = get_crawler()
    self.mw = RetryMiddleware.from_crawler(crawler)
    self.mw.max_retry_times = 2
    self.spider = Spider('foo')