def _redirect(self, redirected, request, spider, reason):
    """Follow a redirect unless its target URL already belongs to another car.

    The car record attached to the request is read from
    ``request.cookies[FetchConstant.CarInfo]``.  If a different ``CarInfo``
    row (different ``seqid``) already has the redirect target as its
    ``sourceurl``, the attached record is deleted and the request is
    dropped.  Otherwise the redirect is delegated to
    ``RedirectMiddleware._redirect``.

    :param redirected: the request built for the redirect target
    :param request: the original request that received the redirect
    :param spider: the spider that issued the request (used for logging)
    :param reason: redirect reason passed through to the base middleware
    :returns: whatever ``RedirectMiddleware._redirect`` returns
    :raises IgnoreRequest: when the redirect target duplicates another car
    """
    redirect_url = redirected.url
    try:
        ci = request.cookies[FetchConstant.CarInfo]
        seqid = ci.seqid
    except (KeyError, TypeError, AttributeError):
        # No usable car record travels with this request, so there is
        # nothing to de-duplicate against: behave like the stock middleware
        # instead of failing later on an unbound ``seqid``.
        return RedirectMiddleware._redirect(
            self, redirected, request, spider, reason)

    fs = FetchSession()
    try:
        ci_exist = (
            fs.query(CarInfo)
            .filter(CarInfo.sourceurl == redirect_url)
            .filter(CarInfo.seqid != seqid)
            .first()
        )
        if ci_exist:
            spider.log(u'car with popular exist %s' % seqid, log.INFO)
            try:
                fs.delete(ci)
            except Exception:
                # Best effort: a failed delete must not abort the crawl.
                fs.rollback()
            else:
                spider.log(
                    u'delete car with popular exist %s ' % seqid, log.INFO)
                fs.commit()
    finally:
        # Always release the session, on every path (duplicate, normal
        # redirect, or query failure).
        fs.close()

    if ci_exist:
        raise IgnoreRequest
    return RedirectMiddleware._redirect(
        self, redirected, request, spider, reason)
def setUp(self):
    """Create the spider and the middleware instance used by each test."""
    self.spider = BaseSpider('foo')
    middleware = RedirectMiddleware()
    self.mw = middleware
class RedirectMiddlewareTest(unittest.TestCase):
    """Tests for ``RedirectMiddleware.process_response``.

    Covers HTTP 301/302 responses, HTML meta-refresh pages, the
    ``dont_redirect`` meta key and the redirect count / TTL limits.
    """

    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = RedirectMiddleware()

    def _check_ignored_without_location(self, request, response):
        """Drop the Location header and expect the response back untouched."""
        # A 3XX response that carries no Location header must be ignored.
        del response.headers['Location']
        assert self.mw.process_response(request, response, self.spider) is response

    def _check_payload_dropped(self, redirected):
        """Assert the redirected request carries no body or body headers."""
        assert 'Content-Type' not in redirected.headers, \
            "Content-Type header must not be present in redirected request"
        assert 'Content-Length' not in redirected.headers, \
            "Content-Length header must not be present in redirected request"
        assert not redirected.body, \
            "Redirected body must be empty, not '%s'" % redirected.body

    def test_priority_adjust(self):
        origin = Request('http://a.com')
        reply = Response('http://a.com',
                         headers={'Location': 'http://a.com/redirected'},
                         status=301)
        followup = self.mw.process_response(origin, reply, self.spider)
        assert followup.priority > origin.priority

    def test_redirect_301(self):
        for method in ('GET', 'POST', 'HEAD'):
            source_url = 'http://www.example.com/301'
            target_url = 'http://www.example.com/redirected'
            origin = Request(source_url, method=method)
            reply = Response(source_url, headers={'Location': target_url},
                             status=301)

            followup = self.mw.process_response(origin, reply, self.spider)
            assert isinstance(followup, Request)
            self.assertEqual(followup.url, target_url)
            self.assertEqual(followup.method, method)

            self._check_ignored_without_location(origin, reply)

    def test_dont_redirect(self):
        source_url = 'http://www.example.com/301'
        target_url = 'http://www.example.com/redirected'
        origin = Request(source_url, meta={'dont_redirect': True})
        reply = Response(source_url, headers={'Location': target_url},
                         status=301)

        result = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(result, Response)
        assert result is reply

    def test_redirect_302(self):
        source_url = 'http://www.example.com/302'
        target_url = 'http://www.example.com/redirected2'
        post_headers = {'Content-Type': 'text/plain', 'Content-length': '4'}
        origin = Request(source_url, method='POST', body='test',
                         headers=post_headers)
        reply = Response(source_url, headers={'Location': target_url},
                         status=302)

        followup = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(followup, Request)
        self.assertEqual(followup.url, target_url)
        self.assertEqual(followup.method, 'GET')
        self._check_payload_dropped(followup)

        self._check_ignored_without_location(origin, reply)

    def test_redirect_302_head(self):
        source_url = 'http://www.example.com/302'
        target_url = 'http://www.example.com/redirected2'
        origin = Request(source_url, method='HEAD')
        reply = Response(source_url, headers={'Location': target_url},
                         status=302)

        followup = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(followup, Request)
        self.assertEqual(followup.url, target_url)
        self.assertEqual(followup.method, 'HEAD')

        self._check_ignored_without_location(origin, reply)

    def test_meta_refresh(self):
        page = """<html> <head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head> </html>"""
        origin = Request(url='http://example.org')
        reply = HtmlResponse(url='http://example.org', body=page)

        followup = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(followup, Request)
        self.assertEqual(followup.url, 'http://example.org/newpage')

    def test_meta_refresh_with_high_interval(self):
        # A long refresh interval must not be treated as a redirect.
        page = """<html> <head><meta http-equiv="refresh" content="1000;url=http://example.org/newpage" /></head> </html>"""
        origin = Request(url='http://example.org')
        reply = HtmlResponse(url='http://example.org', body=page)

        result = self.mw.process_response(origin, reply, self.spider)
        assert reply is result

    def test_meta_refresh_trough_posted_request(self):
        page = """<html> <head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head> </html>"""
        post_headers = {'Content-Type': 'text/plain', 'Content-length': '4'}
        origin = Request(url='http://example.org', method='POST', body='test',
                         headers=post_headers)
        reply = HtmlResponse(url='http://example.org', body=page)

        followup = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(followup, Request)
        self.assertEqual(followup.url, 'http://example.org/newpage')
        self.assertEqual(followup.method, 'GET')
        self._check_payload_dropped(followup)

    def test_max_redirect_times(self):
        self.mw.max_redirect_times = 1
        origin = Request('http://scrapytest.org/302')
        reply = Response('http://scrapytest.org/302',
                         headers={'Location': '/redirected'}, status=302)

        hop = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(hop, Request)
        assert 'redirect_times' in hop.meta
        self.assertEqual(hop.meta['redirect_times'], 1)
        # A second redirect exceeds the limit of one.
        self.assertRaises(IgnoreRequest, self.mw.process_response,
                          hop, reply, self.spider)

    def test_ttl(self):
        self.mw.max_redirect_times = 100
        origin = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
        reply = Response('http://www.scrapytest.org/302',
                         headers={'Location': '/redirected'}, status=302)

        hop = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(hop, Request)
        # The TTL of one is used up by the first hop.
        self.assertRaises(IgnoreRequest, self.mw.process_response,
                          hop, reply, self.spider)
def setUp(self):
    """Build the spider and a crawler-configured middleware for each test."""
    test_crawler = get_crawler()
    self.spider = BaseSpider('foo')
    self.mw = RedirectMiddleware.from_crawler(test_crawler)
def setUp(self):
    """Build the spider and a crawler-configured middleware for each test."""
    test_crawler = get_crawler()
    self.spider = BaseSpider("foo")
    self.mw = RedirectMiddleware.from_crawler(test_crawler)
def setUp(self):
    """Build the spider and a crawler-configured middleware for each test."""
    test_crawler = get_crawler()
    self.spider = Spider('foo')
    self.mw = RedirectMiddleware.from_crawler(test_crawler)
def setUp(self):
    """Create a crawler, then derive the spider and middleware from it."""
    test_crawler = get_crawler(Spider)
    self.spider = test_crawler._create_spider('foo')
    self.mw = RedirectMiddleware.from_crawler(test_crawler)
class RedirectMiddlewareTest(unittest.TestCase):
    """Tests for ``RedirectMiddleware.process_response``.

    Covers HTTP 301/302 responses, HTML meta-refresh pages, the
    ``dont_redirect`` meta key, the redirect count / TTL limits and the
    ``redirect_urls`` history kept in the request meta.
    """

    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = RedirectMiddleware()

    def _check_ignored_without_location(self, request, response):
        """Drop the Location header and expect the response back untouched."""
        # A 3XX response that carries no Location header must be ignored.
        del response.headers['Location']
        assert self.mw.process_response(request, response, self.spider) is response

    def _check_payload_dropped(self, redirected):
        """Assert the redirected request carries no body or body headers."""
        assert 'Content-Type' not in redirected.headers, \
            "Content-Type header must not be present in redirected request"
        assert 'Content-Length' not in redirected.headers, \
            "Content-Length header must not be present in redirected request"
        assert not redirected.body, \
            "Redirected body must be empty, not '%s'" % redirected.body

    def test_priority_adjust(self):
        origin = Request('http://a.com')
        reply = Response('http://a.com',
                         headers={'Location': 'http://a.com/redirected'},
                         status=301)
        followup = self.mw.process_response(origin, reply, self.spider)
        assert followup.priority > origin.priority

    def test_redirect_301(self):
        for method in ('GET', 'POST', 'HEAD'):
            source_url = 'http://www.example.com/301'
            target_url = 'http://www.example.com/redirected'
            origin = Request(source_url, method=method)
            reply = Response(source_url, headers={'Location': target_url},
                             status=301)

            followup = self.mw.process_response(origin, reply, self.spider)
            assert isinstance(followup, Request)
            self.assertEqual(followup.url, target_url)
            self.assertEqual(followup.method, method)

            self._check_ignored_without_location(origin, reply)

    def test_dont_redirect(self):
        source_url = 'http://www.example.com/301'
        target_url = 'http://www.example.com/redirected'
        origin = Request(source_url, meta={'dont_redirect': True})
        reply = Response(source_url, headers={'Location': target_url},
                         status=301)

        result = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(result, Response)
        assert result is reply

    def test_redirect_302(self):
        source_url = 'http://www.example.com/302'
        target_url = 'http://www.example.com/redirected2'
        post_headers = {'Content-Type': 'text/plain', 'Content-length': '4'}
        origin = Request(source_url, method='POST', body='test',
                         headers=post_headers)
        reply = Response(source_url, headers={'Location': target_url},
                         status=302)

        followup = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(followup, Request)
        self.assertEqual(followup.url, target_url)
        self.assertEqual(followup.method, 'GET')
        self._check_payload_dropped(followup)

        self._check_ignored_without_location(origin, reply)

    def test_redirect_302_head(self):
        source_url = 'http://www.example.com/302'
        target_url = 'http://www.example.com/redirected2'
        origin = Request(source_url, method='HEAD')
        reply = Response(source_url, headers={'Location': target_url},
                         status=302)

        followup = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(followup, Request)
        self.assertEqual(followup.url, target_url)
        self.assertEqual(followup.method, 'HEAD')

        self._check_ignored_without_location(origin, reply)

    def test_meta_refresh(self):
        page = """<html> <head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head> </html>"""
        origin = Request(url='http://example.org')
        reply = HtmlResponse(url='http://example.org', body=page)

        followup = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(followup, Request)
        self.assertEqual(followup.url, 'http://example.org/newpage')

    def test_meta_refresh_with_high_interval(self):
        # A long refresh interval must not be treated as a redirect.
        page = """<html> <head><meta http-equiv="refresh" content="1000;url=http://example.org/newpage" /></head> </html>"""
        origin = Request(url='http://example.org')
        reply = HtmlResponse(url='http://example.org', body=page)

        result = self.mw.process_response(origin, reply, self.spider)
        assert reply is result

    def test_meta_refresh_trough_posted_request(self):
        page = """<html> <head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head> </html>"""
        post_headers = {'Content-Type': 'text/plain', 'Content-length': '4'}
        origin = Request(url='http://example.org', method='POST', body='test',
                         headers=post_headers)
        reply = HtmlResponse(url='http://example.org', body=page)

        followup = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(followup, Request)
        self.assertEqual(followup.url, 'http://example.org/newpage')
        self.assertEqual(followup.method, 'GET')
        self._check_payload_dropped(followup)

    def test_max_redirect_times(self):
        self.mw.max_redirect_times = 1
        origin = Request('http://scrapytest.org/302')
        reply = Response('http://scrapytest.org/302',
                         headers={'Location': '/redirected'}, status=302)

        hop = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(hop, Request)
        assert 'redirect_times' in hop.meta
        self.assertEqual(hop.meta['redirect_times'], 1)
        # A second redirect exceeds the limit of one.
        self.assertRaises(IgnoreRequest, self.mw.process_response,
                          hop, reply, self.spider)

    def test_ttl(self):
        self.mw.max_redirect_times = 100
        origin = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
        reply = Response('http://www.scrapytest.org/302',
                         headers={'Location': '/redirected'}, status=302)

        hop = self.mw.process_response(origin, reply, self.spider)
        assert isinstance(hop, Request)
        # The TTL of one is used up by the first hop.
        self.assertRaises(IgnoreRequest, self.mw.process_response,
                          hop, reply, self.spider)

    def test_redirect_urls(self):
        first_request = Request('http://scrapytest.org/first')
        first_reply = Response('http://scrapytest.org/first',
                               headers={'Location': '/redirected'}, status=302)
        second_request = self.mw.process_response(
            first_request, first_reply, self.spider)

        second_reply = Response('http://scrapytest.org/redirected',
                                headers={'Location': '/redirected2'},
                                status=302)
        third_request = self.mw.process_response(
            second_request, second_reply, self.spider)

        # Each hop records the URLs visited before it, in order.
        self.assertEqual(second_request.url, 'http://scrapytest.org/redirected')
        self.assertEqual(second_request.meta['redirect_urls'],
                         ['http://scrapytest.org/first'])
        self.assertEqual(third_request.url, 'http://scrapytest.org/redirected2')
        self.assertEqual(third_request.meta['redirect_urls'], [
            'http://scrapytest.org/first',
            'http://scrapytest.org/redirected',
        ])