예제 #1
0
    def _redirect(self, redirected, request, spider, reason):
        """Drop redirects whose target URL is already stored for another car.

        The car record found in ``request.cookies`` under
        ``FetchConstant.CarInfo`` is compared with the stored records: if a
        record with a different ``seqid`` already has ``redirected.url`` as
        its ``sourceurl``, the current record is deleted and the request is
        ignored.  Otherwise the redirect is delegated unchanged to
        ``RedirectMiddleware._redirect``.

        Args:
            redirected: the follow-up request built for the redirect target;
                only its ``url`` is read here.
            request: the request that produced the redirect.
            spider: the running spider, used for logging.
            reason: passed through untouched to the parent implementation.

        Returns:
            Whatever ``RedirectMiddleware._redirect`` returns when no
            duplicate record is found.

        Raises:
            IgnoreRequest: a record with the same source URL exists under a
                different seqid.
        """
        try:
            ci = request.cookies[FetchConstant.CarInfo]
            seqid = ci.seqid
        except (KeyError, TypeError, AttributeError):
            # No usable car info travels with this request, so there is
            # nothing to de-duplicate.  The old code swallowed the error and
            # then crashed on the unbound ``seqid``; fall back to the stock
            # redirect handling instead.
            return RedirectMiddleware._redirect(self, redirected, request, spider, reason)

        redirect_url = redirected.url

        # NOTE(review): FetchSession looks like a SQLAlchemy session factory
        # (query/delete/commit/rollback/close) -- confirm.
        fs = FetchSession()
        try:
            ci_exist = (fs.query(CarInfo)
                        .filter(CarInfo.sourceurl == redirect_url)
                        .filter(CarInfo.seqid != seqid)
                        .first())
            if ci_exist:
                spider.log(u'car with popular exist %s' % seqid, log.INFO)
                try:
                    fs.delete(ci)
                except Exception:
                    # Best effort: a failed delete is rolled back and the
                    # request is still ignored below.
                    fs.rollback()
                else:
                    spider.log(u'delete car with popular exist %s ' % seqid, log.INFO)
                    fs.commit()
        finally:
            # Release the session on every path, including the plain
            # redirect path and query failures, which used to leak it.
            fs.close()

        if ci_exist:
            raise IgnoreRequest
        return RedirectMiddleware._redirect(self, redirected, request, spider, reason)
 def setUp(self):
     """Create the spider named 'foo' and the middleware used by the tests."""
     spider = BaseSpider('foo')
     middleware = RedirectMiddleware()
     self.spider, self.mw = spider, middleware
class RedirectMiddlewareTest(unittest.TestCase):

    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = RedirectMiddleware()

    def test_priority_adjust(self):
        req = Request('http://a.com')
        rsp = Response('http://a.com', headers={'Location': 'http://a.com/redirected'}, status=301)
        req2 = self.mw.process_response(req, rsp, self.spider)
        assert req2.priority > req.priority

    def test_redirect_301(self):
        def _test(method):
            url = 'http://www.example.com/301'
            url2 = 'http://www.example.com/redirected'
            req = Request(url, method=method)
            rsp = Response(url, headers={'Location': url2}, status=301)

            req2 = self.mw.process_response(req, rsp, self.spider)
            assert isinstance(req2, Request)
            self.assertEqual(req2.url, url2)
            self.assertEqual(req2.method, method)

            # response without Location header but with status code is 3XX should be ignored
            del rsp.headers['Location']
            assert self.mw.process_response(req, rsp, self.spider) is rsp

        _test('GET')
        _test('POST')
        _test('HEAD')

    def test_dont_redirect(self):
        url = 'http://www.example.com/301'
        url2 = 'http://www.example.com/redirected'
        req = Request(url, meta={'dont_redirect': True})
        rsp = Response(url, headers={'Location': url2}, status=301)

        r = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(r, Response)
        assert r is rsp

    def test_redirect_302(self):
        url = 'http://www.example.com/302'
        url2 = 'http://www.example.com/redirected2'
        req = Request(url, method='POST', body='test', 
            headers={'Content-Type': 'text/plain', 'Content-length': '4'})
        rsp = Response(url, headers={'Location': url2}, status=302)

        req2 = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req2, Request)
        self.assertEqual(req2.url, url2)
        self.assertEqual(req2.method, 'GET')
        assert 'Content-Type' not in req2.headers, \
            "Content-Type header must not be present in redirected request"
        assert 'Content-Length' not in req2.headers, \
            "Content-Length header must not be present in redirected request"
        assert not req2.body, \
            "Redirected body must be empty, not '%s'" % req2.body

        # response without Location header but with status code is 3XX should be ignored
        del rsp.headers['Location']
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_redirect_302_head(self):
        url = 'http://www.example.com/302'
        url2 = 'http://www.example.com/redirected2'
        req = Request(url, method='HEAD')
        rsp = Response(url, headers={'Location': url2}, status=302)

        req2 = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req2, Request)
        self.assertEqual(req2.url, url2)
        self.assertEqual(req2.method, 'HEAD')

        # response without Location header but with status code is 3XX should be ignored
        del rsp.headers['Location']
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_meta_refresh(self):
        body = """<html>
            <head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            </html>"""
        req = Request(url='http://example.org')
        rsp = HtmlResponse(url='http://example.org', body=body)
        req2 = self.mw.process_response(req, rsp, self.spider)

        assert isinstance(req2, Request)
        self.assertEqual(req2.url, 'http://example.org/newpage')

    def test_meta_refresh_with_high_interval(self):
        # meta-refresh with high intervals don't trigger redirects
        body = """<html>
            <head><meta http-equiv="refresh" content="1000;url=http://example.org/newpage" /></head>
            </html>"""
        req = Request(url='http://example.org')
        rsp = HtmlResponse(url='http://example.org', body=body)
        rsp2 = self.mw.process_response(req, rsp, self.spider)

        assert rsp is rsp2

    def test_meta_refresh_trough_posted_request(self):
        body = """<html>
            <head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            </html>"""
        req = Request(url='http://example.org', method='POST', body='test',
            headers={'Content-Type': 'text/plain', 'Content-length': '4'})
        rsp = HtmlResponse(url='http://example.org', body=body)
        req2 = self.mw.process_response(req, rsp, self.spider)

        assert isinstance(req2, Request)
        self.assertEqual(req2.url, 'http://example.org/newpage')
        self.assertEqual(req2.method, 'GET')
        assert 'Content-Type' not in req2.headers, \
            "Content-Type header must not be present in redirected request"
        assert 'Content-Length' not in req2.headers, \
            "Content-Length header must not be present in redirected request"
        assert not req2.body, \
            "Redirected body must be empty, not '%s'" % req2.body

    def test_max_redirect_times(self):
        self.mw.max_redirect_times = 1
        req = Request('http://scrapytest.org/302')
        rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302)

        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        assert 'redirect_times' in req.meta
        self.assertEqual(req.meta['redirect_times'], 1)
        self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)

    def test_ttl(self):
        self.mw.max_redirect_times = 100
        req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
        rsp = Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}, status=302)

        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
 def setUp(self):
     """Build the test spider and a middleware created from a test crawler."""
     test_crawler = get_crawler()
     spider = BaseSpider('foo')
     middleware = RedirectMiddleware.from_crawler(test_crawler)
     self.spider, self.mw = spider, middleware
 def setUp(self):
     """Prepare the fixtures: spider "foo" plus a crawler-built middleware."""
     fixture_crawler = get_crawler()
     foo_spider = BaseSpider("foo")
     redirect_mw = RedirectMiddleware.from_crawler(fixture_crawler)
     self.spider, self.mw = foo_spider, redirect_mw
 def setUp(self):
     """Set up a Spider named 'foo' and a middleware built via from_crawler."""
     test_crawler = get_crawler()
     spider = Spider('foo')
     middleware = RedirectMiddleware.from_crawler(test_crawler)
     self.spider, self.mw = spider, middleware
 def setUp(self):
     """Create the spider through the crawler, then the middleware from that same crawler."""
     test_crawler = get_crawler(Spider)
     spider = test_crawler._create_spider('foo')
     middleware = RedirectMiddleware.from_crawler(test_crawler)
     self.spider, self.mw = spider, middleware
 def setUp(self):
     """Fixtures: a crawler for Spider, its 'foo' spider and the redirect middleware."""
     spider_crawler = get_crawler(Spider)
     foo_spider = spider_crawler._create_spider('foo')
     redirect_mw = RedirectMiddleware.from_crawler(spider_crawler)
     self.spider, self.mw = foo_spider, redirect_mw
 def setUp(self):
     """Instantiate the 'foo' spider and a default RedirectMiddleware."""
     foo_spider = BaseSpider('foo')
     redirect_mw = RedirectMiddleware()
     self.spider, self.mw = foo_spider, redirect_mw
class RedirectMiddlewareTest(unittest.TestCase):
    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = RedirectMiddleware()

    def test_priority_adjust(self):
        req = Request('http://a.com')
        rsp = Response('http://a.com',
                       headers={'Location': 'http://a.com/redirected'},
                       status=301)
        req2 = self.mw.process_response(req, rsp, self.spider)
        assert req2.priority > req.priority

    def test_redirect_301(self):
        def _test(method):
            url = 'http://www.example.com/301'
            url2 = 'http://www.example.com/redirected'
            req = Request(url, method=method)
            rsp = Response(url, headers={'Location': url2}, status=301)

            req2 = self.mw.process_response(req, rsp, self.spider)
            assert isinstance(req2, Request)
            self.assertEqual(req2.url, url2)
            self.assertEqual(req2.method, method)

            # response without Location header but with status code is 3XX should be ignored
            del rsp.headers['Location']
            assert self.mw.process_response(req, rsp, self.spider) is rsp

        _test('GET')
        _test('POST')
        _test('HEAD')

    def test_dont_redirect(self):
        url = 'http://www.example.com/301'
        url2 = 'http://www.example.com/redirected'
        req = Request(url, meta={'dont_redirect': True})
        rsp = Response(url, headers={'Location': url2}, status=301)

        r = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(r, Response)
        assert r is rsp

    def test_redirect_302(self):
        url = 'http://www.example.com/302'
        url2 = 'http://www.example.com/redirected2'
        req = Request(url,
                      method='POST',
                      body='test',
                      headers={
                          'Content-Type': 'text/plain',
                          'Content-length': '4'
                      })
        rsp = Response(url, headers={'Location': url2}, status=302)

        req2 = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req2, Request)
        self.assertEqual(req2.url, url2)
        self.assertEqual(req2.method, 'GET')
        assert 'Content-Type' not in req2.headers, \
            "Content-Type header must not be present in redirected request"
        assert 'Content-Length' not in req2.headers, \
            "Content-Length header must not be present in redirected request"
        assert not req2.body, \
            "Redirected body must be empty, not '%s'" % req2.body

        # response without Location header but with status code is 3XX should be ignored
        del rsp.headers['Location']
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_redirect_302_head(self):
        url = 'http://www.example.com/302'
        url2 = 'http://www.example.com/redirected2'
        req = Request(url, method='HEAD')
        rsp = Response(url, headers={'Location': url2}, status=302)

        req2 = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req2, Request)
        self.assertEqual(req2.url, url2)
        self.assertEqual(req2.method, 'HEAD')

        # response without Location header but with status code is 3XX should be ignored
        del rsp.headers['Location']
        assert self.mw.process_response(req, rsp, self.spider) is rsp

    def test_meta_refresh(self):
        body = """<html>
            <head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            </html>"""
        req = Request(url='http://example.org')
        rsp = HtmlResponse(url='http://example.org', body=body)
        req2 = self.mw.process_response(req, rsp, self.spider)

        assert isinstance(req2, Request)
        self.assertEqual(req2.url, 'http://example.org/newpage')

    def test_meta_refresh_with_high_interval(self):
        # meta-refresh with high intervals don't trigger redirects
        body = """<html>
            <head><meta http-equiv="refresh" content="1000;url=http://example.org/newpage" /></head>
            </html>"""
        req = Request(url='http://example.org')
        rsp = HtmlResponse(url='http://example.org', body=body)
        rsp2 = self.mw.process_response(req, rsp, self.spider)

        assert rsp is rsp2

    def test_meta_refresh_trough_posted_request(self):
        body = """<html>
            <head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            </html>"""
        req = Request(url='http://example.org',
                      method='POST',
                      body='test',
                      headers={
                          'Content-Type': 'text/plain',
                          'Content-length': '4'
                      })
        rsp = HtmlResponse(url='http://example.org', body=body)
        req2 = self.mw.process_response(req, rsp, self.spider)

        assert isinstance(req2, Request)
        self.assertEqual(req2.url, 'http://example.org/newpage')
        self.assertEqual(req2.method, 'GET')
        assert 'Content-Type' not in req2.headers, \
            "Content-Type header must not be present in redirected request"
        assert 'Content-Length' not in req2.headers, \
            "Content-Length header must not be present in redirected request"
        assert not req2.body, \
            "Redirected body must be empty, not '%s'" % req2.body

    def test_max_redirect_times(self):
        self.mw.max_redirect_times = 1
        req = Request('http://scrapytest.org/302')
        rsp = Response('http://scrapytest.org/302',
                       headers={'Location': '/redirected'},
                       status=302)

        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        assert 'redirect_times' in req.meta
        self.assertEqual(req.meta['redirect_times'], 1)
        self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp,
                          self.spider)

    def test_ttl(self):
        self.mw.max_redirect_times = 100
        req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
        rsp = Response('http://www.scrapytest.org/302',
                       headers={'Location': '/redirected'},
                       status=302)

        req = self.mw.process_response(req, rsp, self.spider)
        assert isinstance(req, Request)
        self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp,
                          self.spider)

    def test_redirect_urls(self):
        req1 = Request('http://scrapytest.org/first')
        rsp1 = Response('http://scrapytest.org/first',
                        headers={'Location': '/redirected'},
                        status=302)
        req2 = self.mw.process_response(req1, rsp1, self.spider)
        rsp2 = Response('http://scrapytest.org/redirected',
                        headers={'Location': '/redirected2'},
                        status=302)
        req3 = self.mw.process_response(req2, rsp2, self.spider)

        self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
        self.assertEqual(req2.meta['redirect_urls'],
                         ['http://scrapytest.org/first'])
        self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
        self.assertEqual(req3.meta['redirect_urls'], [
            'http://scrapytest.org/first', 'http://scrapytest.org/redirected'
        ])