示例#1
0
 def test_request_cacheability(self):
     """Exercise request-side ``Cache-Control`` handling in the cache.

     A ``no-store`` request must leave nothing in storage, a plain request
     must populate the cache, and a ``no-cache`` request must bypass the
     cached copy while still letting the fresh response be stored.
     """
     fresh = Response(self.request.url, status=200,
                      headers={'Expires': self.tomorrow})
     plain = Request('http://example.com')
     no_store = plain.replace(headers={'Cache-Control': 'no-store'})
     no_cache = plain.replace(headers={'Cache-Control': 'no-cache'})
     with self._middleware() as mw:
         # no-store: the response passes through but is never persisted
         passed = self._process_requestresponse(mw, no_store, fresh)
         self.assertEqualResponse(passed, fresh)
         self.assertIsNone(mw.storage.retrieve_response(no_store))

         # same URL without no-store: first pass is not flagged as cached ...
         first = self._process_requestresponse(mw, plain, fresh)
         self.assertNotIn('cached', first.flags)
         # ... and the next request is answered from the cache
         hit = mw.process_request(plain)
         self.assertIn('cached', hit.flags)
         self.assertEqualResponse(first, hit)

         # no-cache: the cached copy must not be returned,
         # yet the new response is allowed to replace it in storage
         updated = fresh.replace(body='foo')
         bypassed = self._process_requestresponse(mw, no_cache, updated)
         self.assertEqualResponse(bypassed, updated)
         self.assertNotIn('cached', bypassed.flags)
         replay = self._process_requestresponse(mw, plain, None)
         self.assertEqualResponse(replay, updated)
         self.assertIn('cached', replay.flags)
示例#2
0
    def test_complex_cookies(self):
        """Cookies passed as dicts are only sent to matching paths/domains.

        Four cookies are merged into the jar via one request; follow-up
        requests to different paths of ``test.org`` must carry only the
        cookies whose path matches.
        """
        # merge some cookies into jar
        cookies = [
            {'name': 'C1', 'value': 'value1', 'path': '/foo', 'domain': 'test.org'},
            {'name': 'C2', 'value': 'value2', 'path': '/bar', 'domain': 'test.org'},
            {'name': 'C3', 'value': 'value3', 'path': '/foo', 'domain': 'test.org'},
            {'name': 'C4', 'value': 'value4', 'path': '/foo', 'domain': 't.org'},
        ]
        req = Request('http://test.org/', cookies=cookies)
        self.mw.process_request(req)

        # embed C1 and C3 for test.org/foo (either order is accepted)
        req = Request('http://test.org/foo')
        self.mw.process_request(req)
        self.assertIn(req.headers.get('Cookie'),
                      ('C1=value1; C3=value3', 'C3=value3; C1=value1'))

        # embed C2 for test.org/bar
        req = Request('http://test.org/bar')
        self.mw.process_request(req)
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(req.headers.get('Cookie'), 'C2=value2')

        # embed nothing for test.org/baz
        req = Request('http://test.org/baz')
        self.mw.process_request(req)
        self.assertNotIn('Cookie', req.headers)
示例#3
0
    def test_copy(self):
        """``Request.copy`` must reproduce every attribute of the original.

        Callbacks are expected to be shared by identity, while the mutable
        containers (headers, meta, history) must be equal but distinct
        objects.
        """
        def somecallback():
            pass

        r1 = Request('http://www.example.com', callback=somecallback,
            errback=somecallback, method='post', headers={'hello': 'world'},
            params={'a': 'b'}, body='blablabla', meta={'c': 'd'}, proxy='123',
            priority=10, history=[1, 2, 3], encoding='latin1')
        r2 = r1.copy()

        self.assertIs(r1.callback, somecallback)
        self.assertIs(r1.errback, somecallback)
        self.assertIs(r2.callback, r1.callback)
        # Compare the copy against the original; the previous
        # r2-vs-r2 comparison was a tautology and could never fail.
        self.assertIs(r2.errback, r1.errback)

        self.assertEqual(r1.url, r2.url)
        self.assertEqual(r1.method, r2.method)
        self.assertIsNot(r1.headers, r2.headers)
        self.assertDictEqual(r1.headers, r2.headers)
        self.assertIsNot(r1.meta, r2.meta)
        self.assertDictEqual(r1.meta, r2.meta)
        self.assertIsNot(r1.history, r2.history)
        self.assertListEqual(r1.history, r2.history)
        self.assertEqual(r1.body, r2.body)
        self.assertEqual(r1.proxy, r2.proxy)
        self.assertEqual(r1.priority, r2.priority)
        self.assertEqual(r1.encoding, r2.encoding)
示例#4
0
 def test_prepare_method(self):
     """``_prepare_method`` upper-cases the sample names and returns ``str``."""
     req = Request(url=gh_url)
     # method used when none is passed to the constructor
     self.assertEqual(req.method, 'GET')

     samples = (
         ('gEt', 'GET'),
         ('post', 'POST'),
         ('f.adsf/dsaf,', 'F.ADSF/DSAF,'),
     )
     for raw, expected in samples:
         self.assertEqual(req._prepare_method(raw), expected)

     # unicode input must still come back as a byte string
     self.assertIsInstance(req._prepare_method(u'get'), str)
 def test_get_handler(self):
     """``_get_handler`` maps file/http URLs to handlers and rejects https.

     The https request is expected to raise ``NotSupported``.
     """
     file_request = Request('file:///etc/fstab')
     self.assertIsInstance(self.handler._get_handler(file_request),
                           FileDownloadHandler)

     http_request = Request('http://www.github.com/')
     self.assertIsInstance(self.handler._get_handler(http_request),
                           HttpDownloadHandler)

     https_request = Request('https://www.githib.com/')
     self.assertRaises(NotSupported, self.handler._get_handler,
                       https_request)
    def test_bad_scheme(self):
        """With ``FILTER_SCHEMES=['mailto']`` a mailto request is rejected.

        An http request passes through unchanged; the mailto request must
        raise ``FilterError``.
        """
        engine = self._get_engine(FILTER_SCHEMES=['mailto'])
        mw = Filter(engine)

        accepted = Request('http://a.b/')
        self.assertIs(accepted, mw.process_request(accepted))

        rejected = Request('mailto:[email protected]?subject=News')
        self.assertRaises(FilterError, mw.process_request, rejected)
示例#7
0
 def _process_site_url(self, url):
     """Build a request for ``url`` from the first matching rule in ``_cbs``.

     Returns whatever ``process_site_request`` returns for that request
     (with callback and errback wrapped when it is truthy), or ``None``
     when no rule matches.
     """
     for pattern, handler in self._cbs:
         if not pattern.search(url):
             continue
         request = self.process_site_request(Request(url, callback=handler))
         if request:
             # route both outcomes through the site-level wrappers
             request.callback = partial(self._site_request_callback,
                                        request.callback)
             request.errback = partial(self._site_request_errback,
                                       request.errback)
         return request
    def test_url_length_limit(self):
        """With ``FILTER_URL_LENGTH_LIMIT=11`` only URLs up to 11 chars pass.

        ``'http://a.b/'`` (11 characters) is returned unchanged, while
        ``'http://a.bc/'`` (12 characters) must raise ``FilterError``.
        """
        engine = self._get_engine(FILTER_URL_LENGTH_LIMIT=11)
        mw = Filter(engine)

        within_limit = Request('http://a.b/')
        self.assertIs(within_limit, mw.process_request(within_limit))

        over_limit = Request('http://a.bc/')
        self.assertRaises(FilterError, mw.process_request, over_limit)
 def test_timeout_download_from_spider(self):
     """Downloads with a 0.2 ``DOWNLOAD_TIMEOUT`` in meta must time out.

     NOTE(review): the ``yield`` statements imply a generator-driving
     decorator that is not visible in this chunk -- confirm.
     """
     meta = {'DOWNLOAD_TIMEOUT': 0.2}
     # 'wait': client connects but no data is received
     # 'hang-after-headers': server sends headers and some body bytes,
     # then hangs
     for resource in ('wait', 'hang-after-headers'):
         request = Request(self.getURL(resource), meta=meta)
         d = self.download_request(request)
         yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
示例#10
0
 def _process_site_url(self, url):
     """Create the request for ``url`` using the first rule that matches it.

     The request is passed through ``process_site_request``; when the
     result is truthy its callback and errback are wrapped with the
     site-level handlers. The result is returned as-is, and ``None`` is
     returned when no rule in ``self._cbs`` matches.
     """
     for rule, cb in self._cbs:
         matched = rule.search(url)
         if matched:
             candidate = Request(url, callback=cb)
             processed = self.process_site_request(candidate)
             if processed:
                 wrapped_cb = partial(self._site_request_callback,
                                      processed.callback)
                 processed.callback = wrapped_cb
                 wrapped_eb = partial(self._site_request_errback,
                                      processed.errback)
                 processed.errback = wrapped_eb
             return processed
     return None
示例#11
0
    def test_basic(self):
        """A ``Set-Cookie`` from a response is replayed on a later request."""
        headers = {'Set-Cookie': 'C1=value1; path=/'}

        # first request: nothing in the jar yet, so no Cookie header
        req = Request('http://test.org/')
        self.assertIs(req, self.mw.process_request(req))
        self.assertNotIn('Cookie', req.headers)

        # the response passes through unchanged while its cookie is recorded
        res = Response('http://test.org/', request=req, headers=headers)
        self.assertIs(res, self.mw.process_response(res))

        # a later request under the same host carries the stored cookie
        req2 = Request('http://test.org/sub1/')
        self.assertIs(req2, self.mw.process_request(req2))
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(req2.headers.get('Cookie'), 'C1=value1')
示例#12
0
    def test_merge_request_cookies(self):
        """Request cookies and ``Set-Cookie`` cookies end up merged.

        A cookie supplied on the first request and one set by the response
        must both be sent with the following request.
        """
        req = Request('http://test.org/', cookies={'galleta': 'salada'})
        self.assertIs(self.mw.process_request(req), req)
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(req.headers.get('Cookie'), 'galleta=salada')

        headers = {'Set-Cookie': 'C1=value1; path=/'}
        res = Response('http://test.org/', request=req, headers=headers)
        self.assertIs(self.mw.process_response(res), res)

        req2 = Request('http://test.org/sub1/')
        self.assertIs(self.mw.process_request(req2), req2)
        self.assertEqual(req2.headers.get('Cookie'),
                         'C1=value1; galleta=salada')
示例#13
0
 def test_request(self):
     """A response exposes its request's meta, history and original_url.

     Without a request, reading any of those attributes must raise
     ``AttributeError`` matching ``_no_request_error``.
     """
     req = Request(url='http://github.com', meta={'a': 'b'})
     req.history = ['a', 'b']
     proxied = ('meta', 'history', 'original_url')

     bound = Response(url='', request=req)
     self.assertIs(bound.request, req)
     for name in proxied:
         self.assertIs(getattr(bound, name), getattr(req, name))

     unbound = Response(url='')
     from crawlmi.http.response.response import _no_request_error
     for name in proxied:
         self.assertRaisesRegexp(AttributeError, _no_request_error,
                                 getattr, unbound, name)
示例#14
0
    def test_replace(self):
        """``replace`` yields a new request with the given fields changed.

        The original must stay untouched, and falsy replacement values
        (empty body, empty meta) must be applied rather than ignored.
        """
        original = Request('http://www.example.com', method='GET')
        new_headers = Headers(dict(original.headers, key='value'))
        changed = original.replace(method='POST', body='New body',
                                   headers=new_headers)
        self.assertEqual(original.url, changed.url)
        self.assertEqual((original.method, changed.method), ('GET', 'POST'))
        self.assertEqual((original.body, changed.body), ('', 'New body'))
        self.assertEqual((original.headers, changed.headers),
                         (Headers(), new_headers))

        with_meta = Request('http://www.example.com', meta={'a': 1})
        emptied = with_meta.replace(url='http://www.example.com/2',
                                    body='', meta={})
        self.assertEqual(emptied.url, 'http://www.example.com/2')
        self.assertEqual(emptied.body, '')
        self.assertEqual(emptied.meta, {})
示例#15
0
    def test_process_reqeust_restart(self):
        """``RestartPipeline`` restarts request processing with the new request.

        M1 raises for the old request, so the expected call trace is M1
        (old), then M1 and M2 again for the replacement request.
        """
        old_request = Request('http://gh.com/')
        new_request = Request('http://new.com/')

        def restart_on_old(request):
            # only the original request triggers the restart
            if request is old_request:
                raise RestartPipeline(new_request)
            return request

        first = self._build('M1', preq=restart_on_old)
        second = self._build('M2', preq=True)
        pm = self._get_pm(first, second)

        self.assertIs(pm.process_request(old_request), new_request)
        self.assertListEqual(self.mws, ['M1', 'M1', 'M2'])
    def test_process_request(self):
        """A repeated URL is dropped until the duplicate filter is cleared."""
        engine = get_engine()
        mw = DuplicateFilter(engine)

        first = Request('http://test.org/1')
        second = Request('http://test.org/2')
        second_again = Request('http://test.org/2')

        # unseen URLs pass through; the repeated URL yields None
        for fresh in (first, second):
            self.assertIs(mw.process_request(fresh), fresh)
        self.assertIsNone(mw.process_request(second_again))

        # after the clear signal the same request is accepted again
        engine.signals.send(clear_duplicate_filter)
        self.assertIs(mw.process_request(second_again), second_again)
示例#17
0
 def test_404(self):
     """A 404 response is returned by the middleware unchanged."""
     url = 'http://www.scrapytest.org/404'
     request = Request(url)
     response = Response(url, body='', status=404, request=request)
     self.assertIs(self.mw.process_response(response), response)
 def test_request(self):
     """Responses delegate meta/history/original_url to their request.

     When no request is attached, each of those attributes must raise
     ``AttributeError`` matching ``_no_request_error``.
     """
     request = Request(url='http://github.com', meta={'a': 'b'})
     request.history = ['a', 'b']

     attached = Response(url='', request=request)
     self.assertIs(attached.request, request)
     self.assertIs(attached.meta, request.meta)
     self.assertIs(attached.history, request.history)
     self.assertIs(attached.original_url, request.original_url)

     detached = Response(url='')
     from crawlmi.http.response.response import _no_request_error

     def read_meta():
         return detached.meta

     def read_history():
         return detached.history

     def read_original_url():
         return detached.original_url

     for reader in (read_meta, read_history, read_original_url):
         self.assertRaisesRegexp(AttributeError, _no_request_error, reader)
    def setUp(self):
        """Create the stats middleware plus a request/response pair."""
        engine = get_engine()
        # keep a handle on the engine's stats next to the middleware
        self.stats = engine.stats
        self.mw = DownloaderStats(engine)

        request = Request('http://github.com')
        self.req = request
        self.resp = Response('scrapytest.org', status=400, request=request)
 def test_payload(self):
     """A POST body sent to the 'payload' resource comes back unchanged.

     Returns the download deferred so the test runner waits for it.
     """
     body = '1' * 100  # PayloadResource requires body length to be 100
     request = Request(self.getURL('payload'), method='POST', body=body)
     d = self.download_request(request)
     d.addCallback(lambda r: r.body)
     # assertEqual replaces the deprecated assertEquals alias
     d.addCallback(self.assertEqual, body)
     return d
    def test_host_header_seted_in_request_headers(self):
        """An explicit ``Host`` header is sent and left on the request.

        The 'host' resource's response body must equal the supplied value.
        """
        request = Request(self.getURL('host'), headers={'Host': 'example.com'})

        def _test(response):
            # assertEqual replaces the deprecated assertEquals alias
            self.assertEqual(response.body, 'example.com')
            self.assertEqual(request.headers.get('Host'), 'example.com')

        return self.download_request(request).addCallback(_test)
    def test_host_header_not_in_request_headers(self):
        """Without an explicit ``Host`` header the request stays header-free.

        The 'host' resource must answer with the local address and port,
        and the download must not add headers to the request object.
        """
        request = Request(self.getURL('host'))

        def _test(response):
            # assertEqual replaces the deprecated assertEquals alias
            self.assertEqual(response.body, '127.0.0.1:%d' % self.portno)
            self.assertEqual(request.headers, {})

        return self.download_request(request).addCallback(_test)
    def test_redirect_302(self):
        """A 302 on a POST becomes a body-less GET to the ``Location`` URL.

        The content headers must be dropped from the redirected request,
        and a 302 without ``Location`` must be passed through untouched.
        """
        url = 'http://www.example.com/302'
        url2 = 'http://www.example.com/redirected2'
        post_headers = {
            'Content-Type': 'text/plain',
            'Content-length': '4'
        }
        req = Request(url, method='POST', body='test', headers=post_headers)
        resp = Response(url, headers={'Location': url2}, status=302,
                        request=req)

        redirected = self.mw.process_response(resp)
        self.assertIsInstance(redirected, Request)
        self.assertEqual(redirected.url, url2)
        self.assertEqual(redirected.method, 'GET')
        self.assertNotIn(
            'Content-Type', redirected.headers,
            'Content-Type header must not be present in redirected request')
        self.assertNotIn(
            'Content-Length', redirected.headers,
            'Content-Length header must not be present in redirected request')
        self.assertEqual(
            redirected.body, '',
            'Redirected body must be empty, not `%s`' % redirected.body)

        # a 3XX response without a Location header should be ignored
        del resp.headers['Location']
        self.assertIs(self.mw.process_response(resp), resp)
示例#24
0
 def test_req_or_resp(self):
     """``settings.get`` reads values from a request's or response's meta."""
     expected = (('a', 10), ('x', 'y'))

     req = Request('http://github.com/', meta={'a': 10, 'x': 'y'})
     for key, value in expected:
         self.assertEqual(self.settings.get(key, req_or_resp=req), value)

     # a response bound to that request must resolve to the same values
     resp = Response('', request=req)
     for key, value in expected:
         self.assertEqual(self.settings.get(key, req_or_resp=resp), value)
示例#25
0
    def test_download(self):
        """Exercise ``engine.download`` for each kind of pipeline result.

        Returns the deferred of the final download, issued right before
        the engine is stopped.
        """
        self.engine.start()
        del self.sp.received[:]
        request = Request('http://github.com/')

        # default pipeline: the request is signalled and queued
        self.engine.download(request)
        self.clock.advance(0)
        self.check_signals([signals.request_received])
        self.assertEqual(len(self.engine.request_queue), 1)

        # pipeline returns None: the request queue does not grow
        self.pipeline.req = lambda _request: None
        self.engine.download(request)
        self.clock.advance(0)
        self.assertEqual(len(self.engine.request_queue), 1)

        # pipeline returns a response: it lands in the response queue
        self.pipeline.req = lambda _request: Response('')
        self.engine.download(request)
        self.clock.advance(0)
        self.assertEqual(len(self.engine.response_queue), 1)

        # download and stop
        self.pipeline.req = lambda _request: Response('')
        pending = self.engine.download(request)
        self.engine.stop('finished')
        self.clock.advance(0)
        return pending
 def test_update_headers(self):
     """Request headers are kept and take precedence over the defaults."""
     custom = {'Accept-Language': ['es'], 'Test-Header': ['test']}
     request = Request('http://github.com/', headers=custom)
     self.assertDictEqual(request.headers, custom)

     processed = self.dh.process_request(request)
     # expected result: the defaults overridden by the request's headers
     self.defaults.update(custom)
     self.assertDictEqual(processed.headers, self.defaults)
示例#27
0
 def setUp(self):
     """Reset the call logs and build request/response/failure fixtures."""
     self.mws = []
     self.actions = []

     url = 'http://gh.com/'
     self.req = Request(url)
     self.resp = Response(url, request=self.req)

     # the failure carries the request it belongs to
     failure = Failure(Exception())
     self.fail = failure
     failure.request = self.req
 def test_nothing(self):
     """An empty HTML page passes through with no ``canonical_url`` in meta."""
     html = '<html><head></head><body></body></html>'
     request = Request('http://a.com')
     response = HtmlResponse(request.url, body=html, request=request)

     processed = self.mw.process_response(response)
     self.assertIs(response, processed)
     self.assertNotIn('canonical_url', response.meta)
 def test_priority_adjust(self):
     """The request produced for a 301 redirect gets a higher priority."""
     req = Request('http://a.com')
     resp = Response('http://a.com',
                     headers={'Location': 'http://a.com/redirected'},
                     status=301,
                     request=req)
     req2 = self.mw.process_response(resp)
     # assertGreater survives ``python -O`` (a bare assert is stripped)
     # and reports both priorities when the check fails.
     self.assertGreater(req2.priority, req.priority)
 def test_meta_refresh_with_high_interval(self):
     """A meta-refresh with interval 1000 must not trigger a redirect."""
     url = 'http://example.org'
     request = Request(url=url)
     page = self._body(interval=1000)
     response = HtmlResponse(url=url, body=page, request=request)

     # the response is handed back untouched instead of a new request
     self.assertIs(response, self.mw.process_response(response))
示例#31
0
 def _clientfactory(*args, **kwargs):
     """Build a ``CrawlmiHTPPClientFactory`` whose deferred yields the body.

     ``timeout`` and ``download_size`` (both defaulting to 0) are removed
     from ``kwargs`` and given to the factory; all remaining arguments
     are forwarded to ``Request``.
     """
     factory_options = {
         'timeout': kwargs.pop('timeout', 0),
         'download_size': kwargs.pop('download_size', 0),
     }
     request = Request(*args, **kwargs)
     factory = CrawlmiHTPPClientFactory(request, **factory_options)
     # callers only care about the response body
     factory.deferred.addCallback(lambda response: response.body)
     return factory
    def test_tags(self):
        """Duplicates are tracked per ``df_tag`` and can be cleared per tag.

        The same URL under two different tags is accepted once per tag;
        clearing tag '2' re-admits only requests carrying that tag.
        """
        engine = get_engine()
        mw = DuplicateFilter(engine)

        url = 'http://test.org/'
        tag1 = Request(url, meta={'df_tag': '1'})
        tag2 = Request(url, meta={'df_tag': '2'})
        tag2_dup = Request(url, meta={'df_tag': '2'})

        self.assertIs(mw.process_request(tag1), tag1)
        self.assertIs(mw.process_request(tag2), tag2)
        self.assertIsNone(mw.process_request(tag2_dup))

        # forget only what was seen under tag '2'
        engine.signals.send(clear_duplicate_filter, df_tag='2')

        self.assertIsNone(mw.process_request(tag1))
        self.assertIs(mw.process_request(tag2), tag2)
        self.assertIsNone(mw.process_request(tag2_dup))
    def test_download_without_proxy(self):
        """A plain download returns status 200, the same URL and the path body."""
        request = Request(self.getURL('path/to/resource'))

        def _test(response):
            # assertEqual replaces the deprecated assertEquals alias
            self.assertEqual(response.status, 200)
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.body, '/path/to/resource')

        return self.download_request(request).addCallback(_test)
示例#34
0
    def run(self, args, options):
        """Download the single URL argument with ``open_in_browser`` as callback.

        Raises ``UsageError`` unless exactly one positional argument is
        given. The argument is normalised with ``any_to_uri`` before the
        download is scheduled and the process is started.
        """
        if len(args) != 1:
            raise UsageError()

        target = any_to_uri(args[0])
        self.engine.download(Request(target, callback=open_in_browser))
        self.process.start()
示例#35
0
 def test_priority_adjust(self):
     """The request returned for a 503 response gets a lower priority."""
     req = Request('http://www.scrapytest.org/503')
     rsp = Response('http://www.scrapytest.org/503',
                    body='',
                    status=503,
                    request=req)
     req2 = self.mw.process_response(rsp)
     # assertLess reports both priorities on failure, unlike assertTrue(a < b)
     self.assertLess(req2.priority, req.priority)
示例#36
0
    def test_encode_params(self):
        """Check ``_encode_params`` for strings, mappings and pair lists.

        Covers the accepted input shapes, the request encoding being
        applied to unicode input, rejected inputs, and percent-quoting.
        """
        utf8_req = Request(url=gh_url)
        latin1_req = Request(url=gh_url, encoding='latin1')
        encode = utf8_req._encode_params

        # test interface
        self.assertEqual(encode('mimino'), 'mimino')
        self.assertEqual(encode(u'mi\xa3no'), 'mi\xc2\xa3no')
        self.assertEqual(latin1_req._encode_params(u'mi\xa3no'), 'mi\xa3no')
        self.assertEqual(encode({'hello': 'world'}), 'hello=world')
        # dict ordering is not guaranteed, so accept both serialisations
        self.assertIn(encode({'a': 'b', 'c': 'd'}), ['a=b&c=d', 'c=d&a=b'])
        self.assertEqual(encode([('a', 'b'), ('c', 'd')]), 'a=b&c=d')
        self.assertEqual(encode([('a', ''), ('c', '10')]), 'a=&c=10')

        # missing or unsupported arguments must raise
        self.assertRaises(Exception, encode)
        self.assertRaises(Exception, encode, 10)
        self.assertRaises(Exception, encode, ['hello', 'world'])

        # test quoting
        unicode_and_bytes = [('a', u'mi\xa3no'), ('b', 'mi\xc2\xa3no')]
        self.assertEqual(encode(unicode_and_bytes),
                         'a=mi%C2%A3no&b=mi%C2%A3no')
        reserved_chars = {'! #$%&\'()*+,': '/:;=?@[]~'}
        self.assertEqual(
            encode(reserved_chars),
            '%21+%23%24%25%26%27%28%29%2A%2B%2C=%2F%3A%3B%3D%3F%40%5B%5D%7E')
示例#37
0
    def test_prepare_body(self):
        """``_prepare_body`` returns byte strings in the request's encoding.

        Unicode input is encoded (UTF-8 for the default request, latin1
        when requested) and a non-string body is converted to its string
        form.
        """
        utf8_req = Request(url=gh_url)
        latin1_req = Request(url=gh_url, encoding='latin1')
        price = u'Price: \xa3100'

        self.assertEqual(utf8_req._prepare_body(''), '')

        encoded = utf8_req._prepare_body(price)
        self.assertIsInstance(encoded, str)
        self.assertEqual(encoded, 'Price: \xc2\xa3100')

        self.assertEqual(latin1_req._prepare_body(price), 'Price: \xa3100')
        self.assertEqual(utf8_req._prepare_body(10), '10')
示例#38
0
    def test_request_fingerprint(self):
        """Check ``request_fingerprint`` equality rules and its cache.

        Covers query-argument ordering, differing queries, the fingerprint
        cache, headers being ignored unless listed in ``include_headers``,
        method/body sensitivity, and a new fingerprint after ``replace``.
        """
        # query argument order must not matter
        r1 = Request('http://www.example.com/query?id=111&cat=222')
        r2 = Request('http://www.example.com/query?cat=222&id=111')
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r1))
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))

        r1 = Request('http://www.example.com/hnnoticiaj1.aspx?78132,199')
        r2 = Request('http://www.example.com/hnnoticiaj1.aspx?78160,199')
        self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))

        # make sure caching is working
        self.assertEqual(request_fingerprint(r1), _fingerprint_cache[r1][None])

        # headers are ignored by default
        r1 = Request('http://www.example.com/members/offers.html')
        r2 = Request('http://www.example.com/members/offers.html')
        r2.headers['SESSIONID'] = 'somehash'
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))

        r1 = Request('http://www.example.com/')
        r2 = Request('http://www.example.com/')
        r2.headers['Accept-Language'] = 'en'
        r3 = Request('http://www.example.com/')
        r3.headers['Accept-Language'] = 'en'
        r3.headers['SESSIONID'] = 'somehash'

        # Compare pairwise: assertEqual's third positional argument is the
        # failure message, so the former three-argument call never
        # actually checked r3's fingerprint.
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r2))
        self.assertEqual(request_fingerprint(r1), request_fingerprint(r3))

        self.assertEqual(request_fingerprint(r1),
                         request_fingerprint(r1, include_headers=['Accept-Language']))

        self.assertNotEqual(request_fingerprint(r1),
            request_fingerprint(r2, include_headers=['Accept-Language']))

        # header names and their order in include_headers must not matter
        self.assertEqual(request_fingerprint(r3, include_headers=['accept-language', 'sessionid']),
                         request_fingerprint(r3, include_headers=['SESSIONID', 'Accept-Language']))

        r1 = Request('http://www.example.com')
        r2 = Request('http://www.example.com', method='POST')
        r3 = Request('http://www.example.com', method='POST', body='request body')

        self.assertNotEqual(request_fingerprint(r1), request_fingerprint(r2))
        self.assertNotEqual(request_fingerprint(r2), request_fingerprint(r3))

        # cached fingerprint must be cleared on request copy
        r1 = Request('http://www.example.com')
        fp1 = request_fingerprint(r1)
        r2 = r1.replace(url='http://www.example.com/other')
        fp2 = request_fingerprint(r2)
        self.assertNotEqual(fp1, fp2)