def test_complex_cookies(self):
    """Cookies merged into the jar are echoed back only on matching domain+path."""
    # merge some cookies into jar
    cookies = [
        {'name': 'C1', 'value': 'value1', 'path': '/foo', 'domain': 'test.org'},
        {'name': 'C2', 'value': 'value2', 'path': '/bar', 'domain': 'test.org'},
        {'name': 'C3', 'value': 'value3', 'path': '/foo', 'domain': 'test.org'},
        {'name': 'C4', 'value': 'value4', 'path': '/foo', 'domain': 't.org'}]
    req = Request('http://test.org/', cookies=cookies)
    self.mw.process_request(req)

    # embed C1 and C3 for test.org/foo (cookie order is unspecified)
    req = Request('http://test.org/foo')
    self.mw.process_request(req)
    self.assertIn(req.headers.get('Cookie'),
                  ('C1=value1; C3=value3', 'C3=value3; C1=value1'))

    # embed C2 for test.org/bar
    req = Request('http://test.org/bar')
    self.mw.process_request(req)
    # assertEqual: `assertEquals` is a deprecated unittest alias
    self.assertEqual(req.headers.get('Cookie'), 'C2=value2')

    # embed nothing for test.org/baz
    req = Request('http://test.org/baz')
    self.mw.process_request(req)
    self.assertNotIn('Cookie', req.headers)
def test_get_handler(self):
    """_get_handler dispatches on URL scheme and rejects unsupported schemes."""
    file_handler = self.handler._get_handler(Request('file:///etc/fstab'))
    self.assertIsInstance(file_handler, FileDownloadHandler)
    http_handler = self.handler._get_handler(Request('http://www.github.com/'))
    self.assertIsInstance(http_handler, HttpDownloadHandler)
    # https has no registered handler here
    self.assertRaises(NotSupported, self.handler._get_handler,
                      Request('https://www.githib.com/'))
def test_url_length_limit(self):
    """URLs longer than FILTER_URL_LENGTH_LIMIT raise FilterError."""
    mw = Filter(self._get_engine(FILTER_URL_LENGTH_LIMIT=11))
    # within the limit: request passes through unchanged
    within_limit = Request('http://a.b/')
    self.assertIs(within_limit, mw.process_request(within_limit))
    # one character over the limit: rejected
    over_limit = Request('http://a.bc/')
    self.assertRaises(FilterError, mw.process_request, over_limit)
def test_bad_scheme(self):
    """Requests whose scheme is listed in FILTER_SCHEMES raise FilterError."""
    mw = Filter(self._get_engine(FILTER_SCHEMES=['mailto']))
    # http is not filtered: request passes through unchanged
    allowed = Request('http://a.b/')
    self.assertIs(allowed, mw.process_request(allowed))
    # mailto is filtered out
    blocked = Request('mailto:[email protected]?subject=News')
    self.assertRaises(FilterError, mw.process_request, blocked)
def test_timeout_download_from_spider(self):
    """Per-request DOWNLOAD_TIMEOUT meta aborts stalled downloads."""
    meta = {'DOWNLOAD_TIMEOUT': 0.2}
    # 'wait': client connects but no data is received;
    # 'hang-after-headers': server sends headers and some body bytes but hangs.
    for path in ('wait', 'hang-after-headers'):
        d = self.download_request(Request(self.getURL(path), meta=meta))
        yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
def test_merge_request_cookies(self):
    """Cookies passed on a request merge with cookies set by later responses."""
    req = Request('http://test.org/', cookies={'galleta': 'salada'})
    self.assertIs(self.mw.process_request(req), req)
    # assertEqual: `assertEquals` is a deprecated unittest alias
    self.assertEqual(req.headers.get('Cookie'), 'galleta=salada')

    # server sets an additional cookie for the whole domain
    headers = {'Set-Cookie': 'C1=value1; path=/'}
    res = Response('http://test.org/', request=req, headers=headers)
    self.assertIs(self.mw.process_response(res), res)

    # a later request carries both cookies
    req2 = Request('http://test.org/sub1/')
    self.assertIs(self.mw.process_request(req2), req2)
    self.assertEqual(req2.headers.get('Cookie'), 'C1=value1; galleta=salada')
def test_basic(self):
    """A Set-Cookie from one response is attached to subsequent requests."""
    headers = {'Set-Cookie': 'C1=value1; path=/'}
    req = Request('http://test.org/')
    self.assertIs(req, self.mw.process_request(req))
    # nothing in the jar yet
    self.assertNotIn('Cookie', req.headers)

    res = Response('http://test.org/', request=req, headers=headers)
    self.assertIs(res, self.mw.process_response(res))

    req2 = Request('http://test.org/sub1/')
    self.assertIs(req2, self.mw.process_request(req2))
    # assertEqual: `assertEquals` is a deprecated unittest alias
    self.assertEqual(req2.headers.get('Cookie'), 'C1=value1')
def test_process_request(self):
    """Requests with an already-seen URL are dropped until the filter is cleared."""
    engine = get_engine()
    mw = DuplicateFilter(engine)
    first = Request('http://test.org/1')
    second = Request('http://test.org/2')
    duplicate = Request('http://test.org/2')
    self.assertIs(mw.process_request(first), first)
    self.assertIs(mw.process_request(second), second)
    # same URL as `second` -> filtered out
    self.assertIsNone(mw.process_request(duplicate))
    # clearing the filter lets the duplicate through again
    engine.signals.send(clear_duplicate_filter)
    self.assertIs(mw.process_request(duplicate), duplicate)
def test_process_request_restart(self):
    """RestartPipeline raised by a middleware restarts processing with the new request.

    Note: the original test name contained a typo (`reqeust`); test methods
    are discovered by prefix, so renaming is safe.
    """
    old_request = Request('http://gh.com/')
    new_request = Request('http://new.com/')

    def preq(r):
        # first pass: swap in the new request and restart the pipeline
        if r is old_request:
            raise RestartPipeline(new_request)
        return r

    pm = self._get_pm(self._build('M1', preq=preq),
                      self._build('M2', preq=True))
    result = pm.process_request(old_request)
    self.assertIs(result, new_request)
    # M1 ran twice (before and after restart), M2 only once
    self.assertListEqual(self.mws, ['M1', 'M1', 'M2'])
def test_host_header_set_in_request_headers(self):
    """An explicit Host header overrides the host derived from the URL.

    Note: the original name contained a typo (`seted`); test methods are
    discovered by prefix, so renaming is safe.
    """
    def _test(response):
        # assertEqual: `assertEquals` is a deprecated unittest alias
        self.assertEqual(response.body, 'example.com')
        self.assertEqual(request.headers.get('Host'), 'example.com')

    request = Request(self.getURL('host'), headers={'Host': 'example.com'})
    return self.download_request(request).addCallback(_test)
def test_host_header_not_in_request_headers(self):
    """Without an explicit Host header, the server sees host:port from the URL."""
    def _test(response):
        # assertEqual: `assertEquals` is a deprecated unittest alias
        self.assertEqual(response.body, '127.0.0.1:%d' % self.portno)
        self.assertEqual(request.headers, {})

    request = Request(self.getURL('host'))
    return self.download_request(request).addCallback(_test)
def test_redirect_302(self):
    """A 302 to a POST turns into a GET with body and content headers dropped."""
    url = 'http://www.example.com/302'
    target = 'http://www.example.com/redirected2'
    original = Request(url, method='POST', body='test', headers={
        'Content-Type': 'text/plain',
        'Content-length': '4'})
    response = Response(url, headers={'Location': target}, status=302,
                        request=original)

    redirected = self.mw.process_response(response)
    self.assertIsInstance(redirected, Request)
    self.assertEqual(redirected.url, target)
    self.assertEqual(redirected.method, 'GET')
    self.assertNotIn(
        'Content-Type', redirected.headers,
        'Content-Type header must not be present in redirected request')
    self.assertNotIn(
        'Content-Length', redirected.headers,
        'Content-Length header must not be present in redirected request')
    self.assertEqual(redirected.body, '',
                     'Redirected body must be empty, not `%s`' % redirected.body)

    # response without Location header but with status code is 3XX should be ignored
    del response.headers['Location']
    self.assertIs(self.mw.process_response(response), response)
def test_payload(self):
    """A POST body is transmitted to the server unchanged."""
    body = '1' * 100  # PayloadResource requires body length to be 100
    request = Request(self.getURL('payload'), method='POST', body=body)
    d = self.download_request(request)
    d.addCallback(lambda r: r.body)
    # assertEqual: `assertEquals` is a deprecated unittest alias
    d.addCallback(self.assertEqual, body)
    return d
def test_req_or_resp(self):
    """Settings lookups via `req_or_resp` read meta from a request or a response."""
    req = Request('http://github.com/', meta={'a': 10, 'x': 'y'})
    resp = Response('', request=req)
    # both objects expose the same meta values
    for source in (req, resp):
        self.assertEqual(self.settings.get('a', req_or_resp=source), 10)
        self.assertEqual(self.settings.get('x', req_or_resp=source), 'y')
def test_download(self):
    # Exercises engine.download() end-to-end:
    #   1. a plain download is queued and emits `request_received`;
    #   2. a pipeline returning None drops the request (queue unchanged);
    #   3. a pipeline returning a Response short-circuits into the response queue;
    #   4. a download followed by stop() still resolves its deferred.
    # clock.advance(0) flushes reactor callbacks scheduled for "now".
    self.engine.start()
    del self.sp.received[:]
    req = Request('http://github.com/')
    self.engine.download(req)
    self.clock.advance(0)
    self.check_signals([signals.request_received])
    self.assertEqual(len(self.engine.request_queue), 1)
    # pipeline None
    self.pipeline.req = lambda req: None
    self.engine.download(req)
    self.clock.advance(0)
    self.assertEqual(len(self.engine.request_queue), 1)
    # pipeline response
    self.pipeline.req = lambda req: Response('')
    self.engine.download(req)
    self.clock.advance(0)
    self.assertEqual(len(self.engine.response_queue), 1)
    # download and stop
    self.pipeline.req = lambda req: Response('')
    d = self.engine.download(req)
    self.engine.stop('finished')
    self.clock.advance(0)
    return d
def test_404(self):
    """A 404 response passes through the middleware untouched."""
    request = Request('http://www.scrapytest.org/404')
    response = Response('http://www.scrapytest.org/404', body='',
                        status=404, request=request)
    self.assertIs(self.mw.process_response(response), response)
def test_request_cacheability(self):
    # Verifies Cache-Control request directives:
    #   no-store -> the response must never be written to cache storage;
    #   no-cache -> a cached copy must not be served, but the fresh
    #               response may still be stored for later requests.
    res0 = Response(self.request.url, status=200,
                    headers={'Expires': self.tomorrow})
    req0 = Request('http://example.com')
    req1 = req0.replace(headers={'Cache-Control': 'no-store'})
    req2 = req0.replace(headers={'Cache-Control': 'no-cache'})
    with self._middleware() as mw:
        # response for a request with no-store must not be cached
        res1 = self._process_requestresponse(mw, req1, res0)
        self.assertEqualResponse(res1, res0)
        self.assertIsNone(mw.storage.retrieve_response(req1))
        # Re-do request without no-store and expect it to be cached
        res2 = self._process_requestresponse(mw, req0, res0)
        self.assertNotIn('cached', res2.flags)
        res3 = mw.process_request(req0)
        self.assertIn('cached', res3.flags)
        self.assertEqualResponse(res2, res3)
        # request with no-cache directive must not return cached response
        # but it allows new response to be stored
        res0b = res0.replace(body='foo')
        res4 = self._process_requestresponse(mw, req2, res0b)
        self.assertEqualResponse(res4, res0b)
        self.assertNotIn('cached', res4.flags)
        # subsequent plain request is served the response stored by no-cache
        res5 = self._process_requestresponse(mw, req0, None)
        self.assertEqualResponse(res5, res0b)
        self.assertIn('cached', res5.flags)
def test_update_headers(self):
    """Default headers are merged in without clobbering explicit ones."""
    custom = {'Accept-Language': ['es'], 'Test-Header': ['test']}
    req = Request('http://github.com/', headers=custom)
    self.assertDictEqual(req.headers, custom)
    req = self.dh.process_request(req)
    # expected result: defaults overridden by the request's own headers
    expected = self.defaults
    expected.update(custom)
    self.assertDictEqual(req.headers, expected)
def setUp(self):
    """Prepare the request/response/failure fixtures shared by the tests."""
    self.mws = []
    self.actions = []
    self.req = Request('http://gh.com/')
    self.resp = Response('http://gh.com/', request=self.req)
    # a generic failure tied to the same request
    self.fail = Failure(Exception())
    self.fail.request = self.req
def test_nothing(self):
    """Pages without a canonical link leave response meta untouched."""
    empty_page = '''<html><head></head><body></body></html>'''
    request = Request('http://a.com')
    response = HtmlResponse(request.url, body=empty_page, request=request)
    self.assertIs(response, self.mw.process_response(response))
    self.assertNotIn('canonical_url', response.meta)
def setUp(self):
    """Wire a DownloaderStats middleware to a fresh engine plus sample traffic."""
    engine = get_engine()
    self.stats = engine.stats
    self.mw = DownloaderStats(engine)
    # one request and one error response to drive the stats
    self.req = Request('http://github.com')
    self.resp = Response('scrapytest.org', status=400, request=self.req)
def test_priority_adjust(self):
    """A redirected request is rescheduled with a higher priority."""
    req = Request('http://a.com')
    resp = Response('http://a.com',
                    headers={'Location': 'http://a.com/redirected'},
                    status=301, request=req)
    req2 = self.mw.process_response(resp)
    # assertGreater reports both values on failure, unlike a bare assert
    self.assertGreater(req2.priority, req.priority)
def test_meta_refresh_with_high_interval(self):
    """Meta-refresh tags with a very high interval must not trigger a redirect."""
    request = Request(url='http://example.org')
    response = HtmlResponse(url='http://example.org',
                            body=self._body(interval=1000),
                            request=request)
    # response is passed through unchanged
    self.assertIs(response, self.mw.process_response(response))
def _clientfactory(*args, **kwargs):
    """Build a client factory whose deferred fires with the response body only.

    `timeout` and `download_size` are popped from kwargs; everything else is
    forwarded to Request().
    """
    timeout = kwargs.pop('timeout', 0)
    download_size = kwargs.pop('download_size', 0)
    factory = CrawlmiHTPPClientFactory(Request(*args, **kwargs),
                                       timeout=timeout,
                                       download_size=download_size)
    factory.deferred.addCallback(lambda response: response.body)
    return factory
def test_tags(self):
    """Duplicate filtering and clearing are scoped by the `df_tag` meta key."""
    engine = get_engine()
    mw = DuplicateFilter(engine)
    tagged1 = Request('http://test.org/', meta={'df_tag': '1'})
    tagged2 = Request('http://test.org/', meta={'df_tag': '2'})
    tagged2_dup = Request('http://test.org/', meta={'df_tag': '2'})
    # same URL, different tags: both first requests pass
    self.assertIs(mw.process_request(tagged1), tagged1)
    self.assertIs(mw.process_request(tagged2), tagged2)
    self.assertIsNone(mw.process_request(tagged2_dup))
    # clearing only tag '2' keeps tag '1' entries filtered
    engine.signals.send(clear_duplicate_filter, df_tag='2')
    self.assertIsNone(mw.process_request(tagged1))
    self.assertIs(mw.process_request(tagged2), tagged2)
    self.assertIsNone(mw.process_request(tagged2_dup))
def test_priority_adjust(self):
    """A retried 503 request is rescheduled with a lower priority."""
    req = Request('http://www.scrapytest.org/503')
    rsp = Response('http://www.scrapytest.org/503', body='', status=503,
                   request=req)
    req2 = self.mw.process_response(rsp)
    # assertLess reports both values on failure, unlike assertTrue(a < b)
    self.assertLess(req2.priority, req.priority)
def run(self, args, options):
    """Download the single URL given in `args` and open the result in a browser.

    Raises UsageError unless exactly one argument is supplied.
    """
    if len(args) != 1:
        raise UsageError()
    request = Request(any_to_uri(args[0]), callback=open_in_browser)
    self.engine.download(request)
    self.process.start()
def test_download_without_proxy(self):
    """A plain (proxy-less) download returns the requested resource directly."""
    def _test(response):
        # assertEqual: `assertEquals` is a deprecated unittest alias
        self.assertEqual(response.status, 200)
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.body, '/path/to/resource')

    request = Request(self.getURL('path/to/resource'))
    return self.download_request(request).addCallback(_test)
def test_tag(self):
    """A canonical <link> is extracted into response meta, absolutized if needed."""
    template = '''<html><head><link rel="canonical" href="%s" /></head></html>'''

    # absolute url
    request = Request('http://a.com/pom')
    response = HtmlResponse(request.url, body=template % 'https://b.sk/hello',
                            request=request)
    self.assertIs(response, self.mw.process_response(response))
    self.assertEqual(response.meta['canonical_url'], 'https://b.sk/hello')

    # relative url
    request = Request('http://a.com/pom')
    response = HtmlResponse(request.url, body=template % '/hello/world',
                            request=request)
    self.assertIs(response, self.mw.process_response(response))
    self.assertEqual(response.meta['canonical_url'], 'http://a.com/hello/world')
def test_get_slot(self):
    """Slots are keyed per hostname, or collapse to a single shared slot
    when domain-specific slots are disabled."""
    key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
    self.assertEqual(key, 'www.github.com')

    # same host (path/fragment ignored) -> same slot
    same_key, same_slot = self.dwn._get_slot(
        Request('http://www.github.com/hello/world#bla'))
    self.assertEqual(same_key, 'www.github.com')
    self.assertIs(same_slot, slot)

    # different host -> its own slot
    other_key, other_slot = self.dwn._get_slot(
        Request('http://sites.github.com/'))
    self.assertEqual(other_key, 'sites.github.com')
    self.assertIsNot(other_slot, slot)
    self.assertEqual(len(self.dwn.slots), 2)

    # don't use domain specific slots
    self.dwn.use_domain_specific = False
    key, slot = self.dwn._get_slot(Request('http://www.github.com/'))
    self.assertEqual(key, '')
    shared_key, shared_slot = self.dwn._get_slot(
        Request('http://sites.github.com/'))
    self.assertIs(shared_slot, slot)