def test_magic_response2():
    """Check 'body' handling and the dict-shaped 'headers' format.

    The request is built with ``dont_send_headers=True``, so no 'headers'
    entry is expected in the prerender args.  The JSON reply carries a
    base64-encoded 'body' and a plain dict of headers.
    """
    middleware = _get_mw()
    request = PrerenderRequest(
        'http://example.com/',
        magic_response=True,
        headers={'foo': 'bar'},
        dont_send_headers=True,
    )
    request = middleware.process_request(request, None)
    assert 'headers' not in request.meta['prerender']['args']

    payload = {
        'body': base64.b64encode(b"binary data").decode('ascii'),
        'headers': {'Content-Type': 'text/plain'},
    }
    raw_response = TextResponse(
        "http://myprerender.example.com/execute",
        headers={b'Content-Type': b'application/json'},
        body=json.dumps(payload).encode('utf8'),
    )
    processed = middleware.process_response(request, raw_response, None)

    assert processed.data == payload
    assert processed.body == b'binary data'
    assert processed.headers == {b'Content-Type': [b'text/plain']}
    assert processed.prerender_response_headers == {
        b'Content-Type': [b'application/json']
    }
    assert processed.status == processed.prerender_response_status == 200
    assert processed.url == "http://example.com/"
def test_magic_response_http_error():
    """Check status handling when the script reports an ``http404`` error.

    The reply from the rendering service has status 400 and a ScriptError
    payload whose ``info.error`` is ``"http404"``; the processed response is
    expected to expose status 404 while keeping 400 as
    ``prerender_response_status``.
    """
    middleware = _get_mw()
    request = middleware.process_request(
        PrerenderRequest('http://example.com/foo'), None)

    error_payload = {
        "info": {
            "error": "http404",
            "message": "Lua error: [string \"function main(prerender)\r...\"]:3: http404",
            "line_number": 3,
            "type": "LUA_ERROR",
            "source": "[string \"function main(prerender)\r...\"]"
        },
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    raw_response = TextResponse(
        "http://myprerender.example.com/execute",
        status=400,
        headers={b'Content-Type': b'application/json'},
        body=json.dumps(error_payload).encode('utf8'),
    )
    processed = middleware.process_response(request, raw_response, None)

    assert processed.data == error_payload
    assert processed.status == 404
    assert processed.prerender_response_status == 400
    assert processed.url == "http://example.com/foo"
def _get_req():
    """Return a magic-response ``execute`` request with a trivial Lua script."""
    request_kwargs = dict(
        url="http://example.com",
        endpoint='execute',
        magic_response=True,
        args={'lua_source': 'function main(prerender) end'},
    )
    return PrerenderRequest(**request_kwargs)
def requests():
    """Build the list of sample requests: prerender ones first, then plain.

    The numeric comments give each request's index in the returned list:
    0-11 are ``PrerenderRequest`` objects, 12-15 are ``scrapy.Request``
    objects.
    """
    url1 = "http://example.com/foo?x=1&y=2"
    url2 = "http://example.com/foo?y=2&x=1"
    url3 = "http://example.com/foo?x=1&y=2&z=3"
    url4 = "http://example.com/foo?x=1&y=2#id2"
    url5 = "http://example.com/foo?x=1&y=2#!id2"

    prerender_kwargs = [
        {'url': url1},  # 0
        {'url': url1, 'method': 'POST'},  # 1
        {'url': url1, 'endpoint': 'render.har'},  # 2
        {'url': url2},  # 3
        {'url': url1, 'args': {'wait': 0.5}},  # 4
        {'url': url2, 'args': {'wait': 0.5}},  # 5
        {'url': url3},  # 6
        {'url': url2, 'method': 'POST'},  # 7
        {'args': {'wait': 0.5}},  # 8
        {'args': {'wait': 0.5}},  # 9
        {'args': {'wait': 0.7}},  # 10
        {'url': url4},  # 11
    ]

    result = []
    for params in prerender_kwargs:
        result.append(PrerenderRequest(**params))

    result.append(scrapy.Request(url=url1))  # 12
    result.append(scrapy.Request(url=url2))  # 13
    result.append(scrapy.Request(url=url4))  # 14
    result.append(scrapy.Request(url=url5))  # 15
    return result
def parse_3(self, response):
    """Yield the response as an item, then a request carrying a huge cookie."""
    yield {'response': response}

    # Prerender (Twisted) drops requests with huge http headers,
    # but this one should work, as cookies are not sent
    # to Prerender itself.
    target_url = self.url + "#bar"
    yield PrerenderRequest(
        target_url,
        self.parse_4,
        endpoint='execute',
        args={'lua_source': DEFAULT_SCRIPT},
        cookies={'bomb': BOMB},
    )
def request_with_cookies(cookies):
    """Build an ``execute`` request with *cookies* and run it through both middlewares.

    The cookies middleware is applied first, then the prerender middleware;
    when a middleware returns a falsy value the request is kept as it was.
    """
    request = PrerenderRequest(
        'http://example.com/foo',
        endpoint='execute',
        args={'lua_source': 'function main() end'},
        magic_response=True,
        cookies=cookies,
    )
    for middleware in (cookie_mw, mw):
        request = middleware.process_request(request, None) or request
    return request
def parse(self, response):
    """Yield a ``render.json`` request (HAR + HTML) for every extracted link."""
    extracted_links = LinkExtractor().extract_links(response)
    for extracted in extracted_links:
        # A fresh args dict is built for each request.
        render_args = {
            'har': 1,
            'html': 1,
        }
        yield PrerenderRequest(
            extracted.url,
            self.parse_link,
            endpoint='render.json',
            args=render_args,
        )
def test_dont_process_response():
    """With ``dont_process_response=True`` the response must pass through untouched."""
    middleware = _get_mw()
    request = PrerenderRequest(
        "http://example.com/",
        endpoint="render",
        dont_process_response=True,
    )
    prepared = middleware.process_request(request, None)

    original_response = Response("http://example.com/")
    returned = middleware.process_response(prepared, original_response, None)

    # Same class and, in fact, the very same object.
    assert returned.__class__ is Response
    assert returned is original_response
def test_change_response_class_to_text():
    """A decodable body with a binary content type should yield a TextResponse."""
    middleware = _get_mw()
    request = middleware.process_request(
        PrerenderRequest('http://example.com/', magic_response=True), None)

    # Such response can come when downloading a file,
    # or returning prerender:html(): the headers say it's binary,
    # but it can be decoded so it becomes a TextResponse.
    incoming = TextResponse(
        'http://myprerender.example.com/execute',
        headers={b'Content-Type': b'application/pdf'},
        body=b'ascii binary data',
        encoding='utf-8',
    )
    processed = middleware.process_response(request, incoming, None)

    assert isinstance(processed, TextResponse)
    assert processed.url == 'http://example.com/'
    assert processed.headers == {b'Content-Type': [b'application/pdf']}
    assert processed.body == b'ascii binary data'
def test_unicode_url():
    """A request created from a unicode URL keeps that URL on the processed response."""
    middleware = _get_mw()
    # note unicode URL
    request = PrerenderRequest(u"http://example.com/", endpoint='execute')
    prepared = middleware.process_request(request, None)

    payload = {'html': '<html><body>Hello</body></html>'}
    encoded_payload = json.dumps(payload).encode('utf8')
    # Scrapy doesn't pass the request to the constructor, so no
    # ``request=`` argument is given here.
    incoming = TextResponse(
        "http://myprerender.example.com/execute",
        headers={b'Content-Type': b'application/json'},
        body=encoded_payload,
    )
    processed = middleware.process_response(prepared, incoming, None)
    assert processed.url == "http://example.com/"
def test_prerender_request_no_url():
    """A request without a URL is sent with ``'url': 'about:blank'`` in its body."""
    middleware = _get_mw()
    lua_source = "function main(prerender) return {result='ok'} end"
    prerender_meta = {
        'args': {'lua_source': lua_source},
        'endpoint': 'execute',
    }
    urlless_request = PrerenderRequest(meta={'prerender': prerender_meta})

    prepared = middleware.process_request(urlless_request, None)

    assert prepared.url == 'http://127.0.0.1:8050/execute'
    sent_body = json.loads(to_native_str(prepared.body))
    assert sent_body == {
        'url': 'about:blank',
        'lua_source': lua_source
    }
def test_prerender_request():
    """Round-trip a default PrerenderRequest through both middlewares.

    Covers request preprocessing (URL rewrite, POST body, repr), response
    post-processing (class, URLs, body, headers, selectors) and the
    response's ``.replace`` method.
    """
    middleware = _get_mw()
    cookies_middleware = _get_cookie_mw()

    original = PrerenderRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(original) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # check request preprocessing
    prepared = original
    for step in (cookies_middleware, middleware):
        prepared = step.process_request(prepared, None) or prepared
    assert prepared is not None
    assert prepared is not original
    assert prepared.url == "http://127.0.0.1:8050/render"
    assert prepared.headers == {b'Content-Type': [b'application/json']}
    assert prepared.method == 'POST'
    assert isinstance(prepared, PrerenderRequest)
    assert repr(prepared) == (
        "<GET http://example.com?foo=bar&url=1&wait=100 "
        "via http://127.0.0.1:8050/render>"
    )

    expected_body = {'url': original.url}
    assert json.loads(to_native_str(prepared.body)) == expected_body

    # check response post-processing
    # (Scrapy doesn't pass the request to the constructor, so no
    # ``request=`` argument is given here.)
    html = b"<html><body>Hello</body></html>"
    incoming = TextResponse(
        "http://127.0.0.1:8050/render",
        headers={b'Content-Type': b'text/html'},
        body=b"<html><body>Hello</body></html>",
    )
    processed = middleware.process_response(prepared, incoming, None)
    processed = cookies_middleware.process_response(prepared, processed, None)
    assert isinstance(processed, scrapy_prerender.PrerenderTextResponse)
    assert processed is not incoming
    assert processed.real_url == prepared.url
    assert processed.url == original.url
    assert processed.body == html
    assert processed.css("body").extract_first() == "<body>Hello</body>"
    assert processed.headers == {b'Content-Type': [b'text/html']}

    # check .replace method
    replaced = processed.replace(status=404)
    assert replaced.status == 404
    assert isinstance(replaced, scrapy_prerender.PrerenderTextResponse)
    for attr_name in ('url', 'real_url', 'headers', 'body'):
        assert getattr(replaced, attr_name) == getattr(processed, attr_name)
def test_change_response_class_to_json_binary():
    """A JSON content type with an undecodable body is returned as a Response."""
    middleware = _get_mw()

    # We set magic_response to False, because it's not a kind of data we would
    # expect from prerender: we just return binary data.
    # If we set magic_response to True, the middleware will fail,
    # but this is ok because magic_response presumes we are expecting
    # a valid prerender json response.
    request = middleware.process_request(
        PrerenderRequest('http://example.com/', magic_response=False), None)

    binary_body = b'non-decodable data: \x98\x11\xe7\x17\x8f'
    incoming = Response(
        'http://myprerender.example.com/execute',
        headers={b'Content-Type': b'application/json'},
        body=b'non-decodable data: \x98\x11\xe7\x17\x8f',
    )
    processed = middleware.process_response(request, incoming, None)

    assert isinstance(processed, Response)
    assert processed.url == 'http://example.com/'
    assert processed.headers == {b'Content-Type': [b'application/json']}
    assert processed.body == binary_body
def test_prerender_request_meta():
    """User-supplied meta is kept on the request and the caller's dict is not mutated."""
    user_meta = {'foo': 'bar'}
    request = PrerenderRequest('http://example.com', meta=user_meta)

    request_meta = request.meta
    assert 'prerender' in request_meta
    assert request_meta['foo'] == 'bar'
    # The dict passed in must still contain only what the caller put there.
    assert user_meta == {'foo': 'bar'}
def test_magic_response():
    """Check magic-response handling together with the cookies middleware.

    Two request/response rounds are run through the same middleware pair.
    The first checks that body, headers, status, URL and cookies of the
    processed response are taken from the JSON payload.  The second checks
    the contents of the cookiejar after another response.
    """
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    # First round: the request carries a single cookie foo=bar.
    req = PrerenderRequest('http://example.com/',
                           endpoint='execute',
                           args={'lua_source': 'function main() end'},
                           magic_response=True,
                           cookies=[{
                               'name': 'foo',
                               'value': 'bar'
                           }])
    req = cookie_mw.process_request(req, None) or req
    req = mw.process_request(req, None) or req

    # JSON payload returned by the rendering service; the assertions below
    # expect url, html, http_status, headers and cookies to be surfaced on
    # the processed response.
    resp_data = {
        'url': "http://exmaple.com/#id42",
        'html': '<html><body>Hello 404</body></html>',
        'http_status': 404,
        'headers': [
            {
                'name': 'Content-Type',
                'value': "text/html"
            },
            {
                'name': 'X-My-Header',
                'value': "foo"
            },
            {
                'name': 'Set-Cookie',
                'value': "bar=baz"
            },
        ],
        'cookies': [
            {
                'name': 'foo',
                'value': 'bar'
            },
            {
                'name': 'bar',
                'value': 'baz',
                'domain': '.example.com'
            },
            {
                'name': 'session',
                'value': '12345',
                'path': '/',
                'expires': '2055-07-24T19:20:30Z'
            },
        ],
    }
    resp = TextResponse("http://myprerender.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    resp2 = cookie_mw.process_response(req, resp2, None)
    assert isinstance(resp2, scrapy_prerender.PrerenderJsonResponse)
    assert resp2.data == resp_data
    assert resp2.body == b'<html><body>Hello 404</body></html>'
    assert resp2.text == '<html><body>Hello 404</body></html>'
    assert resp2.headers == {
        b'Content-Type': [b'text/html'],
        b'X-My-Header': [b'foo'],
        b'Set-Cookie': [b'bar=baz'],
    }
    # The headers of the actual HTTP reply stay available separately.
    assert resp2.prerender_response_headers == {
        b'Content-Type': [b'application/json']
    }
    assert resp2.status == 404
    assert resp2.prerender_response_status == 200
    assert resp2.url == "http://exmaple.com/#id42"
    assert len(resp2.cookiejar) == 3
    cookies = [c for c in resp2.cookiejar]
    assert {(c.name, c.value) for c in cookies} == {('bar', 'baz'),
                                                    ('foo', 'bar'),
                                                    ('session', '12345')}

    # send second request using the same session and check the resulting cookies
    req = PrerenderRequest('http://example.com/foo',
                           endpoint='execute',
                           args={'lua_source': 'function main() end'},
                           magic_response=True,
                           cookies={'spam': 'ham'})
    req = cookie_mw.process_request(req, None) or req
    req = mw.process_request(req, None) or req

    resp_data = {
        'html': '<html><body>Hello</body></html>',
        'headers': [
            {
                'name': 'Content-Type',
                'value': "text/html"
            },
            {
                'name': 'X-My-Header',
                'value': "foo"
            },
            {
                'name': 'Set-Cookie',
                'value': "bar=baz"
            },
        ],
        'cookies': [
            {
                'name': 'spam',
                'value': 'ham'
            },
            {
                'name': 'egg',
                'value': 'spam'
            },
            {
                'name': 'bar',
                'value': 'baz',
                'domain': '.example.com'
            },
            #{'name': 'foo', 'value': ''},  -- this won't be in response
            {
                'name': 'session',
                'value': '12345',
                'path': '/',
                'expires': '2056-07-24T19:20:30Z'
            },
        ],
    }
    resp = TextResponse("http://myprerender.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    resp2 = cookie_mw.process_response(req, resp2, None)
    assert isinstance(resp2, scrapy_prerender.PrerenderJsonResponse)
    assert resp2.data == resp_data
    cookies = [c for c in resp2.cookiejar]
    # 'foo' from the first round is absent from this payload and is expected
    # to be gone from the jar.
    assert {c.name for c in cookies} == {'session', 'egg', 'bar', 'spam'}
    for c in cookies:
        if c.name == 'session':
            # Expected to match the 'expires' value of the second payload
            # ('2056-07-24T19:20:30Z') as a numeric timestamp.
            assert c.expires == 2731692030
        if c.name == 'spam':
            assert c.value == 'ham'
def parse(self, response):
    """Yield the response as an item, then a follow-up request to ``url#foo``."""
    item = {'response': response}
    yield item

    follow_up_url = self.url + '#foo'
    yield PrerenderRequest(follow_up_url)
def start_requests(self):
    """Yield a single default PrerenderRequest for ``self.url``."""
    initial_request = PrerenderRequest(self.url)
    yield initial_request
def parse_1(self, response):
    """Yield the response as an item, then an ``execute`` request handled by ``parse_2``."""
    yield {'response': response}

    next_url = self.url + "#foo"
    yield PrerenderRequest(
        next_url,
        self.parse_2,
        endpoint='execute',
        args={'lua_source': DEFAULT_SCRIPT},
    )
def _request(self, url):
    """Return an ``execute`` request for *url* that lists ``lua_source`` in ``cache_args``."""
    script_args = {'lua_source': DEFAULT_SCRIPT, 'x': 'yy'}
    return PrerenderRequest(
        url,
        endpoint='execute',
        args=script_args,
        cache_args=['lua_source'],
    )
def start_requests(self):
    """Yield a single ``execute`` request for ``self.url`` running DEFAULT_SCRIPT."""
    script_args = {'lua_source': DEFAULT_SCRIPT}
    yield PrerenderRequest(
        self.url,
        endpoint='execute',
        args=script_args,
    )
def start_requests(self):
    """Yield a single ``execute`` request for ``url#foo`` with an extra ``foo`` argument."""
    fragment_url = self.url + "#foo"
    script_args = {'lua_source': DEFAULT_SCRIPT, 'foo': 'bar'}
    yield PrerenderRequest(
        fragment_url,
        endpoint='execute',
        args=script_args,
    )
def test_prerender_request_parameters():
    """Check that explicit PrerenderRequest parameters reach ``meta['prerender']``.

    Also checks the post-processed JSON response: class, URLs, decoded data,
    body/text, encoding, headers and status.
    """
    middleware = _get_mw()
    cookies_middleware = _get_cookie_mw()

    def cb():
        pass

    original = PrerenderRequest(
        "http://example.com/#!start",
        cb,
        'POST',
        body="foo=bar",
        prerender_url="http://myprerender.example.com",
        slot_policy=SlotPolicy.SINGLE_SLOT,
        endpoint="execute",
        prerender_headers={'X-My-Header': 'value'},
        args={
            "lua_source": "function main() end",
            "myarg": 3.0,
        },
        magic_response=False,
        headers={'X-My-Header': 'value'},
    )
    prepared = cookies_middleware.process_request(original, None) or original
    prepared = middleware.process_request(prepared, None)

    expected_args = {
        'url': "http://example.com/#!start",
        'http_method': 'POST',
        'body': 'foo=bar',
        'cookies': [],
        'lua_source': 'function main() end',
        'myarg': 3.0,
        'headers': {
            'X-My-Header': 'value',
        }
    }
    assert prepared.meta['prerender'] == {
        'endpoint': 'execute',
        'prerender_url': "http://myprerender.example.com",
        'slot_policy': SlotPolicy.SINGLE_SLOT,
        'prerender_headers': {
            'X-My-Header': 'value'
        },
        'magic_response': False,
        'session_id': 'default',
        'http_status_from_error_code': True,
        'args': expected_args,
    }
    assert prepared.callback == cb
    assert prepared.headers == {
        b'Content-Type': [b'application/json'],
        b'X-My-Header': [b'value'],
    }

    # check response post-processing
    payload = {
        'html': '<html><body>Hello</body></html>',
        'num_divs': 0.0,
    }
    payload_text = json.dumps(payload)
    # Scrapy doesn't pass the request to the constructor, so no
    # ``request=`` argument is given here.
    incoming = TextResponse(
        "http://myprerender.example.com/execute",
        headers={b'Content-Type': b'application/json'},
        body=payload_text.encode('utf8'),
    )
    processed = middleware.process_response(prepared, incoming, None)
    processed = cookies_middleware.process_response(prepared, processed, None)

    assert isinstance(processed, scrapy_prerender.PrerenderJsonResponse)
    assert processed is not incoming
    assert processed.real_url == prepared.url
    assert processed.url == original.meta['prerender']['args']['url']
    assert processed.data == payload
    assert processed.body == payload_text.encode('utf8')
    assert processed.text == processed.body_as_unicode() == payload_text
    assert processed.encoding == 'utf8'
    assert processed.headers == {b'Content-Type': [b'application/json']}
    assert processed.prerender_response_headers == processed.headers
    assert processed.status == processed.prerender_response_status == 200
def test_cache_args():
    """Walk a cached argument through the save / load / expired cycle.

    Three requests share the same ``lua_source`` listed in ``cache_args``:
    the first is expected to be sent with ``save_args``, the second with
    ``load_args``, and the third receives a 498 "ExpiredArguments" reply,
    after which the middleware is expected to return a new request that
    is sent with ``save_args`` again.  The arrow comments mark which
    component the request is conceptually travelling to or from.
    """
    spider = scrapy.Spider(name='foo')
    mw = _get_mw()
    mw.crawler.spider = spider
    mw.spider_opened(spider)
    dedupe_mw = PrerenderDeduplicateArgsMiddleware()

    # ========= Send first request - it should use save_args:
    lua_source = 'function main(prerender) end'
    req = PrerenderRequest('http://example.com/foo',
                           endpoint='execute',
                           args={'lua_source': lua_source},
                           cache_args=['lua_source'])

    assert req.meta['prerender']['args']['lua_source'] == lua_source
    # <---- spider
    req, = list(dedupe_mw.process_start_requests([req], spider))
    # ----> scheduler
    # After the dedupe middleware the argument no longer holds the literal
    # script; the stored value is keyed by what the argument holds now.
    assert req.meta['prerender']['args']['lua_source'] != lua_source
    assert list(mw._argument_values.values()) == [lua_source]
    assert list(mw._argument_values.keys()) == [
        req.meta['prerender']['args']['lua_source']
    ]
    # <---- scheduler
    # process request before sending it to the downloader
    req = mw.process_request(req, spider) or req
    # -----> downloader
    assert req.meta['prerender']['args']['lua_source'] == lua_source
    assert req.meta['prerender']['args']['save_args'] == ['lua_source']
    assert 'load_args' not in req.meta['prerender']['args']
    assert req.meta['prerender']['_local_arg_fingerprints'] == {
        'lua_source': list(mw._argument_values.keys())[0]
    }
    # <---- downloader
    # The reply reports the saved-argument key in a response header.
    resp_body = b'{}'
    resp = TextResponse(
        "http://example.com",
        headers={
            b'Content-Type': b'application/json',
            b'X-Prerender-Saved-Arguments': b'lua_source=ba001160ef96fe2a3f938fea9e6762e204a562b3'
        },
        body=resp_body)
    resp = mw.process_response(req, resp, None)

    # ============ Send second request - it should use load_args
    req2 = PrerenderRequest('http://example.com/bar',
                            endpoint='execute',
                            args={'lua_source': lua_source},
                            cache_args=['lua_source'])
    # A plain dict item is mixed into the spider output; it is expected to
    # come back unchanged.
    req2, item = list(
        dedupe_mw.process_spider_output(resp, [req2, {
            'key': 'value'
        }], spider))
    assert item == {'key': 'value'}
    # ----> scheduler
    assert req2.meta['prerender']['args']['lua_source'] != lua_source
    # <---- scheduler
    # process request before sending it to the downloader
    req2 = mw.process_request(req2, spider) or req2
    # -----> downloader
    assert req2.meta['prerender']['args']['load_args'] == {
        "lua_source": "ba001160ef96fe2a3f938fea9e6762e204a562b3"
    }
    assert "lua_source" not in req2.meta['prerender']['args']
    assert "save_args" not in req2.meta['prerender']['args']
    assert json.loads(req2.body.decode('utf8')) == {
        'load_args': {
            'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'
        },
        'url': 'http://example.com/bar'
    }
    # <---- downloader
    resp = TextResponse("http://example.com/bar",
                        headers={b'Content-Type': b'application/json'},
                        body=b'{}')
    # NOTE(review): this passes `req` (the first request), not `req2` --
    # confirm whether that is intentional.
    resp = mw.process_response(req, resp, spider)

    # =========== Third request is dispatched to another server where
    # =========== arguments are expired:
    req3 = PrerenderRequest('http://example.com/baz',
                            endpoint='execute',
                            args={'lua_source': lua_source},
                            cache_args=['lua_source'])
    req3, = list(dedupe_mw.process_spider_output(resp, [req3], spider))
    # ----> scheduler
    assert req3.meta['prerender']['args']['lua_source'] != lua_source
    # <---- scheduler
    req3 = mw.process_request(req3, spider) or req3
    # -----> downloader
    assert json.loads(req3.body.decode('utf8')) == {
        'load_args': {
            'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'
        },
        'url': 'http://example.com/baz'
    }
    # <---- downloader

    # Simulated 498 reply telling the client its saved arguments expired.
    resp_body = json.dumps({
        "type": "ExpiredArguments",
        "description": "Arguments stored with ``save_args`` are expired",
        "info": {
            "expired": ["html"]
        },
        "error": 498
    })
    resp = TextResponse("127.0.0.1:8050",
                        headers={b'Content-Type': b'application/json'},
                        status=498,
                        body=resp_body.encode('utf8'))
    # For this reply process_response is expected to hand back a request
    # rather than a response.
    req4 = mw.process_response(req3, resp, spider)
    assert isinstance(req4, PrerenderRequest)

    # process this request again
    req4, = list(dedupe_mw.process_spider_output(resp, [req4], spider))
    req4 = mw.process_request(req4, spider) or req4

    # it should become save_args request after all middlewares
    assert json.loads(req4.body.decode('utf8')) == {
        'lua_source': 'function main(prerender) end',
        'save_args': ['lua_source'],
        'url': 'http://example.com/baz'
    }
    assert mw._remote_keys == {}
def test_meta_None():
    """Passing ``meta=None`` must give the same meta as omitting ``meta``."""
    without_meta = PrerenderRequest('http://example.com')
    with_none_meta = PrerenderRequest('http://example.com', meta=None)
    assert without_meta.meta == with_none_meta.meta
def test_prerender_request_url_with_fragment():
    """The URL fragment must be kept in the URL sent in the request body."""
    middleware = _get_mw()
    url = "http://example.com#id1"

    prepared = middleware.process_request(PrerenderRequest(url), None)

    sent_body = json.loads(to_native_str(prepared.body))
    assert sent_body == {'url': url}
def parse(self, response):
    """Yield an ``execute`` request for ``url#egg`` that sets the ``x-set-prerender`` cookie."""
    egg_url = self.url + "#egg"
    yield PrerenderRequest(
        egg_url,
        self.parse_1,
        endpoint='execute',
        args={'lua_source': DEFAULT_SCRIPT},
        cookies={'x-set-prerender': '1'},
    )