import json

import pytest
from scrapy import FormRequest, Request
from scrapy.http import HtmlResponse, TextResponse
from scrapy.utils.reqser import request_to_dict
from testfixtures import LogCapture
from w3lib.http import basic_auth_header

from crawlera_fetch import CrawleraFetchException  # assumed import path


def get_test_requests():
    test_requests = []

    # Case 1: plain GET with explicit Fetch API args.
    original1 = Request(
        url="https://httpbin.org/anything",
        method="GET",
        meta={
            "crawlera_fetch": {
                "args": {
                    "render": "no",
                    "region": "us",
                    "iptype": "datacenter",
                    "device": "mobile",
                }
            }
        },
    )
    expected1 = Request(
        url=SETTINGS["CRAWLERA_FETCH_URL"],
        callback=foo_spider.foo_callback,
        method="POST",
        headers={
            "Authorization": basic_auth_header(
                SETTINGS["CRAWLERA_FETCH_APIKEY"], SETTINGS["CRAWLERA_FETCH_APIPASS"]
            ),
            "Content-Type": "application/json",
            "Accept": "application/json",
            "X-Crawlera-JobId": "1/2/3",
        },
        meta={
            "crawlera_fetch": {
                "args": {
                    "render": "no",
                    "region": "us",
                    "iptype": "datacenter",
                    "device": "mobile",
                },
                "original_request": request_to_dict(original1, spider=foo_spider),
                "timing": {"start_ts": mocked_time()},
            },
            "download_slot": "httpbin.org",
        },
        body=json.dumps(
            {
                "url": "https://httpbin.org/anything",
                "body": "",
                "render": "no",
                "region": "us",
                "iptype": "datacenter",
                "device": "mobile",
            }
        ),
    )
    test_requests.append({"original": original1, "expected": expected1})

    # Case 2: FormRequest whose form data must be serialized into the JSON body.
    original2 = FormRequest(
        url="https://httpbin.org/post",
        callback=foo_spider.foo_callback,
        meta={"crawlera_fetch": {"args": {"device": "desktop"}}},
        formdata={"foo": "bar"},
    )
    expected2 = FormRequest(
        url=SETTINGS["CRAWLERA_FETCH_URL"],
        method="POST",
        headers={
            "Authorization": basic_auth_header(
                SETTINGS["CRAWLERA_FETCH_APIKEY"], SETTINGS["CRAWLERA_FETCH_APIPASS"]
            ),
            "Content-Type": "application/json",
            "Accept": "application/json",
            "X-Crawlera-JobId": "1/2/3",
        },
        meta={
            "crawlera_fetch": {
                "args": {"device": "desktop"},
                "original_request": request_to_dict(original2, spider=foo_spider),
                "timing": {"start_ts": mocked_time()},
            },
            "download_slot": "httpbin.org",
        },
        body=json.dumps(
            {
                "url": "https://httpbin.org/post",
                "method": "POST",
                "body": "foo=bar",
                "device": "desktop",
            }
        ),
    )
    test_requests.append({"original": original2, "expected": expected2})

    # Case 3: requests marked with "skip" are expected to pass through the
    # middleware untouched, hence expected=None.
    test_requests.append(
        {
            "original": Request(
                url="https://example.org",
                method="HEAD",
                meta={"crawlera_fetch": {"skip": True}},
            ),
            "expected": None,
        }
    )

    return test_requests
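# NOTE: `SETTINGS`, `foo_spider`, `mocked_time`, and `get_test_middleware` are
# shared fixtures of this test suite and are not defined in this file. The
# following is a minimal sketch of what they are assumed to provide, so the
# tests here can be read standalone. The concrete values, the
# CRAWLERA_FETCH_ENABLED flag, and the CrawleraFetchMiddleware import path are
# assumptions, not necessarily the suite's real ones.

from scrapy import Spider
from scrapy.utils.test import get_crawler

SETTINGS = {
    "CRAWLERA_FETCH_ENABLED": True,  # assumed on/off switch
    "CRAWLERA_FETCH_URL": "https://example.crawlera.com/fetch/v2",  # assumed endpoint
    "CRAWLERA_FETCH_APIKEY": "secret-apikey",  # placeholder credentials
    "CRAWLERA_FETCH_APIPASS": "",
}


class FooSpider(Spider):
    name = "foo"

    def foo_callback(self, response):
        pass


foo_spider = FooSpider()


def mocked_time():
    # Frozen clock, so the "timing" metadata in expected requests is
    # deterministic across a test run.
    return 1588000000.0


def get_test_middleware(settings=None):
    # Build the middleware under test from a throwaway crawler, merging any
    # per-test setting overrides into the base SETTINGS.
    from crawlera_fetch import CrawleraFetchMiddleware  # assumed import path

    settings_dict = dict(SETTINGS, **(settings or {}))
    crawler = get_crawler(FooSpider, settings_dict=settings_dict)
    return CrawleraFetchMiddleware.from_crawler(crawler)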
def test_process_response_error():
    response_list = [
        # Error reported via the X-Crawlera-Error response header.
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            headers={
                "X-Crawlera-Error": "bad_proxy_auth",
                "Proxy-Authenticate": 'Basic realm="Crawlera"',
                "Content-Length": "0",
                "Date": "Mon, 04 May 2020 13:06:15 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
        ),
        # Truncated payload that fails to decode as JSON.
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=b'{"Bad": "JSON',
        ),
        # Well-formed payload whose crawlera_status reports a failure.
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=json.dumps(
                {
                    "url": "https://example.org",
                    "original_status": 503,
                    "headers": {},
                    "crawlera_status": "fail",
                    "crawlera_error": "serverbusy",
                    "body_encoding": "plain",
                    "body": "Server busy: too many outstanding requests",
                }
            ),
            encoding="utf8",
        ),
    ]

    # With CRAWLERA_FETCH_RAISE_ON_ERROR, every error case raises.
    middleware_raise = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for response in response_list:
        with pytest.raises(CrawleraFetchException):
            middleware_raise.process_response(response.request, response, foo_spider)

    assert middleware_raise.stats.get_value("crawlera_fetch/response_error") == 3
    assert middleware_raise.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_raise.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_raise.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1

    # Without it, the response passes through unchanged and a warning is logged.
    middleware_log = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as logs:
        for response in response_list:
            processed = middleware_log.process_response(response.request, response, foo_spider)
            assert response is processed

    logs.check_present(
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)",  # noqa: E501
        ),
    )
    assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3
    assert middleware_log.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_log.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_log.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1
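# The three stat suffixes asserted above (bad_proxy_auth, JSONDecodeError,
# serverbusy) mirror how the middleware presumably classifies a failed Fetch
# API response. A simplified sketch of that decision, not the package's
# actual code:


def classify_error(response):
    # 1. A proxy-level error arrives as an X-Crawlera-Error header.
    header = response.headers.get("X-Crawlera-Error")
    if header:
        return header.decode()  # e.g. "bad_proxy_auth"
    # 2. A payload that cannot be parsed is keyed by the exception class name.
    try:
        payload = json.loads(response.text)
    except json.JSONDecodeError as exc:
        return type(exc).__name__  # "JSONDecodeError"
    # 3. A well-formed payload can still report a failure in-band.
    if payload.get("crawlera_status") == "fail":
        return payload.get("crawlera_error")  # e.g. "serverbusy"
    return None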
"original": HtmlResponse( url=SETTINGS["CRAWLERA_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", "Content-Encoding": "gzip", "Transfer-Encoding": "chunked", "Date": "Fri, 24 Apr 2020 18:06:42 GMT", "Proxy-Connection": "close", "Connection": "close", }, request=Request( url=SETTINGS["CRAWLERA_FETCH_URL"], meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://fake.host.com"), spider=dummy_spider, ), } }, ), body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""", # noqa: E501 ), "expected": TextResponse( url="https://fake.host.com", status=123, headers={"Fake-Header": "true"}, body=b"""foobar""", # noqa: E501 ),
url=SETTINGS["CRAWLERA_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", "Content-Encoding": "gzip", "Transfer-Encoding": "chunked", "Date": "Fri, 24 Apr 2020 18:06:42 GMT", "Proxy-Connection": "close", "Connection": "close", }, request=Request( url=SETTINGS["CRAWLERA_FETCH_URL"], meta={ "crawlera_fetch": { "timing": { "start_ts": mocked_time() }, "original_request": request_to_dict( Request("https://fake.host.com"), spider=foo_spider, ), } }, ), body= b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""", # noqa: E501 ), "expected": TextResponse( url="https://fake.host.com",