def get_test_requests():
    """Build the request fixtures consumed by the request-processing tests.

    Returns:
        A list of three dicts, each holding two keys:

        * ``"original"``: a ``Request``/``FormRequest`` aimed at the target
          site, carrying its options under ``meta["crawlera_fetch"]``.
        * ``"expected"``: the matching POST request aimed at
          ``SETTINGS["CRAWLERA_FETCH_URL"]`` with a JSON body, or ``None``
          for the entry whose meta sets ``"skip": True``.
    """

    def _mobile_args():
        # Fresh dict on every call so the fixtures never share nested state.
        return {
            "render": "no",
            "region": "us",
            "iptype": "datacenter",
            "device": "mobile",
        }

    def _api_headers():
        # Headers carried by every expected request.
        return {
            "Authorization": basic_auth_header(
                SETTINGS["CRAWLERA_FETCH_APIKEY"],
                SETTINGS["CRAWLERA_FETCH_APIPASS"],
            ),
            "Content-Type": "application/json",
            "Accept": "application/json",
            "X-Crawlera-JobId": "1/2/3",
        }

    def _api_meta(args, source):
        # Meta of an expected request: the args, the serialized source
        # request and the (mocked) start timestamp, plus the download slot.
        return {
            "crawlera_fetch": {
                "args": args,
                "original_request": request_to_dict(source, spider=foo_spider),
                "timing": {"start_ts": mocked_time()},
            },
            "download_slot": "httpbin.org",
        }

    # Case 1: plain GET request with several fetch arguments.
    get_source = Request(
        url="https://httpbin.org/anything",
        method="GET",
        meta={"crawlera_fetch": {"args": _mobile_args()}},
    )
    get_expected = Request(
        url=SETTINGS["CRAWLERA_FETCH_URL"],
        callback=foo_spider.foo_callback,
        method="POST",
        headers=_api_headers(),
        meta=_api_meta(_mobile_args(), get_source),
        # "url" and "body" come first, followed by the args in their own order.
        body=json.dumps({
            "url": "https://httpbin.org/anything",
            "body": "",
            **_mobile_args(),
        }),
    )

    # Case 2: form submission with a single fetch argument.
    form_source = FormRequest(
        url="https://httpbin.org/post",
        callback=foo_spider.foo_callback,
        meta={"crawlera_fetch": {"args": {"device": "desktop"}}},
        formdata={"foo": "bar"},
    )
    form_expected = FormRequest(
        url=SETTINGS["CRAWLERA_FETCH_URL"],
        method="POST",
        headers=_api_headers(),
        meta=_api_meta({"device": "desktop"}, form_source),
        body=json.dumps({
            "url": "https://httpbin.org/post",
            "method": "POST",
            "body": "foo=bar",
            "device": "desktop",
        }),
    )

    # Case 3: request flagged with "skip"; it has no expected counterpart.
    skipped_source = Request(
        url="https://example.org",
        method="HEAD",
        meta={"crawlera_fetch": {"skip": True}},
    )

    return [
        {"original": get_source, "expected": get_expected},
        {"original": form_source, "expected": form_expected},
        {"original": skipped_source, "expected": None},
    ]
def test_process_response_error():
    """Check how ``process_response`` handles failed responses.

    Three failing responses are used: one carrying an ``X-Crawlera-Error``
    header, one whose body is truncated JSON, and one whose JSON payload has
    ``crawlera_status`` set to ``"fail"``.

    * With ``CRAWLERA_FETCH_RAISE_ON_ERROR`` set to ``True`` each response
      must raise ``CrawleraFetchException``.
    * With it set to ``False`` the very same response object must be
      returned and a WARNING record must be present for each failure.

    The ``crawlera_fetch/response_error*`` stats are verified in both modes.
    """
    endpoint = "https://crawlera.com/fake/api/endpoint"

    def _api_request():
        # One independent request per response, each with its own timing
        # entry and serialized original request.
        return Request(
            url=endpoint,
            meta={
                "crawlera_fetch": {
                    "timing": {"start_ts": mocked_time()},
                    "original_request": request_to_dict(
                        Request("https://example.org"),
                        spider=foo_spider,
                    ),
                }
            },
        )

    # Failure signalled through the X-Crawlera-Error header.
    header_failure = TextResponse(
        url=endpoint,
        request=_api_request(),
        headers={
            "X-Crawlera-Error": "bad_proxy_auth",
            "Proxy-Authenticate": 'Basic realm="Crawlera"',
            "Content-Length": "0",
            "Date": "Mon, 04 May 2020 13:06:15 GMT",
            "Proxy-Connection": "close",
            "Connection": "close",
        },
    )
    # Failure caused by a body that is not valid JSON.
    json_failure = TextResponse(
        url=endpoint,
        request=_api_request(),
        body=b'{"Bad": "JSON',
    )
    # Failure reported inside a well-formed JSON payload.
    payload_failure = TextResponse(
        url=endpoint,
        request=_api_request(),
        body=json.dumps({
            "url": "https://example.org",
            "original_status": 503,
            "headers": {},
            "crawlera_status": "fail",
            "crawlera_error": "serverbusy",
            "body_encoding": "plain",
            "body": "Server busy: too many outstanding requests",
        }),
        encoding="utf8",
    )
    failures = [header_failure, json_failure, payload_failure]

    stat_counts = {
        "crawlera_fetch/response_error": 3,
        "crawlera_fetch/response_error/bad_proxy_auth": 1,
        "crawlera_fetch/response_error/JSONDecodeError": 1,
        "crawlera_fetch/response_error/serverbusy": 1,
    }

    def _check_stats(middleware):
        for stat_key, count in stat_counts.items():
            assert middleware.stats.get_value(stat_key) == count

    # Mode 1: errors are raised.
    raising = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for failed in failures:
        with pytest.raises(CrawleraFetchException):
            raising.process_response(failed.request, failed, foo_spider)
    _check_stats(raising)

    # Mode 2: errors are logged and the response is handed back as-is.
    logging_only = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as captured:
        for failed in failures:
            returned = logging_only.process_response(
                failed.request, failed, foo_spider)
            assert failed is returned

    warning_messages = [
        "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        "Error downloading <GET https://example.org> (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)",  # noqa: E501
    ]
    captured.check_present(*[
        ("crawlera-fetch-middleware", "WARNING", message)
        for message in warning_messages
    ])

    _check_stats(logging_only)
Пример #3
0
 "original": HtmlResponse(
     url=SETTINGS["CRAWLERA_FETCH_URL"],
     status=200,
     headers={
         "Content-Type": "application/json",
         "Content-Encoding": "gzip",
         "Transfer-Encoding": "chunked",
         "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
         "Proxy-Connection": "close",
         "Connection": "close",
     },
     request=Request(
         url=SETTINGS["CRAWLERA_FETCH_URL"],
         meta={
             "crawlera_fetch": {
                 "timing": {"start_ts": mocked_time()},
                 "original_request": request_to_dict(
                     Request("https://fake.host.com"),
                     spider=dummy_spider,
                 ),
             }
         },
     ),
     body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""",  # noqa: E501
 ),
 "expected": TextResponse(
     url="https://fake.host.com",
     status=123,
     headers={"Fake-Header": "true"},
     body=b"""foobar""",  # noqa: E501
 ),
Пример #4
0
     url=SETTINGS["CRAWLERA_FETCH_URL"],
     status=200,
     headers={
         "Content-Type": "application/json",
         "Content-Encoding": "gzip",
         "Transfer-Encoding": "chunked",
         "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
         "Proxy-Connection": "close",
         "Connection": "close",
     },
     request=Request(
         url=SETTINGS["CRAWLERA_FETCH_URL"],
         meta={
             "crawlera_fetch": {
                 "timing": {
                     "start_ts": mocked_time()
                 },
                 "original_request":
                 request_to_dict(
                     Request("https://fake.host.com"),
                     spider=foo_spider,
                 ),
             }
         },
     ),
     body=
     b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""",  # noqa: E501
 ),
 "expected":
 TextResponse(
     url="https://fake.host.com",
Пример #5
0
def test_process_response_error():
    """Check how ``process_response`` handles failed responses.

    Two failing responses are used: one carrying an ``X-Crawlera-Error``
    header and one whose body is truncated JSON.

    * With ``CRAWLERA_FETCH_RAISE_ON_ERROR`` set to ``True`` each response
      must raise ``CrawleraFetchException``.
    * With it set to ``False`` the very same response object must be
      returned and an ERROR record must be present for each failure.

    The ``crawlera_fetch/response_error*`` stats are verified in both modes.
    """
    endpoint = "https://crawlera.com/fake/api/endpoint"

    def _api_request():
        # One independent request per response; the original request is
        # given directly as a plain dict.
        return Request(
            url=endpoint,
            meta={
                "crawlera_fetch": {
                    "timing": {"start_ts": mocked_time()},
                    "original_request": {
                        "url": "https://example.org",
                        "method": "GET",
                    },
                }
            },
        )

    # Failure signalled through the X-Crawlera-Error header.
    header_failure = TextResponse(
        url=endpoint,
        request=_api_request(),
        headers={
            "X-Crawlera-Error": "bad_proxy_auth",
            "Proxy-Authenticate": 'Basic realm="Crawlera"',
            "Content-Length": "0",
            "Date": "Mon, 04 May 2020 13:06:15 GMT",
            "Proxy-Connection": "close",
            "Connection": "close",
        },
    )
    # Failure caused by a body that is not valid JSON.
    json_failure = TextResponse(
        url=endpoint,
        request=_api_request(),
        body=b'{"Bad": "JSON',
    )
    failures = [header_failure, json_failure]

    stat_counts = {
        "crawlera_fetch/response_error": 2,
        "crawlera_fetch/response_error/bad_proxy_auth": 1,
        "crawlera_fetch/response_error/JSONDecodeError": 1,
    }

    def _check_stats(middleware):
        for stat_key, count in stat_counts.items():
            assert middleware.stats.get_value(stat_key) == count

    # Mode 1: errors are raised.
    raising = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for failed in failures:
        with pytest.raises(CrawleraFetchException):
            # A new spider instance is created for every call.
            raising.process_response(failed.request, failed, Spider("foo"))
    _check_stats(raising)

    # Mode 2: errors are logged and the response is handed back as-is.
    logging_only = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as captured:
        for failed in failures:
            returned = logging_only.process_response(
                failed.request, failed, Spider("foo"))
            assert failed is returned

    error_messages = [
        "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
    ]
    captured.check_present(*[
        ("crawlera-fetch-middleware", "ERROR", message)
        for message in error_messages
    ])

    _check_stats(logging_only)