# Imports reconstructed to make these snippets self-contained. The exact
# module paths for the crawlera_fetch objects and for the test helpers
# (foo_spider, get_test_middleware, get_test_requests, test_requests,
# test_responses, mocked_time, shub_jobkey_env_variable) are assumptions
# based on how the snippets use them.
import json
import random
from copy import deepcopy
from logging import Formatter, LogRecord

import pytest
from scrapy import Request, Spider
from scrapy.http import Response, TextResponse
from scrapy.utils.reqser import request_to_dict
from testfixtures import LogCapture
from twisted.python.failure import Failure

from crawlera_fetch import (
    CrawleraFetchException,
    CrawleraFetchLogFormatter,
    DownloadSlotPolicy,
)
from tests.data import get_test_requests, test_requests, test_responses
from tests.utils import (
    foo_spider,
    get_test_middleware,
    mocked_time,
    shub_jobkey_env_variable,
)


def test_process_response_disabled():
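    # With the middleware disabled, process_response must return the
    # response unchanged.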
    middleware = get_test_middleware(
        settings={"CRAWLERA_FETCH_ENABLED": False})
    for case in test_responses:
        response = case["original"]
        assert middleware.process_response(response.request, response,
                                           foo_spider) is response

# Example #2
def test_log_formatter_scrapy_1():
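    # The custom log formatter should reference the original request in
    # "Crawled" messages; under Scrapy 1.x the original URL also appears in
    # the request flags, hence the "['original url: ...']" suffix below.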
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()

    for case in get_test_requests():
        original = case["original"]
        response = Response(original.url)
        processed = middleware.process_request(original, foo_spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, foo_spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=1,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        expected = "Crawled (200) {request} ['original url: {url}'] (referer: None)".format(
            request=original, url=original.url)
        assert logstr == expected
def test_process_request_disabled():
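    # With the middleware disabled, process_request must leave requests
    # alone (return None).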
    middleware = get_test_middleware(
        settings={"CRAWLERA_FETCH_ENABLED": False})
    for case in get_test_requests():
        request = case["original"]
        with shub_jobkey_env_variable():
            assert middleware.process_request(request, foo_spider) is None
def test_process_request_single_download_slot():
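    # The Single download-slot policy should assign every processed request
    # to the same fixed slot ("__crawlera_fetch__").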
    middleware = get_test_middleware(
        settings={
            "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY": DownloadSlotPolicy.Single
        })

    for case in get_test_requests():
        original = case["original"]
        expected = case["expected"]
        if expected:
            expected.meta["download_slot"] = "__crawlera_fetch__"

        with shub_jobkey_env_variable():
            processed = middleware.process_request(original, foo_spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            assert type(processed) is type(expected)
            assert processed.url == expected.url
            assert processed.method == expected.method
            assert processed.headers == expected.headers
            assert processed.meta == expected.meta
            processed_text = processed.body.decode(processed.encoding)
            expected_text = expected.body.decode(expected.encoding)
            assert json.loads(processed_text) == json.loads(expected_text)
def test_process_request_scrapy_1():
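    # Under Scrapy 1.x, the middleware records the original URL in the
    # request flags.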
    from tests.utils import get_test_middleware

    middleware = get_test_middleware()
    request = Request("https://example.org")
    with shub_jobkey_env_variable():
        processed = middleware.process_request(request, foo_spider)
    assert processed.flags == ["original url: https://example.org"]
def test_stats(mocked_time):
    middleware = get_test_middleware()
    spider = Spider("foo")

    count = 100
    nums = list(range(count))
    random.shuffle(nums)
    status_list = [random.randint(1, 15) for _ in range(count)]
    method_list = [
        random.choice(["GET", "POST", "PUT", "DELETE", "HEAD"])
        for _ in range(count)
    ]

    # expected values
    latencies = [2**n - n for n in nums]
    total_latency = sum(latencies)
    avg_latency = total_latency / count
    max_latency = max(latencies)

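    # time() is mocked so each request starts at t=n and its response
    # arrives at t=2**n, matching the 2**n - n latencies computed above.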
    for n, status, method in zip(nums, status_list, method_list):
        request = Request("https://example.org", method=method)
        mocked_time.return_value = n  # start_ts
        processed_request = middleware.process_request(request, spider)

        response = TextResponse(
            url="https://example.org",
            request=processed_request,
            body=json.dumps({
                "headers": {},
                "original_status": status,
                "body": "",
                "url": "http://"
            }).encode("utf-8"),
        )

        mocked_time.return_value = 2**n  # end_ts
        middleware.process_response(processed_request, response, spider)

    middleware.spider_closed(spider, "finished")

    assert middleware.stats.get_value("crawlera_fetch/request_count") == count
    assert middleware.stats.get_value("crawlera_fetch/response_count") == count
    assert middleware.stats.get_value(
        "crawlera_fetch/total_latency") == total_latency
    assert middleware.stats.get_value(
        "crawlera_fetch/avg_latency") == avg_latency
    assert middleware.stats.get_value(
        "crawlera_fetch/max_latency") == max_latency
    for status in set(status_list):
        sc = middleware.stats.get_value(
            "crawlera_fetch/response_status_count/{}".format(status))
        assert sc == status_list.count(status)
    for method in set(method_list):
        mc = middleware.stats.get_value(
            "crawlera_fetch/request_method_count/{}".format(method))
        assert mc == method_list.count(method)

# Example #7
def test_log_formatter_scrapy_2():
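    # Scrapy 2.x variant: crawled, spider_error and download_error messages
    # must all reference the original request rather than the API endpoint.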
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()
    spider = Spider("foo")

    for case in deepcopy(test_requests):
        original = case["original"]
        response = Response(original.url)
        processed = middleware.process_request(original, spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=1,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        assert logstr == "Crawled (200) %s (referer: None)" % str(original)

        # spider_error
        result = logformatter.spider_error(Failure(Exception("exc")),
                                           processed, response, spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=1,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        assert logstr == "Spider error processing %s (referer: None)" % str(
            original)

        # download_error
        result = logformatter.download_error(Failure(Exception("exc")),
                                             processed, spider, "foo")
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=2,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        assert logstr == "Error downloading %s: foo" % str(original)

# Example #8
def test_process_request_default_args():
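    # CRAWLERA_FETCH_DEFAULT_ARGS entries should be merged into the JSON
    # payload sent to the Fetch API.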
    middleware = get_test_middleware(
        settings={"CRAWLERA_FETCH_DEFAULT_ARGS": {"foo": "bar", "answer": "42"}}
    )

    for case in get_test_requests():
        original = case["original"]
        processed = middleware.process_request(original, foo_spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            processed_text = processed.body.decode(processed.encoding)
            processed_json = json.loads(processed_text)
            assert processed_json["foo"] == "bar"
            assert processed_json["answer"] == "42"

# Example #9
def test_process_response_skip():
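    # A request marked crawlera_fetch={"skip": True} must get its response
    # back untouched.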
    response = TextResponse(
        url="https://example.org",
        status=200,
        headers={
            "Content-Encoding": "gzip",
            "Transfer-Encoding": "chunked",
            "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
        },
        request=Request(url="https://example.org",
                        meta={"crawlera_fetch": {
                            "skip": True
                        }}),
        body=b"""<html></html>""",
    )

    middleware = get_test_middleware()
    processed = middleware.process_response(response.request, response,
                                            Spider("foo"))

    assert response is processed
def test_process_request():
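    # Non-skipped requests are rewritten into Fetch API calls; compare the
    # processed request field by field against the expected one.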
    middleware = get_test_middleware()

    for case in get_test_requests():
        original = case["original"]
        expected = case["expected"]

        with shub_jobkey_env_variable():
            processed = middleware.process_request(original, foo_spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            assert type(processed) is type(expected)
            assert processed.url == expected.url
            assert processed.method == expected.method
            assert processed.headers == expected.headers
            assert processed.meta == expected.meta
            processed_text = processed.body.decode(processed.encoding)
            expected_text = expected.body.decode(expected.encoding)
            assert json.loads(processed_text) == json.loads(expected_text)
def test_process_response():
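    # The middleware decodes the API response back into the original
    # response and keeps the raw upstream payload under
    # meta["crawlera_fetch"]["upstream_response"].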
    middleware = get_test_middleware()

    for case in test_responses:
        original = case["original"]
        expected = case["expected"]

        processed = middleware.process_response(original.request, original,
                                                foo_spider)

        assert type(processed) is type(expected)
        assert processed.url == expected.url
        assert processed.status == expected.status
        assert processed.headers == expected.headers
        assert processed.body == expected.body

        crawlera_meta = processed.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("upstream_response"):
            assert crawlera_meta["upstream_response"]["body"] == json.loads(
                original.text)
            assert crawlera_meta["upstream_response"][
                "headers"] == original.headers
            assert crawlera_meta["upstream_response"][
                "status"] == original.status
def test_process_response_error():
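    # Three upstream failure modes: an X-Crawlera-Error header, a body that
    # is not valid JSON, and a JSON body reporting crawlera_status "fail".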
    response_list = [
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request":
                        request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            headers={
                "X-Crawlera-Error": "bad_proxy_auth",
                "Proxy-Authenticate": 'Basic realm="Crawlera"',
                "Content-Length": "0",
                "Date": "Mon, 04 May 2020 13:06:15 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
        ),
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request":
                        request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=b'{"Bad": "JSON',
        ),
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request":
                        request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=json.dumps({
                "url": "https://example.org",
                "original_status": 503,
                "headers": {},
                "crawlera_status": "fail",
                "crawlera_error": "serverbusy",
                "body_encoding": "plain",
                "body": "Server busy: too many outstanding requests",
            }),
            encoding="utf8",
        ),
    ]

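    # With CRAWLERA_FETCH_RAISE_ON_ERROR enabled the middleware raises
    # CrawleraFetchException; when disabled it logs a warning and returns
    # the response unchanged.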
    middleware_raise = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for response in response_list:
        with pytest.raises(CrawleraFetchException):
            middleware_raise.process_response(response.request, response,
                                              foo_spider)

    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error") == 3
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/serverbusy") == 1

    middleware_log = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as logs:
        for response in response_list:
            processed = middleware_log.process_response(
                response.request, response, foo_spider)
            assert response is processed

    logs.check_present(
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)",  # noqa: E501
        ),
    )

    assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/serverbusy") == 1

# Example #13
def test_process_response_error():
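    # Variant of the previous test covering only the X-Crawlera-Error header
    # and invalid-JSON cases (here logged at ERROR level).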
    response_list = [
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request": {
                            "url": "https://example.org",
                            "method": "GET"
                        },
                    }
                },
            ),
            headers={
                "X-Crawlera-Error": "bad_proxy_auth",
                "Proxy-Authenticate": 'Basic realm="Crawlera"',
                "Content-Length": "0",
                "Date": "Mon, 04 May 2020 13:06:15 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
        ),
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request": {
                            "url": "https://example.org",
                            "method": "GET"
                        },
                    }
                },
            ),
            body=b'{"Bad": "JSON',
        ),
    ]

    middleware_raise = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for response in response_list:
        with pytest.raises(CrawleraFetchException):
            middleware_raise.process_response(response.request, response,
                                              Spider("foo"))

    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error") == 2
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1

    middleware_log = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as logs:
        for response in response_list:
            processed = middleware_log.process_response(
                response.request, response, Spider("foo"))
            assert response is processed

    logs.check_present(
        (
            "crawlera-fetch-middleware",
            "ERROR",
            "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "ERROR",
            "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        ),
    )

    assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 2
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1