Exemplo n.º 1
0
def test_post_request():
    mw = _get_mw()
    for body in [b'', b'foo=bar']:
        req1 = scrapy.Request("http://example.com",
                              method="POST",
                              body=body,
                              meta={'splash': {'endpoint': 'render.html'}})
        req = mw.process_request(req1, None)
        assert json.loads(to_native_str(req.body)) == {
            'url': 'http://example.com',
            'http_method': 'POST',
            'body': to_native_str(body),
        }
Exemplo n.º 2
0
def test_post_request():
    mw = _get_mw()
    for body in [b'', b'foo=bar']:
        req1 = scrapy.Request("http://example.com",
                              method="POST",
                              body=body,
                              meta={'splash': {'endpoint': 'render.html'}})
        req = mw.process_request(req1, None)
        assert json.loads(to_native_str(req.body)) == {
            'url': 'http://example.com',
            'http_method': 'POST',
            'body': to_native_str(body),
        }
Exemplo n.º 3
0
    def __init__(self,
                 url=None,
                 callback=None,
                 method='GET',
                 endpoint='render',
                 args=None,
                 splash_url=None,
                 slot_policy=SlotPolicy.PER_DOMAIN,
                 splash_headers=None,
                 dont_process_response=False,
                 dont_send_headers=False,
                 magic_response=True,
                 session_id='default',
                 http_status_from_error_code=True,
                 cache_args=None,
                 meta=None,
                 **kwargs):

        if url is None:
            url = 'about:blank'
        url = to_native_str(url)

        meta = copy.deepcopy(meta) or {}
        splash_meta = meta.setdefault('splash', {})
        splash_meta.setdefault('endpoint', endpoint)
        splash_meta.setdefault('slot_policy', slot_policy)
        if splash_url is not None:
            splash_meta['splash_url'] = splash_url
        if splash_headers is not None:
            splash_meta['splash_headers'] = splash_headers
        if dont_process_response:
            splash_meta['dont_process_response'] = True
        else:
            splash_meta.setdefault('magic_response', magic_response)
        if dont_send_headers:
            splash_meta['dont_send_headers'] = True
        if http_status_from_error_code:
            splash_meta['http_status_from_error_code'] = True
        if cache_args is not None:
            splash_meta['cache_args'] = cache_args

        if session_id is not None:
            if splash_meta['endpoint'].strip('/') == 'execute':
                splash_meta.setdefault('session_id', session_id)

        _args = {'url': url}  # put URL to args in order to preserve #fragment
        _args.update(args or {})
        _args.update(splash_meta.get('args', {}))
        splash_meta['args'] = _args

        # This is not strictly required, but it strengthens Splash
        # requests against AjaxCrawlMiddleware
        meta['ajax_crawlable'] = True

        super(SplashRequest, self).__init__(url,
                                            callback,
                                            method,
                                            meta=meta,
                                            **kwargs)
Exemplo n.º 4
0
def test_url_with_fragment():
    mw = _get_mw()
    url = "http://example.com#id1"
    req = scrapy.Request("http://example.com", meta={
        'splash': {'args': {'url': url}}
    })
    req = mw.process_request(req, None)
    assert json.loads(to_native_str(req.body)) == {'url': url}
Exemplo n.º 5
0
def test_url_with_fragment():
    mw = _get_mw()
    url = "http://example.com#id1"
    req = scrapy.Request("http://example.com", meta={
        'splash': {'args': {'url': url}}
    })
    req = mw.process_request(req, None)
    assert json.loads(to_native_str(req.body)) == {'url': url}
Exemplo n.º 6
0
    def __init__(self,
                 url=None,
                 callback=None,
                 method='GET',
                 endpoint='render.html',
                 args=None,
                 splash_url=None,
                 slot_policy=SlotPolicy.PER_DOMAIN,
                 splash_headers=None,
                 dont_process_response=False,
                 dont_send_headers=False,
                 magic_response=True,
                 session_id='default',
                 http_status_from_error_code=True,
                 cache_args=None,
                 meta=None,
                 **kwargs):

        if url is None:
            url = 'about:blank'
        url = to_native_str(url)

        meta = meta or {}
        splash_meta = meta.setdefault('splash', {})
        splash_meta.setdefault('endpoint', endpoint)
        splash_meta.setdefault('slot_policy', slot_policy)
        if splash_url is not None:
            splash_meta['splash_url'] = splash_url
        if splash_headers is not None:
            splash_meta['splash_headers'] = splash_headers
        if dont_process_response:
            splash_meta['dont_process_response'] = True
        else:
            splash_meta.setdefault('magic_response', magic_response)
        if dont_send_headers:
            splash_meta['dont_send_headers'] = True
        if http_status_from_error_code:
            splash_meta['http_status_from_error_code'] = True
        if cache_args is not None:
            splash_meta['cache_args'] = cache_args

        if session_id is not None:
            if splash_meta['endpoint'].strip('/') == 'execute':
                splash_meta.setdefault('session_id', session_id)

        _args = {'url': url}  # put URL to args in order to preserve #fragment
        _args.update(args or {})
        _args.update(splash_meta.get('args', {}))
        splash_meta['args'] = _args

        # This is not strictly required, but it strengthens Splash
        # requests against AjaxCrawlMiddleware
        meta['ajax_crawlable'] = True

        super(SplashRequest, self).__init__(url, callback, method, meta=meta,
                                            **kwargs)
Exemplo n.º 7
0
def test_float_wait_arg():
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.html',
            'args': {'wait': 0.5}
        }
    })
    req = mw.process_request(req1, None)
    assert json.loads(to_native_str(req.body)) == {'url': req1.url, 'wait': 0.5}
Exemplo n.º 8
0
def test_float_wait_arg():
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.html',
            'args': {'wait': 0.5}
        }
    })
    req = mw.process_request(req1, None)
    assert json.loads(to_native_str(req.body)) == {'url': req1.url, 'wait': 0.5}
Exemplo n.º 9
0
def test_override_splash_url():
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.png',
            'splash_url': 'http://splash.example.com'
        }
    })
    req = mw.process_request(req1, None)
    assert req.url == 'http://splash.example.com/render.png'
    assert json.loads(to_native_str(req.body)) == {'url': req1.url}
Exemplo n.º 10
0
def test_override_splash_url():
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.png',
            'splash_url': 'http://splash.example.com'
        }
    })
    req = mw.process_request(req1, None)
    assert req.url == 'http://splash.example.com/render.png'
    assert json.loads(to_native_str(req.body)) == {'url': req1.url}
Exemplo n.º 11
0
def test_splash_request_no_url():
    mw = _get_mw()
    lua_source = "function main(splash) return {result='ok'} end"
    req1 = SplashRequest(meta={'splash': {
        'args': {'lua_source': lua_source},
        'endpoint': 'execute',
    }})
    req = mw.process_request(req1, None)
    assert req.url == 'http://127.0.0.1:8050/execute'
    assert json.loads(to_native_str(req.body)) == {
        'url': 'about:blank',
        'lua_source': lua_source
    }
Exemplo n.º 12
0
def test_splash_request_no_url():
    mw = _get_mw()
    lua_source = "function main(splash) return {result='ok'} end"
    req1 = SplashRequest(meta={'splash': {
        'args': {'lua_source': lua_source},
        'endpoint': 'execute',
    }})
    req = mw.process_request(req1, None)
    assert req.url == 'http://127.0.0.1:8050/execute'
    assert json.loads(to_native_str(req.body)) == {
        'url': 'about:blank',
        'lua_source': lua_source
    }
Exemplo n.º 13
0
def test_splash_request():
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(req) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # check request preprocessing
    req2 = cookie_mw.process_request(req, None) or req
    req2 = mw.process_request(req2, None) or req2
    assert req2 is not None
    assert req2 is not req
    assert req2.url == "http://127.0.0.1:8050/render.html"
    assert req2.headers == {b'Content-Type': [b'application/json']}
    assert req2.method == 'POST'
    assert isinstance(req2, SplashRequest)
    assert repr(
        req2
    ) == "<GET http://example.com?foo=bar&url=1&wait=100 via http://127.0.0.1:8050/render.html>"

    expected_body = {'url': req.url}
    assert json.loads(to_native_str(req2.body)) == expected_body

    # check response post-processing
    response = TextResponse(
        "http://127.0.0.1:8050/render.html",
        # Scrapy doesn't pass request to constructor
        # request=req2,
        headers={b'Content-Type': b'text/html'},
        body=b"<html><body>Hello</body></html>")
    response2 = mw.process_response(req2, response, None)
    response2 = cookie_mw.process_response(req2, response2, None)
    assert isinstance(response2, scrapy_splash.SplashTextResponse)
    assert response2 is not response
    assert response2.real_url == req2.url
    assert response2.url == req.url
    assert response2.body == b"<html><body>Hello</body></html>"
    assert response2.css("body").extract_first() == "<body>Hello</body>"
    assert response2.headers == {b'Content-Type': [b'text/html']}

    # check .replace method
    response3 = response2.replace(status=404)
    assert response3.status == 404
    assert isinstance(response3, scrapy_splash.SplashTextResponse)
    for attr in ['url', 'real_url', 'headers', 'body']:
        assert getattr(response3, attr) == getattr(response2, attr)
Exemplo n.º 14
0
def test_splash_request():
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(req) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # check request preprocessing
    req2 = cookie_mw.process_request(req, None) or req
    req2 = mw.process_request(req2, None) or req2
    assert req2 is not None
    assert req2 is not req
    assert req2.url == "http://127.0.0.1:8050/render.html"
    assert req2.headers == {b'Content-Type': [b'application/json']}
    assert req2.method == 'POST'
    assert isinstance(req2, SplashRequest)
    assert repr(req2) == "<GET http://example.com?foo=bar&url=1&wait=100 via http://127.0.0.1:8050/render.html>"

    expected_body = {'url': req.url}
    assert json.loads(to_native_str(req2.body)) == expected_body

    # check response post-processing
    response = TextResponse("http://127.0.0.1:8050/render.html",
                            # Scrapy doesn't pass request to constructor
                            # request=req2,
                            headers={b'Content-Type': b'text/html'},
                            body=b"<html><body>Hello</body></html>")
    response2 = mw.process_response(req2, response, None)
    response2 = cookie_mw.process_response(req2, response2, None)
    assert isinstance(response2, scrapy_splash.SplashTextResponse)
    assert response2 is not response
    assert response2.real_url == req2.url
    assert response2.url == req.url
    assert response2.body == b"<html><body>Hello</body></html>"
    assert response2.css("body").extract_first() == "<body>Hello</body>"
    assert response2.headers == {b'Content-Type': [b'text/html']}

    # check .replace method
    response3 = response2.replace(status=404)
    assert response3.status == 404
    assert isinstance(response3, scrapy_splash.SplashTextResponse)
    for attr in ['url', 'real_url', 'headers', 'body']:
        assert getattr(response3, attr) == getattr(response2, attr)
Exemplo n.º 15
0
def test_splash_request_url_with_fragment():
    mw = _get_mw()
    url = "http://example.com#id1"
    req = SplashRequest(url)
    req = mw.process_request(req, None)
    assert json.loads(to_native_str(req.body)) == {'url': url}
Exemplo n.º 16
0
def test_splash_request_url_with_fragment():
    mw = _get_mw()
    url = "http://example.com#id1"
    req = SplashRequest(url)
    req = mw.process_request(req, None)
    assert json.loads(to_native_str(req.body)) == {'url': url}