def decode_request(request: Request) -> httpx.Request:
    """
    Build a httpx Request from httpcore request args.
    """
    method, url, headers, stream = request
    return httpx.Request(method, url, headers=headers, stream=stream)
def null_request(method='GET', url="http://www.example.org/", **kwargs):
    return httpx.Request(method, url, **kwargs)
def build_request(self, method, url):
    headers = http.DEFAULT_CLIENT_OPTIONS["headers"].copy()
    headers["Authorization"] = f"token {self._token}"
    return httpx.Request(method, url, headers=headers)
def test_data_pattern(lookup, data, expected):
    request = httpx.Request("POST", "https://foo.bar/", data=data)
    match = Data(data, lookup=lookup).match(request)
    assert bool(match) is expected
def test_method_pattern(lookup, value, expected):
    request = httpx.Request("GET", "https://foo.bar/")
    assert bool(Method(value, lookup=lookup).match(request)) is expected
def test_port_pattern(lookup, port, url, expected):
    request = httpx.Request("GET", url)
    assert bool(Port(port, lookup=lookup).match(request)) is expected
def test_url_pattern(lookup, value, context, url, expected):
    request = httpx.Request("GET", url)
    match = URL(value, lookup=lookup).match(request)
    assert bool(match) is expected
    assert match.context == context
def test_content_length_header():
    request = httpx.Request("POST", "http://example.org", data=b"test 123")
    assert request.headers["Content-Length"] == "8"
def test_json_encoded_data():
    request = httpx.Request("POST", "http://example.org", json={"test": 123})
    request.read()

    assert request.headers["Content-Type"] == "application/json"
    assert request.content == b'{"test": 123}'
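# A hedged companion sketch to the JSON test above (not from the original
# suite): the same read-then-assert pattern for form data, which httpx
# encodes as application/x-www-form-urlencoded.
def test_urlencoded_data():
    request = httpx.Request("POST", "http://example.org", data={"test": "123"})
    request.read()

    assert request.headers["Content-Type"] == "application/x-www-form-urlencoded"
    assert request.content == b"test=123"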
async def test_deprecated_apis():
    with respx.mock:
        url = "https://foo.bar/"

        # Response kwargs among request kwargs
        with warnings.catch_warnings(record=True) as w:
            respx.get(url, status_code=201)
            respx.get(url, headers={})
            respx.get(url, content_type="foo/bar")
            respx.get(url, content="")
            respx.get(url, text="")
            respx.get(url, html="")
            respx.get(url, json={})
            respx.get(url, pass_through=True)
        assert len(w) == 8

        # Add route by http method string
        with warnings.catch_warnings(record=True) as w:
            respx.add("GET", url)
        assert len(w) == 1

        # Alias and aliases
        with warnings.catch_warnings(record=True) as w:
            request_pattern = respx.get(url, alias="index")
            name = request_pattern.alias
            aliased_pattern = respx.mock.aliases["index"]
            assert aliased_pattern is request_pattern
            assert name == request_pattern.name
        assert len(w) == 3

        # RequestPattern
        with warnings.catch_warnings(record=True) as w:
            callback = lambda req, res: res  # pragma: nocover
            request_pattern = RequestPattern(callback)
            assert request_pattern.has_side_effect
            request_pattern = RequestPattern("GET", "https://foo.bar/", pass_through=True)
            assert request_pattern.is_pass_through
        assert len(w) == 2

        # ResponseTemplate
        with warnings.catch_warnings(record=True) as w:
            request = httpx.Request("GET", "https://foo.bar/")
            callback = lambda request: ResponseTemplate(201)
            request_pattern = RequestPattern(callback, response=ResponseTemplate(444))
            assert request_pattern.resolve(request).status_code == 201
            request_pattern = RequestPattern("GET", response=ResponseTemplate(444))
            assert request_pattern.resolve(request).status_code == 444
        assert len(w) == 5

        # Mixing callback and response details
        with pytest.raises(NotImplementedError):
            callback = lambda request: ResponseTemplate(201)  # pragma: nocover
            respx.Router().add(callback, status_code=201)

        # Async callback
        with pytest.raises(NotImplementedError):
            async def callback(request):
                return None  # pragma: nocover

            mock_response = MockResponse(content=callback)
            request = httpx.Request("GET", "http://foo.bar/")
            mock_response.as_response(request)
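# A hedged sketch of the current respx style that the deprecated calls above
# warn about (based on respx's documented API; details may vary by version):
# response details are attached via the `%` operator, and routes are named
# with `name=` instead of `alias=`.
def test_current_api_sketch():
    with respx.mock:
        route = respx.get("https://foo.bar/", name="index") % 201
        response = httpx.get("https://foo.bar/")
        assert response.status_code == 201
        assert route.called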
def test_no_content():
    request = httpx.Request("GET", "http://example.org")
    assert "Content-Length" not in request.headers
def test_request_user_agent_pipeline():
    pl = pls.RequestUserAgentPipeline(user_agent="ant")
    req = httpx.Request("GET", "https://www.hi.com")

    assert pl.process(req) is req
    assert req.headers["User-Agent"] == "ant"
def test_request_duplicate_filter_pipeline():
    pl = pls.RequestDuplicateFilterPipeline()
    req = httpx.Request("GET", "http://test.com")

    assert pl.process(req) is req
    with pytest.raises(Dropped):
        pl.process(req)
async def test_pipeline():
    pl = pls.Pipeline()
    pl.process(httpx.Request("GET", "https://test.com"))
def test_scheme_pattern(lookup, scheme, expected):
    request = httpx.Request("GET", "https://foo.bar/")
    assert bool(Scheme(scheme, lookup=lookup).match(request)) is expected
def test_request_repr():
    request = httpx.Request("GET", "http://example.org")
    assert repr(request) == "<Request('GET', 'http://example.org')>"
def test_host_pattern(lookup, host, expected):
    request = httpx.Request("GET", "https://foo.bar/")
    assert bool(Host(host, lookup=lookup).match(request)) is expected
async def test_cannot_access_content_without_read():
    # Accessing `request.content` before `request.read()` has been called
    # should raise `RequestNotRead`.
    request = httpx.Request("POST", "http://example.org", json={"test": 123})
    with pytest.raises(httpx.RequestNotRead):
        request.content
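# A hedged companion sketch (not from the original suite): once read() has
# been called, the encoded body is available via request.content.
async def test_can_access_content_after_read():
    request = httpx.Request("POST", "http://example.org", json={"test": 123})
    request.read()
    assert request.content == b'{"test": 123}'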
def test_params_pattern(lookup, params, url, expected):
    request = httpx.Request("GET", url)
    assert bool(Params(params, lookup=lookup).match(request)) is expected
def test_override_host_header():
    headers = {"host": "1.2.3.4:80"}
    request = httpx.Request("GET", "http://example.org", headers=headers)
    assert request.headers["Host"] == "1.2.3.4:80"
def test_content_pattern(lookup, content, expected):
    request = httpx.Request("POST", "https://foo.bar/", content=b"foobar")
    match = Content(content, lookup=lookup).match(request)
    assert bool(match) is expected
def test_override_accept_encoding_header():
    headers = {"Accept-Encoding": "identity"}
    request = httpx.Request("GET", "http://example.org", headers=headers)
    assert request.headers["Accept-Encoding"] == "identity"
def test_json_pattern(lookup, value, json, expected):
    request = httpx.Request("POST", "https://foo.bar/", json=json)
    match = JSON(value, lookup=lookup).match(request)
    assert bool(match) is expected
def build_request(self, method: str, url: str) -> httpx.Request:
    headers = http.DEFAULT_HEADERS.copy()
    headers["Authorization"] = f"token {self._token}"
    return httpx.Request(method, url, headers=headers)
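# A hedged usage sketch for build_request above; the method name, the
# `client` parameter, and the GitHub-style endpoint URL are assumptions
# for illustration only.
def get_authenticated_user(self, client: httpx.Client) -> httpx.Response:
    request = self.build_request("GET", "https://api.github.com/user")
    return client.send(request)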
def test_headers_pattern(lookup, headers, request_headers, expected):
    request = httpx.Request(
        "GET", "http://foo.bar/", headers=request_headers, json={"foo": "bar"}
    )
    assert bool(Headers(headers, lookup=lookup).match(request)) is expected
import zlib

import brotli
import pytest

import httpx
from httpx.content_streams import AsyncIteratorStream
from httpx.decoders import (
    BrotliDecoder,
    DeflateDecoder,
    GZipDecoder,
    IdentityDecoder,
    LineDecoder,
    TextDecoder,
)

REQUEST = httpx.Request("GET", "https://example.org")


def test_deflate():
    body = b"test 123"
    compressor = zlib.compressobj(9, zlib.DEFLATED, -zlib.MAX_WBITS)
    compressed_body = compressor.compress(body) + compressor.flush()

    headers = [(b"Content-Encoding", b"deflate")]
    response = httpx.Response(
        200, headers=headers, content=compressed_body, request=REQUEST
    )
    assert response.content == body
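# A hedged gzip companion to test_deflate above, following the same pattern;
# wbits=zlib.MAX_WBITS | 16 tells zlib to emit a gzip wrapper. This is a
# sketch, not part of the original snippet.
def test_gzip():
    body = b"test 123"
    compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
    compressed_body = compressor.compress(body) + compressor.flush()

    headers = [(b"Content-Encoding", b"gzip")]
    response = httpx.Response(
        200, headers=headers, content=compressed_body, request=REQUEST
    )
    assert response.content == body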
class TestStaticSpider:
    """Tests class StaticSpider"""

    async def test_specific_static_attributes_are_correctly_instantiated(self):
        config = Configuration(user_agent='mozilla/5.0')
        spider = StaticSpider(urls=['http://foo.com'], parse=lambda x, y: None, config=config)

        assert config == spider._config
        assert isinstance(spider._lock, anyio.Lock)
        assert isinstance(spider._queue, Queue)
        assert isinstance(spider._start_time, float)
        assert isinstance(spider._http_client, httpx.AsyncClient)
        assert isinstance(spider._robots_analyser, RobotsAnalyzer)

    # _fetch tests
    @respx.mock
    async def test_fetch_method_returns_httpx_response(self, anyio_spider):
        url = 'http://foo.com'
        respx.get('http://foo.com') % {'text': 'content'}
        response = await anyio_spider._fetch(url)

        assert 'content' == response.text
        assert 200 == response.status_code
        assert url == f'{response.url}'

    @respx.mock
    async def test_middlewares_are_applied_when_fetching_resources(self, capsys):
        def log_middleware(fetch):
            async def wrapper(*args, **kwargs):
                print('before fetching')
                return await fetch(*args, **kwargs)

            print('after fetching')
            return wrapper

        url = 'http://foo.com'
        respx.get(url)
        config = Configuration(response_middlewares=[log_middleware])
        spider = StaticSpider(urls=[url], parse=lambda x, y: None, config=config)
        response = await spider._fetch(url)

        assert 200 == response.status_code
        out, _ = capsys.readouterr()
        assert 'before fetching' in out
        assert 'after fetching' in out

    # _get_static_response test
    @pytest.mark.parametrize(
        ('url', 'text', 'httpx_response'),
        [
            ('file:///home/kevin/page.html', 'hello world', None),
            ('', '', httpx.Response(200, request=httpx.Request('GET', 'http://foo.com'))),
        ],
    )
    async def test_should_return_static_response_when_giving_correct_input(
            self, url, text, httpx_response, anyio_spider):
        static_response = anyio_spider._get_static_response(url, text, httpx_response)

        assert isinstance(static_response, StaticResponse)
        assert static_response._reachable_urls is anyio_spider.reachable_urls
        assert static_response._followed_urls is anyio_spider.followed_urls
        assert static_response._queue is anyio_spider._queue
        assert url == static_response._url
        assert text == static_response._text
        assert static_response._httpx_response is httpx_response

    # _handle_url tests
    @pytest.mark.parametrize(
        ('reachable_urls', 'unreachable_urls', 'robots_excluded_urls'),
        [
            (set(), set(), {'http://foo.com'}),
            (set(), {'http://foo.com'}, set()),
            ({'http://foo.com'}, set(), set()),
        ],
    )
    async def test_should_do_nothing_if_url_is_already_present_in_one_url_set(
            self, mocker, anyio_spider, reachable_urls, unreachable_urls, robots_excluded_urls):
        url = 'http://foo.com'
        logger_mock = mocker.patch('logging.Logger.debug')
        anyio_spider.reachable_urls = reachable_urls
        anyio_spider.unreachable_urls = unreachable_urls
        anyio_spider.robots_excluded_urls = robots_excluded_urls
        await anyio_spider._handle_url(url)

        logger_mock.assert_any_call('url %s has already been processed', url)

    async def test_should_read_file_content_when_giving_a_file_url(self, tmp_path):
        parse_args = []
        hello_file = tmp_path / 'hello.txt'
        hello_file.write_text('hello world')
        file_url = hello_file.resolve().as_uri()

        async def parse(spider, response):
            parse_args.extend([spider, response])

        static_spider = StaticSpider(urls=[file_url], parse=parse)
        await static_spider._handle_url(file_url)

        assert parse_args[0] is static_spider
        static_response = parse_args[1]
        assert isinstance(static_response, StaticResponse)
        assert file_url == static_response._url
        assert 'hello world' == static_response._text
        assert static_response._httpx_response is None
        assert {file_url} == static_spider.reachable_urls

    async def test_should_not_called_parse_method_when_file_cannot_be_opened(self, tmp_path, mocker):
        logger_mock = mocker.patch('logging.Logger.exception')
        hello_file = tmp_path / 'hello.txt'
        file_url = hello_file.resolve().as_uri()
        parse_args = []

        async def parse(spider, response):
            parse_args.extend([spider, response])

        static_spider = StaticSpider(urls=[file_url], parse=parse)
        await static_spider._handle_url(file_url)

        assert [] == parse_args
        logger_mock.assert_any_call('unable to open file %s', file_url)
        assert {file_url} == static_spider.unreachable_urls

    @respx.mock
    @pytest.mark.parametrize('status_code', [404, 500])
    async def test_should_not_called_parse_method_if_httpx_response_is_an_error_one(self, mocker, status_code):
        parse_args = []
        url = 'http://foo.com'

        def parse(spider, response):
            parse_args.extend([spider, response])

        respx.get(url) % status_code
        logger_mock = mocker.patch('logging.Logger.info')
        static_spider = StaticSpider(urls=[url], parse=parse)
        await static_spider._handle_url(url)

        assert [] == parse_args
        logger_mock.assert_any_call('fetching url %s returns an error with status code %s', url, status_code)

    # for an unknown reason the asyncio timer on Windows does not work correctly, which makes
    # static_spider._total_fetch_time equal to 0.0 and therefore makes the test fail;
    # this is why the test is only run on the trio backend
    @respx.mock
    @pytest.mark.parametrize('anyio_backend', ['trio'])
    async def test_should_fetch_content_when_giving_http_url(self, anyio_backend):
        parse_args = []
        url = 'http://foo.com'

        async def parse(spider, response):
            parse_args.extend([spider, response])

        respx.get(url) % {'text': 'http content'}
        static_spider = StaticSpider(urls=[url], parse=parse)
        await static_spider._handle_url(url)

        assert parse_args[0] is static_spider
        static_response = parse_args[1]
        assert isinstance(static_response, StaticResponse)
        assert '' == static_response._url
        assert '' == static_response._text
        assert 200 == static_response._httpx_response.status_code
        assert 'http content' == static_response._httpx_response.text
        assert 1 == static_spider.request_counter
        assert static_spider._total_fetch_time > 0

    @respx.mock
    async def test_should_raise_errors_if_parse_function_raises_error_and_ignore_errors_is_false(self):
        async def parse(*_):
            raise ValueError('just a test')

        url = 'http://foo.com'
        respx.get(url)
        static_spider = StaticSpider(urls=[url], parse=parse, ignore_errors=False)
        with pytest.raises(ValueError) as exc_info:
            await static_spider._handle_url(url)

        assert 'just a test' == str(exc_info.value)

    @respx.mock
    async def test_should_not_raise_error_if_parse_function_raises_error_and_ignore_errors_is_true(self):
        async def parse(*_):
            raise ValueError('just a test')

        url = 'http://foo.com'
        respx.get(url)
        static_spider = StaticSpider(urls=[url], parse=parse, ignore_errors=True)
        try:
            await static_spider._handle_url(url)
        except ValueError:
            pytest.fail('ValueError was raised and it should not happen')

    # save_item tests
    async def test_should_call_item_processors_and_reject_item_if_one_processor_returns_none(self, capsys, mocker):
        logger_mock = mocker.patch('logging.Logger.debug')
        data = {'banana': True}

        def processor_1(item):
            print("I'm a processor")
            return item

        async def processor_2(item):
            await anyio.sleep(0)
            if 'banana' in item:
                return
            return item

        config = Configuration(item_processors=[processor_1, processor_2])
        static_spider = StaticSpider(urls=['http://foo.com'], parse=lambda x, y: None, config=config)
        await static_spider.save_item(data)

        logger_mock.assert_any_call('item %s was rejected', data)
        out, _ = capsys.readouterr()
        assert "I'm a processor" in out

    async def test_should_save_content_to_backup_file(self, tmp_path, capsys):
        def processor(item):
            print("I'm a processor")
            return item

        backup = tmp_path / 'backup.mp'
        fruit_1 = {'fruit': 'pineapple'}
        fruit_2 = {'fruit': 'orange'}
        config = Configuration(backup_filename=f'{backup.resolve()}', item_processors=[processor])
        static_spider = StaticSpider(urls=['https://foo.com'], parse=lambda x, y: None, config=config)
        await static_spider.save_item(fruit_1)
        await static_spider.save_item(fruit_2)

        out, _ = capsys.readouterr()
        assert [fruit_1, fruit_2] == [item async for item in read_mp(f'{backup.resolve()}')]
        assert "I'm a processor" in out

    # _get_request_delay tests
    @respx.mock
    @pytest.mark.parametrize(('robots_content', 'value'), [('Disallow: /', -1), ('Crawl-delay: 2', 2)])
    async def test_should_return_robots_txt_value_when_follow_robots_txt_is_true(self, robots_content, value):
        url = 'http://foo.com'
        respx.get(f'{url}/robots.txt') % {'text': f'User-agent:*\n{robots_content}'}
        static_spider = StaticSpider(
            urls=[url], parse=lambda x, y: None, config=Configuration(follow_robots_txt=True))

        assert value == await static_spider._get_request_delay(url)

    @respx.mock
    async def test_should_return_config_delay_when_follow_robots_txt_is_false(self):
        url = 'http://foo.com'
        request = respx.get(f'{url}/robots.txt') % {'text': 'User-agent:*\nDisallow: '}
        config = Configuration(min_request_delay=3, max_request_delay=3)
        static_spider = StaticSpider(urls=[url], parse=lambda x, y: None, config=config)

        assert not request.called
        assert 3 == await static_spider._get_request_delay(url)

    # simple test of run and statistics methods, more reliable tests are below

    # it is very weird but when I try to combine this test with the next one (to have only one test) I get
    # a strange error related to mocking. This is why I have two tests, one to check exclusion works and another
    # to check the rest of the logic
    @respx.mock
    async def test_should_exclude_url_when_robots_txt_excludes_it(self):
        url = 'http://foo.com'
        respx.get(f'{url}/robots.txt') % 401

        async def parse(*_) -> None:
            pass

        static_spider = StaticSpider(urls=[url], parse=parse, config=Configuration(follow_robots_txt=True))
        await static_spider.run()

        assert static_spider.reachable_urls == set()
        assert static_spider.robots_excluded_urls == {url}

    # trio is the only backend checked for the same reason explained above for the test
    # test_should_fetch_content_when_giving_http_url
    @respx.mock
    @pytest.mark.parametrize('anyio_backend', ['trio'])
    async def test_should_return_correct_statistics_after_running_spider(self, anyio_backend):
        url1 = 'http://foo.com'
        respx.get(url1, path='/')
        respx.get(f'{url1}', path='/robots.txt') % 404

        async def parse(*_) -> None:
            pass

        static_spider = StaticSpider(urls=[url1], parse=parse, config=Configuration(follow_robots_txt=True))
        await static_spider.run()
        stats = static_spider.statistics()

        assert stats.reachable_urls == {url1}
        assert stats.unreachable_urls == set()
        assert stats.followed_urls == set()
        assert stats.robot_excluded_urls == set()
        assert stats.request_counter == 1
        assert stats.total_time > 0
        assert stats.average_fetch_time > 0
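# The tests above rely on an `anyio_spider` fixture that is not part of this
# snippet. A minimal hypothetical sketch of what it could look like (the URL,
# parse callback, and async fixture style are assumptions):
@pytest.fixture
async def anyio_spider():
    return StaticSpider(urls=['http://foo.com'], parse=lambda x, y: None)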
def test_cookies_pattern(lookup, cookies, request_cookies, expected):
    request = httpx.Request(
        "GET", "http://foo.bar/", cookies=request_cookies, json={"foo": "bar"}
    )
    assert bool(Cookies(cookies, lookup=lookup).match(request)) is expected
def build_github_app_request(self, method, url, force=False):
    headers = http.DEFAULT_CLIENT_OPTIONS["headers"].copy()
    headers["Authorization"] = f"Bearer {github_app.get_or_create_jwt(force)}"
    return httpx.Request(method, url, headers=headers)
def request_base():
    # Build the request explicitly, then hand it to a client for sending.
    request = httpx.Request("GET", "https://example.com")
    with httpx.Client() as client:
        response = client.send(request)
    return response
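# A hedged variant of request_base above (a sketch, not from the original):
# passing stream=True to client.send defers reading the body until it is
# explicitly consumed, which suits large downloads.
def request_streamed():
    request = httpx.Request("GET", "https://example.com")
    with httpx.Client() as client:
        response = client.send(request, stream=True)
        try:
            body = response.read()
        finally:
            response.close()
    return body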