async def test_should_return_false_when_requesting_forbidden_url(
    self, httpx_mock, tmp_path, robots_content, url_path
):
    # robots_content and url_path are presumably supplied by a
    # @pytest.mark.parametrize decorator not shown in this section.
    analyzer = RobotsAnalyzer(user_agent='Googlebot', robots_cache=tmp_path)
    httpx_mock.get('/robots.txt') % {'text': robots_content}

    assert await analyzer.can_fetch(f'http://example.com/{url_path}/1') is False
async def test_should_call_http_client_aclose_method(self, tmp_path):
    http_client_mock = mock.AsyncMock()
    analyzer = RobotsAnalyzer(
        robots_cache=tmp_path,
        user_agent='Mozilla/5.0',
        http_client=http_client_mock,
    )
    await analyzer.close()

    http_client_mock.aclose.assert_awaited_once()
def test_should_correctly_instantiate_class_without_giving_http_client(self, tmp_path):
    analyzer = RobotsAnalyzer(user_agent='Mozilla/5.0', robots_cache=tmp_path)

    assert 'Mozilla/5.0' == analyzer._user_agent
    assert tmp_path == analyzer._robots_cache
    assert isinstance(analyzer._http_client, httpx.AsyncClient)
    assert 'Mozilla/5.0' == analyzer._http_client.headers['User-Agent']
    assert isinstance(analyzer._robots_parser, RobotFileParser)
    assert_dicts(analyzer._robots_mapping, {})
    assert_dicts(analyzer._delay_mapping, {})
async def test_should_call_can_fetch_only_one_time(self, mocker, tmp_path):
    url = 'http://example.com/page/1'
    can_fetch_mock = mocker.patch(
        'scalpel.any_io.robots.RobotsAnalyzer.can_fetch', new=mock.AsyncMock()
    )
    can_fetch_mock.return_value = False
    analyzer = RobotsAnalyzer(robots_cache=tmp_path, user_agent='Mozilla/5.0')

    assert -1 == await analyzer.get_request_delay(url, 0)
    assert -1 == await analyzer.get_request_delay(url, 0)
    can_fetch_mock.assert_awaited_once_with(url)
def test_should_correctly_instantiate_class_with_http_client_passed_as_argument(self, tmp_path):
    http_client = httpx.AsyncClient(headers={'User-Agent': 'python-httpx'})
    analyzer = RobotsAnalyzer(
        user_agent='Mozilla/5.0', robots_cache=tmp_path, http_client=http_client
    )

    assert 'Mozilla/5.0' == analyzer._user_agent
    assert tmp_path == analyzer._robots_cache
    assert isinstance(analyzer._http_client, httpx.AsyncClient)
    assert 'python-httpx' == analyzer._http_client.headers['User-Agent']
    assert isinstance(analyzer._robots_parser, RobotFileParser)
    assert_dicts(analyzer._robots_mapping, {})
    assert_dicts(analyzer._delay_mapping, {})
async def test_should_return_delay_if_it_is_in_internal_delay_mapping(self, mocker, tmp_path):
    crawl_delay_mock = mocker.patch('urllib.robotparser.RobotFileParser.crawl_delay')
    can_fetch_mock = mocker.patch('scalpel.any_io.robots.RobotsAnalyzer.can_fetch')
    delay = 2
    analyzer = RobotsAnalyzer(robots_cache=tmp_path, user_agent='Mozilla/5.0')
    analyzer._delay_mapping['example.com'] = delay

    assert await analyzer.get_request_delay('http://example.com/page/1', 0) == delay
    can_fetch_mock.assert_not_called()
    crawl_delay_mock.assert_not_called()
@pytest.fixture
def anyio_analyzer(tmp_path):
    return RobotsAnalyzer(user_agent='Mozilla/5.0', robots_cache=tmp_path)