def test_should_read_file_content_when_giving_a_file_url(self, tmp_path):
    """A file:// url is read from disk and handed to the parse callback as a SeleniumResponse."""
    captured = []
    text_file = tmp_path / 'hello.txt'
    text_file.write_text('Hello world!')
    file_url = text_file.resolve().as_uri()

    def parse(sel_spider, response):
        captured.extend([sel_spider, response])

    config = Configuration(selenium_driver_log_file=None)
    spider = SeleniumSpider(urls=[file_url], parse=parse, config=config)
    spider._handle_url(file_url)

    # the callback received the spider itself and a SeleniumResponse
    assert captured[0] is spider
    sel_response = captured[1]
    assert isinstance(sel_response, SeleniumResponse)
    assert '<body><pre>Hello world!</pre></body>' in sel_response.driver.page_source

    # bookkeeping: the url was reachable, counted once, and fetch time accrued
    assert {file_url} == spider.reachable_urls
    assert set() == spider.unreachable_urls
    assert 1 == spider.request_counter
    assert spider._total_fetch_time > 0

    # cleanup
    spider._cleanup()
def test_should_fetch_content_when_giving_http_url(self, mocker):
    """An http url is fetched through the selenium driver and the parse callback
    receives a SeleniumResponse bound to that driver and window handle.
    """
    parse_args = []
    url = 'http://foo.com'

    def parse(sel_spider, response):
        parse_args.extend([sel_spider, response])

    # Use the respx "%" operator to set the mocked status code, consistently
    # with the other tests in this class — the old status_code= keyword was
    # removed from respx and would be interpreted as a request filter.
    respx.get(f'{url}/robots.txt') % 404
    mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get')
    mocker.patch(
        'selenium.webdriver.remote.webdriver.WebDriver.current_window_handle', 'handle')
    config = Configuration(follow_robots_txt=True, selenium_driver_log_file=None)
    spider = SeleniumSpider(urls=[url], parse=parse, config=config)
    spider._handle_url(url)

    # the callback received the spider and a SeleniumResponse tied to its driver
    assert parse_args[0] is spider
    selenium_response = parse_args[1]
    assert isinstance(selenium_response, SeleniumResponse)
    assert selenium_response.driver is spider._driver
    assert 'handle' == selenium_response.handle

    # bookkeeping: url reachable, counted once, fetch time accrued
    assert {url} == spider.reachable_urls
    assert set() == spider.unreachable_urls
    assert 1 == spider.request_counter
    assert spider._total_fetch_time > 0

    # cleanup
    spider._cleanup()
def test_should_do_nothing_if_url_is_already_present_in_one_url_set(
        self, mocker, reachable_urls, unreachable_urls, robots_excluded_urls
):
    """A url already recorded in any of the spider's url sets is skipped with a debug log."""
    target_url = 'http://foo.com'
    debug_mock = mocker.patch('logging.Logger.debug')

    spider = SeleniumSpider(
        urls=['http://bar.com'],
        parse=lambda x, y: None,
        config=Configuration(selenium_driver_log_file=None),
    )
    # pre-seed the bookkeeping sets from the parametrized fixtures
    spider.reachable_urls = reachable_urls
    spider.unreachable_urls = unreachable_urls
    spider.robots_excluded_urls = robots_excluded_urls

    spider._handle_url(target_url)

    debug_mock.assert_any_call('url %s has already been processed', target_url)

    # cleanup
    spider._cleanup()
def test_should_not_raise_error_if_parse_function_raises_error_and_ignore_errors_is_true(self, mocker):
    """With ignore_errors=True, a failing parse callback must not propagate its exception."""
    url = 'http://foo.com'

    def failing_parse(*_):
        raise ValueError('simple error')

    mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get')
    spider = SeleniumSpider(
        urls=[url],
        parse=failing_parse,
        config=Configuration(selenium_driver_log_file=None),
        ignore_errors=True,
    )
    try:
        spider._handle_url(url)
    except ValueError:
        pytest.fail('unexpected ValueError raised when ignore_errors is set to true')

    # cleanup
    spider._cleanup()
def test_should_raise_error_if_parse_function_raises_error_and_ignore_errors_is_false(self, mocker):
    """With ignore_errors=False, the parse callback's exception propagates unchanged."""
    url = 'http://foo.com'

    def failing_parse(*_):
        raise ValueError('simple error')

    mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get')
    spider = SeleniumSpider(
        urls=[url],
        parse=failing_parse,
        config=Configuration(selenium_driver_log_file=None),
        ignore_errors=False,
    )

    with pytest.raises(ValueError) as exc_info:
        spider._handle_url(url)
    assert 'simple error' == str(exc_info.value)

    # cleanup
    spider._cleanup()
def test_should_not_called_parse_method_if_url_is_not_accessible(self, mocker):
    """When the driver raises, parse is never called and the url is marked unreachable."""
    captured = []
    url = 'http://foo.com'

    def parse(sel_spider, response):
        captured.extend([sel_spider, response])

    respx.get(f'{url}/robots.txt') % 404
    # simulate a fetch failure inside the selenium driver
    mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get', side_effect=WebDriverException)
    config = Configuration(follow_robots_txt=True, selenium_driver_log_file=None)
    spider = SeleniumSpider(urls=[url], parse=parse, config=config)
    spider._handle_url(url)

    assert [] == captured
    assert {url} == spider.unreachable_urls
    assert set() == spider.reachable_urls
    # nothing was fetched, so neither counter nor timer moved
    assert 0 == spider.request_counter == spider._total_fetch_time

    # cleanup
    spider._cleanup()
def test_should_not_call_parse_method_when_file_cannot_be_opened(self, mocker, tmp_path):
    """A file:// url pointing to a missing file is logged and marked unreachable; parse is skipped."""
    exception_mock = mocker.patch('logging.Logger.exception')
    # build a url to a file that is deliberately never created
    missing_file = tmp_path / 'hello.txt'
    file_url = missing_file.resolve().as_uri()
    captured = []

    def parse(sel_spider, response):
        captured.extend([sel_spider, response])

    config = Configuration(selenium_driver_log_file=None)
    spider = SeleniumSpider(urls=[file_url], parse=parse, config=config)
    spider._handle_url(file_url)

    assert [] == captured
    exception_mock.assert_any_call(f'unable to open file {file_url}')
    assert {file_url} == spider.unreachable_urls
    assert set() == spider.reachable_urls
    assert 0 == spider.request_counter == spider._total_fetch_time

    # cleanup
    spider._cleanup()
def test_should_not_called_parse_method_if_url_is_forbidden_by_robots_txt(self, mocker):
    """A url excluded by robots.txt is recorded as such and parse is never invoked."""
    captured = []
    url = 'http://foo.com'

    def parse(sel_spider, response):
        captured.extend([sel_spider, response])

    # a 401 on robots.txt means the whole site is treated as forbidden
    respx.get(f'{url}/robots.txt') % 401
    info_mock = mocker.patch('logging.Logger.info')
    config = Configuration(follow_robots_txt=True, selenium_driver_log_file=None)
    spider = SeleniumSpider(urls=[url], parse=parse, config=config)
    spider._handle_url(url)

    assert [] == captured
    assert {url} == spider.robots_excluded_urls
    info_mock.assert_any_call(
        'robots.txt rule has forbidden the processing of url %s or the url is not reachable', url
    )

    # cleanup
    spider._cleanup()