Пример #1
0
    def test_should_raise_error_if_parse_function_raises_error_and_ignore_errors_is_false(self):
        """Check that an error raised by the parse callback escapes ``_handle_url``.

        The spider is built with ``ignore_errors=False`` and a callback that
        always raises ``ValueError``; the test expects that error, with its
        message intact, to come out of ``_handle_url``.
        """
        target_url = 'http://foo.com'
        expected_message = 'just a test'

        def parse(*_):
            raise ValueError(expected_message)

        # Declare a respx route for the URL the spider is asked to handle.
        # NOTE(review): respx mocking is presumably activated by a decorator or
        # fixture outside this block - confirm.
        respx.get(target_url)
        crawler = StaticSpider(urls=[target_url], parse=parse, ignore_errors=False)

        with pytest.raises(ValueError) as raised:
            crawler._handle_url(target_url)

        assert str(raised.value) == expected_message
Пример #2
0
    def test_should_not_raise_error_if_parse_function_raises_error_and_ignore_errors_is_true(self):
        """Check that ``_handle_url`` does not leak a ``ValueError`` from the parse callback.

        The spider is built with ``ignore_errors=True`` and a callback that
        always raises ``ValueError``; the test fails explicitly if that error
        reaches the caller of ``_handle_url``.
        """
        target_url = 'http://foo.com'

        def parse(*_):
            raise ValueError('just a test')

        # Declare a respx route for the URL the spider is asked to handle.
        respx.get(target_url)
        crawler = StaticSpider(urls=[target_url], parse=parse, ignore_errors=True)

        try:
            crawler._handle_url(target_url)
        except ValueError:
            failure_reason = 'unexpected ValueError raised when ignore_errors is set to true'
            pytest.fail(failure_reason)
Пример #3
0
    def test_should_not_called_parse_method_if_httpx_response_is_an_error_one(self, mocker, status_code):
        """Check that the parse callback is skipped when the mocked route answers ``status_code``.

        The test also expects an info-level log call carrying the url and the
        status code.
        NOTE(review): ``status_code`` is presumably supplied by a parametrize
        decorator that is not visible in this block - confirm.
        """
        url = 'http://foo.com'
        received = []

        def parse(spider, response):
            # Record every argument so the test can prove the callback never ran.
            received.append(spider)
            received.append(response)

        respx.get(url) % status_code
        info_mock = mocker.patch('logging.Logger.info')
        crawler = StaticSpider(urls=[url], parse=parse)
        crawler._handle_url(url)

        assert received == []
        expected_log_call = ('fetching url %s returns an error with status code %s', url, status_code)
        info_mock.assert_any_call(*expected_log_call)
Пример #4
0
    def test_should_not_called_parse_method_when_file_cannot_be_opened(self, tmp_path, mocker):
        """Check the handling of a ``file://`` url whose file this test never creates.

        Expected outcome: the parse callback is not invoked, the failure is
        reported through ``Logger.exception``, and the url is recorded in
        ``unreachable_urls`` while ``reachable_urls`` stays empty.
        """
        exception_mock = mocker.patch('logging.Logger.exception')
        # Only the path is built; nothing is written to it in this test.
        missing_file = tmp_path / 'hello.txt'
        file_url = missing_file.resolve().as_uri()
        received = []

        def parse(spider, response):
            received.append(spider)
            received.append(response)

        crawler = StaticSpider(urls=[file_url], parse=parse)
        crawler._handle_url(file_url)

        assert received == []
        exception_mock.assert_any_call('unable to open file %s', file_url)
        assert crawler.unreachable_urls == {file_url}
        assert crawler.reachable_urls == set()
Пример #5
0
    def test_should_not_called_parse_method_if_url_is_forbidden_by_robots_txt(self, mocker):
        """Check that the parse callback is skipped when the robots.txt route answers 401.

        The spider is configured with ``follow_robots_txt=True``. The test
        expects the url to land in ``robots_excluded_urls`` and an info-level
        log call that mentions the url.
        """
        url = 'http://foo.com'
        robots_url = f'{url}/robots.txt'
        received = []

        def parse(spider, response):
            received.append(spider)
            received.append(response)

        # The robots.txt route is the only one declared here; it replies with 401.
        respx.get(robots_url) % 401
        info_mock = mocker.patch('logging.Logger.info')
        robots_aware_config = Configuration(follow_robots_txt=True)
        crawler = StaticSpider(urls=[url], parse=parse, config=robots_aware_config)
        crawler._handle_url(url)

        assert received == []
        assert crawler.robots_excluded_urls == {url}
        expected_message = 'robots.txt rule has forbidden the processing of url %s or the url is not reachable'
        info_mock.assert_any_call(expected_message, url)
Пример #6
0
    def test_should_read_file_content_when_giving_a_file_url(self, tmp_path):
        """Check that a ``file://`` url is read and handed to the parse callback.

        A temporary file containing ``hello world`` is created; the test expects
        the callback to receive the spider itself and a ``StaticResponse`` that
        carries the file url and text, has no httpx response, and expects the
        url to be recorded in ``reachable_urls``.
        """
        source_file = tmp_path / 'hello.txt'
        source_file.write_text('hello world')
        file_url = source_file.resolve().as_uri()
        received = []

        def parse(spider, response):
            received.append(spider)
            received.append(response)

        crawler = StaticSpider(urls=[file_url], parse=parse)
        crawler._handle_url(file_url)

        # The callback arguments were captured in call order: spider, then response.
        assert received[0] is crawler
        captured_response = received[1]
        assert isinstance(captured_response, StaticResponse)
        assert captured_response._url == file_url
        assert captured_response._text == 'hello world'
        assert captured_response._httpx_response is None
        assert crawler.reachable_urls == {file_url}
Пример #7
0
    def test_should_fetch_content_when_giving_http_url(self):
        """Check that an http url is fetched and its response handed to the parse callback.

        The mocked route answers 200 with the body ``http content``. The test
        expects the callback to receive the spider itself and a
        ``StaticResponse`` whose ``_url`` and ``_text`` are empty strings while
        its ``_httpx_response`` carries the status code and body. It also
        expects one counted request and a positive total fetch time.
        """
        url = 'http://foo.com'
        received = []

        def parse(spider, response):
            received.append(spider)
            received.append(response)

        mocked_reply = dict(status_code=200, text='http content')
        respx.get(url) % mocked_reply
        crawler = StaticSpider(urls=[url], parse=parse)
        crawler._handle_url(url)

        # The callback arguments were captured in call order: spider, then response.
        assert received[0] is crawler
        captured_response = received[1]
        assert isinstance(captured_response, StaticResponse)
        assert captured_response._url == ''
        assert captured_response._text == ''
        http_response = captured_response._httpx_response
        assert http_response.status_code == 200
        assert http_response.text == 'http content'
        assert crawler.request_counter == 1
        assert crawler._total_fetch_time > 0