Пример #1
0
    def test_should_raise_error_when_file_is_not_path_or_string(
            self, env_file):
        """Check that `load_from_dotenv` raises a TypeError reporting the type of `env_file`."""
        expected_message = f'env file must be of type Path or str but you provided {type(env_file)}'

        with pytest.raises(TypeError) as caught:
            Configuration.load_from_dotenv(env_file)

        assert expected_message == str(caught.value)
Пример #2
0
    def test_should_raise_error_when_file_has_not_correct_type(
            self, test_file):
        """Check that `_check_file` raises a TypeError reporting the type of `test_file`."""
        expected_message = f'txt file must be of type Path or str but you provided {type(test_file)}'

        with pytest.raises(TypeError) as caught:
            Configuration._check_file(test_file, 'txt')

        assert expected_message == str(caught.value)
Пример #3
0
    def test_should_raise_error_when_file_is_not_valid_yaml(self, tmp_path):
        """Check that `load_from_yaml` raises DecodeError for a file holding INI/TOML-like section content."""
        content = """
        [scalpel]
        foo = bar
        """
        invalid_yaml = tmp_path / 'foo.yaml'
        invalid_yaml.write_text(content)

        with pytest.raises(DecodeError):
            Configuration.load_from_yaml(invalid_yaml)
Пример #4
0
 def test_should_not_raise_error_when_value_is_none(self):
     """Check that `selenium_driver_log_file=None` is accepted without raising a ValueError."""
     try:
         Configuration(selenium_driver_log_file=None)
     except ValueError:
         # the failure message names the keyword argument that is actually being set
         pytest.fail(
             'unexpected error when setting selenium_driver_log_file with None value'
         )
Пример #5
0
 def test_should_not_raise_error_when_giving_correct_config_argument(
         self, default_spider_arguments):
     """Check that a Spider accepts a Configuration instance as `config` without a TypeError."""
     configuration = Configuration(fetch_timeout=0)
     try:
         Spider(config=configuration, **default_spider_arguments)
     except TypeError as error:
         pytest.fail(f'unexpected error when instantiating spider: {error}')
Пример #6
0
    def test_request_property_is_between_min_and_max_delay(
            self, min_delay, max_delay):
        """Check that `request_delay` stays within `min_request_delay` and `max_request_delay`."""
        bounds = {'min_request_delay': min_delay, 'max_request_delay': max_delay}
        configuration = Configuration(**bounds)

        # request_delay is deliberately read once per assertion
        assert configuration.min_request_delay <= configuration.request_delay
        assert configuration.request_delay <= configuration.max_request_delay
Пример #7
0
    async def test_should_work_with_http_url(self, page_content, tmp_path,
                                             anyio_backend):
        """Run a StaticSpider against mocked http pages and check its statistics and backup file."""
        url = 'http://quotes.com'
        page_numbers = range(2, 4)

        # mocked routes: no robots.txt, a home page and the two pages reachable from it
        respx.get(url, path='/robots.txt') % 404
        respx.get(url, path='/') % {'html': page_content('page1.html')}
        for number in page_numbers:
            page_name = f'page{number}.html'
            respx.get(url, path=f'/{page_name}') % {'html': page_content(page_name)}

        backup_path = tmp_path / 'backup.mp'
        static_spider = StaticSpider(
            urls=[url],
            parse=self.parse,
            config=Configuration(
                item_processors=[self.processor],
                backup_filename=f'{backup_path}',
                follow_robots_txt=True,
            ),
        )
        await static_spider.run()
        stats = static_spider.statistics()
        expected_followed = {f'{url}/page{number}.html' for number in page_numbers}

        assert stats.reachable_urls == {url} | expected_followed
        assert stats.followed_urls == expected_followed
        assert stats.request_counter == 3
        assert stats.average_fetch_time > 0
        await self.common_assert(stats, backup_path)
Пример #8
0
 def tests_should_return_default_backup_filename_when_no_one_is_given(
         self, mocker):
     """Check that the default backup filename embeds the (patched) uuid4 value."""
     fixed_uuid = uuid.UUID('84a49591-c522-4a1c-971c-cf0282c6a759')
     mocker.patch('uuid.uuid4', return_value=fixed_uuid)

     default_filename = Configuration().backup_filename
     assert 'backup-84a49591-c522-4a1c-971c-cf0282c6a759.mp' == default_filename
Пример #9
0
    def test_should_fetch_content_when_giving_http_url(self, mocker):
        """Check that `_handle_url` calls `parse` with the spider and a SeleniumResponse for an http url.

        `WebDriver.get` and `WebDriver.current_window_handle` are patched, and the
        robots.txt route is mocked to answer 404.
        """
        parse_args = []
        url = 'http://foo.com'

        def parse(sel_spider, response):
            # record what the spider passes to the parse callback
            parse_args.extend([sel_spider, response])

        respx.get(f'{url}/robots.txt', status_code=404)
        mocker.patch('selenium.webdriver.remote.webdriver.WebDriver.get')
        mocker.patch(
            'selenium.webdriver.remote.webdriver.WebDriver.current_window_handle',
            'handle')
        config = Configuration(follow_robots_txt=True,
                               selenium_driver_log_file=None)
        spider = SeleniumSpider(urls=[url], parse=parse, config=config)
        try:
            spider._handle_url(url)

            assert parse_args[0] is spider
            selenium_response = parse_args[1]
            assert isinstance(selenium_response, SeleniumResponse)
            assert selenium_response.driver is spider._driver
            assert 'handle' == selenium_response.handle
            assert {url} == spider.reachable_urls
            assert set() == spider.unreachable_urls
            assert 1 == spider.request_counter
            assert spider._total_fetch_time > 0
        finally:
            # cleanup must run even when an assertion fails, otherwise the driver is left open
            spider._cleanup()
Пример #10
0
    def test_should_save_correct_output_when_giving_file_url(
            self, page_1_file_url, tmp_path, browser):
        """Run a SeleniumSpider on a file url and check its statistics and the backup file content."""
        backup_path = tmp_path / 'backup.mp'
        config = Configuration(item_processors=[self.processor],
                               backup_filename=f'{backup_path}',
                               selenium_driver_log_file=None,
                               selenium_browser=browser)
        spider = SeleniumSpider(urls=[page_1_file_url],
                                parse=self.parse,
                                config=config)
        spider.run()
        stats = spider.statistics()
        # NOTE(review): str.replace swaps every '1' in the url, not only the page number -
        # presumably the fixture url holds a single '1'; confirm against the fixture
        followed_urls = {
            page_1_file_url.replace('1', '2'),
            page_1_file_url.replace('1', '3')
        }

        assert followed_urls == stats.followed_urls
        assert {page_1_file_url} | followed_urls == stats.reachable_urls
        assert 3 == stats.request_counter
        assert stats.total_time > 0
        assert stats.average_fetch_time == spider._total_fetch_time / stats.request_counter
        assert set() == stats.unreachable_urls
        assert set() == stats.robot_excluded_urls

        # every saved item must carry a decoded datetime; count the items authored by Albert Einstein
        albert_count = 0
        for item in read_mp(backup_path, decoder=datetime_decoder):
            assert isinstance(item['date'], datetime)
            if item['author'] == 'Albert Einstein':
                albert_count += 1

        assert albert_count == 3
Пример #11
0
    def test_should_read_file_content_when_giving_a_file_url(self, tmp_path):
        """Check that `_handle_url` loads a local file url and passes a SeleniumResponse to `parse`."""
        parse_args = []
        hello_file = tmp_path / 'hello.txt'
        hello_file.write_text('Hello world!')
        file_url = hello_file.resolve().as_uri()

        def parse(sel_spider, response):
            # record what the spider passes to the parse callback
            parse_args.extend([sel_spider, response])

        spider = SeleniumSpider(
            urls=[file_url],
            parse=parse,
            config=Configuration(selenium_driver_log_file=None))
        try:
            spider._handle_url(file_url)

            assert parse_args[0] is spider
            sel_response = parse_args[1]
            assert isinstance(sel_response, SeleniumResponse)
            assert '<body><pre>Hello world!</pre></body>' in sel_response.driver.page_source
            assert {file_url} == spider.reachable_urls
            assert set() == spider.unreachable_urls
            assert 1 == spider.request_counter
            assert spider._total_fetch_time > 0
        finally:
            # cleanup must run even when an assertion fails, otherwise the driver is left open
            spider._cleanup()
Пример #12
0
    def test_should_not_raise_error_when_giving_correct_path(self, tmp_path):
        """Check that `robots_cache_folder=tmp_path` is accepted and leaves no 'dummy_file' behind."""
        try:
            Configuration(robots_cache_folder=tmp_path)
        except (FileNotFoundError, PermissionError) as error:
            pytest.fail(f'unexpected error when instantiating Configuration with robots_cache_folder: {error}')

        leftover = tmp_path / 'dummy_file'
        assert not leftover.exists()
Пример #13
0
 def test_should_not_raise_error_when_value_is_a_compatible_string(
         self, str_browser, browser):
     """Check that the string `str_browser` is accepted and stored as the `browser` object."""
     try:
         configuration = Configuration(selenium_browser=str_browser)
         assert browser is configuration.selenium_browser
     except ValueError:
         pytest.fail(
             'unexpected error when setting selenium browser attribute')
Пример #14
0
    def test_should_convert_string_to_callable_list(self, math_module):
        """Check that dotted-name strings (comma or colon separated) become lists of callables."""
        expected_callables = [math_module.add, math_module.minus]
        configuration = Configuration(
            item_processors='custom_math.add, custom_math.minus',
            response_middlewares='custom_math.add:custom_math.minus',
        )

        assert expected_callables == configuration.response_middlewares
        assert expected_callables == configuration.item_processors
Пример #15
0
 def test_should_not_raise_error_when_value_is_a_browser_enum_member(
         self, value):
     """Check that `value` is accepted as `selenium_browser` and stored as the very same object."""
     try:
         configuration = Configuration(selenium_browser=value)
         assert value is configuration.selenium_browser
     except ValueError:
         pytest.fail(
             'unexpected error when setting selenium browser attribute')
Пример #16
0
 def test_should_not_raise_error_when_msgpack_encoder_or_decoder_is_a_callable(
         self, parameter):
     """Check that Configuration can be built from the keyword arguments held in `parameter`."""
     try:
         Configuration(**parameter)
     except Exception as error:
         # any exception raised by the constructor is reported as a test failure
         pytest.fail(
             f'unexpected error when instantiating msgpack encoder or decoder: {error}'
         )
Пример #17
0
    def test_should_return_correct_config_when_given_correct_toml_file(self, tmp_path):
        """Check that `load_from_toml` reads the scalpel section from both a str and a Path."""
        toml_content = """
        [scalpel]
        foo = "bar"
        user_agent = "Mozilla/5.0"
        fetch_timeout = 4.0
        follow_robots_txt = true
        """
        toml_file = tmp_path / 'settings.toml'
        toml_file.write_text(toml_content)
        expected_config = Configuration(fetch_timeout=4.0, user_agent='Mozilla/5.0', follow_robots_txt=True)

        # the loader is exercised with the string form of the path, then with the Path itself
        for source in (str(toml_file), toml_file):
            loaded = Configuration.load_from_toml(source)
            for attribute in ('fetch_timeout', 'user_agent', 'follow_robots_txt'):
                assert getattr(expected_config, attribute) == getattr(loaded, attribute)
Пример #18
0
    def test_should_return_correct_config_when_given_correct_yaml_file(self, tmp_path):
        """Check that `load_from_yaml` reads the scalpel section from both a str and a Path."""
        yaml_content = """---
        scalpel:
          fetch_timeout: 4.0
          user_agent: Mozilla/5.0
          follow_robots_txt: true
          foo: bar
        """
        yaml_file = tmp_path / 'settings.yml'
        yaml_file.write_text(yaml_content)
        expected_config = Configuration(fetch_timeout=4.0, user_agent='Mozilla/5.0', follow_robots_txt=True)

        # the loader is exercised with the string form of the path, then with the Path itself
        for source in (str(yaml_file), yaml_file):
            loaded = Configuration.load_from_yaml(source)
            for attribute in ('fetch_timeout', 'user_agent', 'follow_robots_txt'):
                assert getattr(expected_config, attribute) == getattr(loaded, attribute)
Пример #19
0
    def test_default_value_is_a_string_when_fake_user_agent_fails(self, mocker):
        """Check that `user_agent` starts with 'Mozilla/5.0' when UserAgent raises FakeUserAgentError."""

        class BrokenUserAgent:
            """Stand-in for UserAgent whose construction always fails."""

            def __init__(self):
                raise FakeUserAgentError

        mocker.patch('scalpel.core.config.UserAgent', new=BrokenUserAgent)

        fallback_user_agent = Configuration().user_agent
        assert fallback_user_agent.startswith('Mozilla/5.0')
Пример #20
0
    async def test_should_return_robots_txt_value_when_follow_robots_txt_is_true(
            self, robots_content, value):
        """Check that `_get_request_delay` returns `value` for the mocked robots.txt content."""
        url = 'http://foo.com'
        robots_body = f'User-agent:*\n{robots_content}'
        respx.get(f'{url}/robots.txt', content=robots_body)
        configuration = Configuration(follow_robots_txt=True)
        static_spider = StaticSpider(urls=[url], parse=lambda x, y: None, config=configuration)

        delay = await static_spider._get_request_delay(url)
        assert value == delay
Пример #21
0
    async def test_should_return_config_delay_when_follow_robots_txt_is_false(
            self):
        """Check that the configured delay is returned and robots.txt is not fetched.

        `follow_robots_txt` is not passed to Configuration here, so the spider is
        expected to use the delay from the configuration (min == max == 3).
        """
        url = 'http://foo.com'
        request = respx.get(f'{url}/robots.txt',
                            content='User-agent:*\nDisallow: ')
        config = Configuration(min_request_delay=3, max_request_delay=3)
        static_spider = StaticSpider(urls=[url],
                                     parse=lambda x, y: None,
                                     config=config)

        assert 3 == await static_spider._get_request_delay(url)
        # checked after the call: asserting it beforehand could never detect an unwanted robots.txt request
        assert not request.called
Пример #22
0
    def test_specific_static_attributes_are_correctly_instantiated(self):
        """Check the types and values of the private attributes set up by StaticSpider."""
        config = Configuration(user_agent='mozilla/5.0')
        spider = StaticSpider(config=config, urls=['http://foo.com'], parse=lambda x, y: None)

        assert isinstance(spider._start_time, float)
        assert isinstance(spider._http_client, httpx.Client)
        assert isinstance(spider._robots_analyser, RobotsAnalyzer)
        assert config == spider._config
        assert isinstance(spider._lock, RLock)

        # the queue holds one entry per url given to the spider
        queue = spider._queue
        assert isinstance(queue, JoinableQueue)
        assert len(spider.urls) == queue.qsize()
        assert isinstance(spider._pool, Pool)
Пример #23
0
    def test_should_instantiate_correctly_driver_attribute(
            self, browser, name):
        """Check that the spider's `driver` is a WebDriver whose name matches `name` for `browser`."""
        config = Configuration(selenium_browser=browser,
                               selenium_driver_log_file=None)
        spider = self.CustomSpider(urls=['http://foo.com'],
                                   parse=lambda x, y: None,
                                   config=config)
        try:
            assert isinstance(spider.driver, WebDriver)
            assert name == spider.driver.name
        finally:
            # quit the driver even when an assertion fails so no browser process is left running
            spider.driver.quit()
Пример #24
0
    async def test_should_exclude_url_when_robots_txt_excludes_it(self):
        """Check that the url ends up in `robots_excluded_urls` when robots.txt answers 401."""
        url = 'http://foo.com'
        respx.get(f'{url}/robots.txt') % 401

        async def parse(*_) -> None:
            """Parse callback that does nothing."""

        configuration = Configuration(follow_robots_txt=True)
        static_spider = StaticSpider(urls=[url], parse=parse, config=configuration)
        await static_spider.run()

        assert static_spider.reachable_urls == set()
        assert static_spider.robots_excluded_urls == {url}
Пример #25
0
    async def test_should_do_nothing_if_url_is_already_present_in_one_url_set(
            self, mocker, reachable_urls, unreachable_urls, robots_excluded_urls
    ):
        """Check that `_handle_url` logs the 'already been processed' debug message for the url."""
        url = 'http://foo.com'
        debug_mock = mocker.patch('logging.Logger.debug')

        spider = SeleniumSpider(
            urls=['http://foo.com'],
            parse=lambda x, y: None,
            config=Configuration(selenium_driver_log_file=None),
        )
        # seed the three url sets with the fixture values before handling the url
        spider.reachable_urls = reachable_urls
        spider.unreachable_urls = unreachable_urls
        spider.robots_excluded_urls = robots_excluded_urls
        await spider._handle_url(url)

        debug_mock.assert_any_call('url %s has already been processed', url)
Пример #26
0
    async def test_should_return_selenium_response_when_giving_correct_input(self, browser, handle):
        """Check that `_get_selenium_response` builds a SeleniumResponse sharing the spider's state."""
        config = Configuration(selenium_driver_log_file=None, selenium_browser=browser)
        spider = SeleniumSpider(urls=['http://foo.com'], parse=lambda x, y: None, config=config)
        try:
            response = spider._get_selenium_response(handle)

            assert isinstance(response, SeleniumResponse)
            assert response.driver is spider._driver
            assert response.handle == handle
            assert response._reachable_urls == spider.reachable_urls
            assert response._followed_urls == spider.followed_urls
            assert response._queue == spider._queue
        finally:
            # cleanup must run even when an assertion fails, otherwise the driver is left open
            await spider._cleanup()
Пример #27
0
    async def test_selenium_attributes_are_correctly_instantiated(self):
        """Check the types and values of the private attributes set up by SeleniumSpider."""
        config = Configuration(selenium_driver_log_file=None)
        spider = SeleniumSpider(urls=['http://foo.com'], parse=lambda x, y: None, config=config)
        try:
            assert isinstance(spider._driver, WebDriver)
            assert isinstance(spider._start_time, float)
            assert isinstance(spider._http_client, httpx.AsyncClient)
            assert isinstance(spider._robots_analyser, RobotsAnalyzer)
            assert config == spider._config
            assert isinstance(spider._lock, trio.Lock)
            assert isinstance(spider._queue, Queue)
        finally:
            # cleanup must run even when an assertion fails, otherwise the driver is left open
            await spider._cleanup()
Пример #28
0
    def test_should_return_correct_config_given_correct_env_file(self, tmp_path, math_module):
        """Check that `load_from_dotenv` reads the SCALPEL_-prefixed entries of a .env file."""
        dotenv_content = """
        FOO = BAR
        SCALPEL_FOLLOW_ROBOTS_TXT = yes
        SCALPEL_FETCH_TIMEOUT = 2.0
        SCALPEL_RESPONSE_MIDDLEWARES = custom_math.add:custom_math.minus
        """
        env_file = tmp_path / '.env'
        env_file.write_text(dotenv_content)

        loaded = Configuration.load_from_dotenv(env_file)
        expected_middlewares = [math_module.add, math_module.minus]

        assert loaded.follow_robots_txt is True
        assert 2.0 == loaded.fetch_timeout
        assert expected_middlewares == loaded.response_middlewares
Пример #29
0
    def test_should_return_non_empty_dict_when_scalpel_attributes_found(self):
        """Check which entries of the 'scalpel' section `_scalpel_attributes` is expected to return."""
        scalpel_section = {
            'min_request_delay': 1,
            'foo': 'bar',
            'USER_AGENT': 'Mozilla/5.0',
            'fruit': 'pineapple',
            '_config': 'foobar',
        }
        data = {'name': 'paul', 'scalpel': scalpel_section}
        # only these two entries are expected back, with the user agent key lower-cased
        expected = {'min_request_delay': 1, 'user_agent': 'Mozilla/5.0'}

        actual = Configuration._scalpel_attributes(data)
        assert_dicts(expected, actual)
Пример #30
0
    def test_should_not_raise_error_when_msgpack_encoder_or_decoder_is_a_string_representing_a_callable(
            self, math_module):
        """Check that dotted-name strings given as msgpack encoder/decoder resolve to the callables."""
        try:
            # these functions do not look like real msgpack encoders or decoders; they are only
            # used to check that the string-to-callable feature works as expected
            configuration = Configuration(msgpack_encoder='custom_math.add',
                                          msgpack_decoder='custom_math.minus')
        except Exception as error:
            pytest.fail(
                f'unexpected error when instantiating msgpack encoder or decoder: {error}'
            )
        else:
            assert math_module.add is configuration.msgpack_encoder
            assert math_module.minus is configuration.msgpack_decoder