import pytest

# Assumed imports: the module path below is a guess at the project layout;
# CrawlRequest, CrawlerConfiguration and CrawlFrontier are the classes under test.
from crawler import CrawlFrontier, CrawlRequest, CrawlerConfiguration

# `request` is shared by several tests in this module; this definition is an
# assumption based on how the tests use it (a plain, default-priority request
# on an allowed domain).
request = CrawlRequest('https://example.com')


def test_get_next_request_should_return_next_request_with_higher_priority() -> None:
    high_priority_request = CrawlRequest('http://test.com', priority=1)
    crawler_configuration = CrawlerConfiguration([request, high_priority_request])
    crawl_frontier = CrawlFrontier(crawler_configuration)

    assert crawl_frontier.get_next_request() is high_priority_request
def test_str_should_return_string_representation() -> None:
    crawler_configuration = CrawlerConfiguration([CrawlRequest('https://example.com')],
                                                 filter_offsite_requests=True,
                                                 allowed_domains=['example.com'])

    assert str(crawler_configuration) == 'CrawlerConfiguration(seed_requests=1 requests, ' \
                                         'filter_duplicate_requests=True, ' \
                                         'filter_offsite_requests=True, ' \
                                         'allowed_domains=1 domains)'
def test_add_request_should_add_allowed_request_to_queue_when_offsite_request_filter_is_enabled() -> None:
    crawler_configuration = CrawlerConfiguration([],
                                                 filter_offsite_requests=True,
                                                 allowed_domains=['example.com'])
    crawl_frontier = CrawlFrontier(crawler_configuration)

    result = crawl_frontier.add_request(request)

    assert result is True
    assert crawl_frontier.get_next_request() is request
def test_add_request_should_add_duplicate_request_to_queue_when_duplicate_request_filter_is_disabled() -> None:
    crawler_configuration = CrawlerConfiguration([request], filter_duplicate_requests=False)
    crawl_frontier = CrawlFrontier(crawler_configuration)
    crawl_frontier.get_next_request()  # Consume the seed request first.

    result = crawl_frontier.add_request(request)

    assert result is True
    assert crawl_frontier.get_next_request() is request
def test_add_request_should_not_add_duplicate_request_to_queue_when_duplicate_request_filter_is_enabled() -> None:
    crawler_configuration = CrawlerConfiguration(
        [CrawlRequest(url='http://example.com/test?abc=def&ghi=jkl#fragment')])
    crawl_frontier = CrawlFrontier(crawler_configuration)
    crawl_frontier.get_next_request()  # Consume the seed request first.

    # Same URL up to query-parameter order and fragment, so the duplicate
    # filter (enabled by default) must reject it.
    result = crawl_frontier.add_request(CrawlRequest(url='http://example.com/test?ghi=jkl&abc=def'))

    assert result is False
    assert crawl_frontier.get_next_request() is None
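# A minimal sketch of the kind of URL normalization the duplicate-request
# filter presumably performs before fingerprinting (fragment dropped, query
# parameters sorted). It is inferred only from the behaviour asserted above;
# the real implementation may differ, and _normalized is a hypothetical helper.
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit


def _normalized(url: str) -> str:
    scheme, netloc, path, query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, urlencode(sorted(parse_qsl(query))), ''))


assert _normalized('http://example.com/test?abc=def&ghi=jkl#fragment') == \
    _normalized('http://example.com/test?ghi=jkl&abc=def')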
def test_allowed_domains_should_return_empty_list_when_no_allowed_domains_specified() -> None:
    crawler_configuration = CrawlerConfiguration([])

    assert crawler_configuration.allowed_domains == []
def test_filter_offsite_requests_should_return_specified_value_when_specified() -> None:
    crawler_configuration = CrawlerConfiguration([], filter_offsite_requests=True)

    assert crawler_configuration.filter_offsite_requests is True
def test_filter_offsite_requests_should_return_default_value_when_not_specified() -> None:
    crawler_configuration = CrawlerConfiguration([])

    assert crawler_configuration.filter_offsite_requests is False
def test_filter_duplicate_requests_should_return_specified_value_when_specified() -> None:
    crawler_configuration = CrawlerConfiguration([], filter_duplicate_requests=False)

    assert crawler_configuration.filter_duplicate_requests is False
def test_seed_requests_should_return_seed_requests() -> None:
    seed_requests = [CrawlRequest('https://example.com')]
    crawler_configuration = CrawlerConfiguration(seed_requests)

    assert crawler_configuration.seed_requests is seed_requests
def test_constructor_should_raise_value_error_when_invalid_domain_in_allowed_domains() -> None:
    with pytest.raises(ValueError) as exc_info:
        CrawlerConfiguration([], allowed_domains=['example.invalid'])

    assert str(exc_info.value) == 'Could not extract a valid domain from example.invalid'
def configure(self) -> CrawlerConfiguration:
    # first_page_url and second_page_url are module-level constants defined
    # elsewhere in the test module.
    return CrawlerConfiguration([
        CrawlRequest(first_page_url, success_func=self.on_first_page_response),
        CrawlRequest(second_page_url)
    ])
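# For context: a minimal sketch of the crawler subclass these configure()
# overrides belong to. The Crawler base-class name, the callback signature
# and the class name ExampleCrawler are assumptions, not confirmed by the
# tests themselves.
class ExampleCrawler(Crawler):
    def configure(self) -> CrawlerConfiguration:
        return CrawlerConfiguration([
            CrawlRequest(first_page_url, success_func=self.on_first_page_response)
        ])

    def on_first_page_response(self, response) -> None:
        ...  # Invoked once the first page has been fetched successfully.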
def configure(self) -> CrawlerConfiguration:
    return CrawlerConfiguration([CrawlRequest(first_page_url), CrawlRequest(second_page_url)])
def configure(self) -> CrawlerConfiguration:
    return CrawlerConfiguration([])
def configure(self) -> CrawlerConfiguration:
    return CrawlerConfiguration([CrawlRequest(request_url, headers={'foo': 'bar'})])
def test_get_next_request_should_return_none_when_queue_is_empty() -> None:
    crawler_configuration = CrawlerConfiguration([])
    crawl_frontier = CrawlFrontier(crawler_configuration)

    assert crawl_frontier.get_next_request() is None
def test_has_next_request_should_return_true_when_queue_is_not_empty() -> None:
    crawler_configuration = CrawlerConfiguration([request])
    crawl_frontier = CrawlFrontier(crawler_configuration)

    assert crawl_frontier.has_next_request() is True
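# A minimal sketch of how a crawler presumably drives CrawlFrontier, using
# only the methods exercised by these tests (has_next_request,
# get_next_request, add_request); _drain itself is a hypothetical helper.
def _drain(crawl_frontier: CrawlFrontier) -> None:
    while crawl_frontier.has_next_request():
        next_request = crawl_frontier.get_next_request()
        # ... fetch next_request here, then feed any links discovered in the
        # response back into the frontier, e.g.:
        # crawl_frontier.add_request(CrawlRequest(discovered_url))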
def test_allowed_domains_should_return_domains_only() -> None:
    crawler_configuration = CrawlerConfiguration([], allowed_domains=['https://www.example.com:80/'])

    assert crawler_configuration.allowed_domains == ['www.example.com']
def configure(self) -> CrawlerConfiguration:
    return CrawlerConfiguration([CrawlRequest(redirect_origin_url)])
def configure(self) -> CrawlerConfiguration:
    return CrawlerConfiguration([CrawlRequest(request_url)])