def test_robots_given_lower_path_allowed_url(self):
    # allowed by /search/about after /search is forbidden
    url = "https://google.com/search/about"
    checker = RobotsIndex(True, 'duckduckbot')
    self.assertEqual(checker.size(), 0)
    self.assertFalse(check_link(url, checker))
    self.assertEqual(checker.size(), 1)
    # subsequent checks should reuse the known robots.txt file
    self.assertFalse(check_link(url + '/more', checker))
    self.assertFalse(check_link(url + '/plus', checker))
    self.assertFalse(check_link(url + '/extra', checker))
    self.assertEqual(checker.size(), 1)

def test_robots_given_forbidden_url(self):
    # prohibited explicitly
    url = "https://github.com/search/"
    checker = RobotsIndex(True, 'duckduckbot')
    self.assertTrue(check_link(url, checker))

def test_robots_given_allowed_url(self):
    # allowed explicitly
    url = "https://www.google.com/m/finance"
    checker = RobotsIndex(True, 'duckduckbot')
    self.assertFalse(check_link(url, checker))

def test_robots_given_asterisk_path_allowed_url(self):
    # allowed by /*/*/tree/master
    url = "https://github.com/rivermont/spidy/tree/master"
    checker = RobotsIndex(True, 'duckduckbot')
    self.assertFalse(check_link(url, checker))

def test_check_link_given_short_url(self):
    # too short to be a real link; check_link returns True for rejected URLs
    url = "http://a"
    self.assertTrue(check_link(url))

def test_check_link_given_invalid_url2(self):
    # missing scheme
    url = "github.com"
    self.assertTrue(check_link(url))

def test_check_link_given_invalid_url(self):
    # missing scheme
    url = "www.blah.com"
    self.assertTrue(check_link(url))

def test_check_link_given_valid_url(self):
    # well-formed URL; check_link returns False for accepted URLs
    url = "http://www.github.com"
    self.assertFalse(check_link(url))
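
# Minimal scaffold for running these tests directly, sketched as a comment so it
# does not collide with the methods above. The import path is an assumption
# (spidy keeps check_link and RobotsIndex in its crawler module in the upstream
# repo layout), and the methods above are assumed to belong to this TestCase:
#
#     import unittest
#     from spidy.crawler import check_link, RobotsIndex
#
#     class TestCheckLink(unittest.TestCase):
#         ...  # the test methods above
#
#     if __name__ == '__main__':
#         unittest.main()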