def onStatus(self, version, status, message):
    logger.warn('%s => Status %s' % (self.url, status))
    self.status = int(status)
    if self.status == 401 or self.status == 403:
        # This means we're forbidden
        reppy.parse('''User-agent: *\nDisallow: /''',
            url=self.url, autorefresh=False, ttl=self.ttl)
    elif self.status != 200:
        # This means we're going to act like there wasn't one
        logger.warn('No robots.txt => %s' % self.url)
        reppy.parse('', url=self.url, autorefresh=False, ttl=self.ttl)
def test_set_ttl(self):
    r = reppy.parse('''
        User-agent: *
        Disallow: /hello/''', ttl=-1)
    self.assertTrue(r.expired)
    r = reppy.parse('''
        User-agent: *
        Disallow: /hello/''', ttl=2)
    self.assertTrue(r.remaining > 1)
    self.assertTrue(r.remaining < 2)
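# The ttl argument above drives cache expiry: a negative ttl yields an
# already-expired parse, and `remaining` counts down from ttl in seconds.
# A minimal refresh sketch built on those two properties; the re-parse
# step is hypothetical and stands in for re-downloading robots.txt.
import reppy

ROBOTS = '''
    User-agent: *
    Disallow: /hello/'''

# Parse with a five-minute TTL; r.expired flips once it elapses.
r = reppy.parse(ROBOTS, ttl=300)

if r.expired:
    # Hypothetical refresh: a real crawler would re-fetch the
    # robots.txt body over HTTP before re-parsing it.
    r = reppy.parse(ROBOTS, ttl=300)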
def test_unquoting_forward_slash(self):
    # Escaped entities are unquoted, but not the forward slash (%2f)
    r = reppy.parse('''
        User-agent: *
        Disallow: /a%2fb.html''')
    self.assertTrue(not r.allowed('/a%2fb.html', 't'))
    self.assertTrue(    r.allowed('/a/b.html', 't'))
    r = reppy.parse('''
        User-agent: *
        Disallow: /a/b.html''')
    self.assertTrue(    r.allowed('/a%2fb.html', 't'))
    self.assertTrue(not r.allowed('/a/b.html', 't'))
def test_basic(self):
    # Test beginning matching
    r = reppy.parse('''
        User-agent: *
        Disallow: /tmp''')
    self.assertTrue(not r.allowed('/tmp', 't'))
    self.assertTrue(not r.allowed('/tmp.html', 't'))
    self.assertTrue(not r.allowed('/tmp/a.html', 't'))
    r = reppy.parse('''
        User-agent: *
        Disallow: /tmp/''')
    self.assertTrue(    r.allowed('/tmp', 't'))
    self.assertTrue(not r.allowed('/tmp/', 't'))
    self.assertTrue(not r.allowed('/tmp/a.html', 't'))
def test_crawl_delays(self):
    r = reppy.parse('''# robots.txt for http://www.example.com/
        User-agent: Foobot
        Disallow: *
        Crawl-Delay: 5

        User-agent: Somebot
        Allow: /foo.html
        Crawl-Delay: .3
        Allow: /bar.html
        Disallow: *

        User-agent: AnotherBot
        Disallow: *
        Sitemap: http://www.example.com/sitemap.xml

        User-agent: CamelBot
        Disallow: /foo.html
        Crawl-Delay: go away!''')
    self.assertTrue(not r.allowed('/foo.html', 'Foobot'))
    self.assertTrue(    r.allowed('/foo.html', 'Somebot'))
    self.assertTrue(    r.allowed('/bar.html', 'Somebot'))
    self.assertTrue(not r.allowed('/x.html', 'Somebot'))
    self.assertTrue(not r.allowed('/foo.html', 'AnotherBot'))
    self.assertEqual(r.crawlDelay('Foobot'), 5)
    self.assertEqual(r.crawlDelay('Blahbot'), None)
    self.assertEqual(r.crawlDelay('Somebot'), 0.3)
    self.assertEqual(r.sitemaps, ['http://www.example.com/sitemap.xml'])
    self.assertEqual(r.crawlDelay('CamelBot'), None)
def test_no_gifs_or_jpgs(self):
    r = reppy.parse(
        """
        User-agent: *
        Disallow: /*.gif$
        Disallow: /*.jpg$"""
    )
    ua = "dotbot"
    self.assertTrue(r.allowed("/", ua))
    self.assertTrue(r.allowed("/foo", ua))
    self.assertTrue(r.allowed("/foo.html", ua))
    self.assertTrue(r.allowed("/foo/bar", ua))
    self.assertTrue(r.allowed("/foo/bar.html", ua))
    self.assertTrue(not r.allowed("/test.jpg", ua))
    self.assertTrue(not r.allowed("/foo/test.jpg", ua))
    self.assertTrue(not r.allowed("/foo/bar/test.jpg", ua))
    self.assertTrue(r.allowed("/the-jpg-extension-is-awesome.html", ua))
    self.assertTrue(not r.allowed("/jpg.jpg", ua))
    self.assertTrue(not r.allowed("/foojpg.jpg", ua))
    self.assertTrue(not r.allowed("/bar/foojpg.jpg", ua))
    self.assertTrue(not r.allowed("/.jpg.jpg", ua))
    self.assertTrue(not r.allowed("/.jpg/.jpg", ua))
    self.assertTrue(not r.allowed("/test.gif", ua))
    self.assertTrue(not r.allowed("/foo/test.gif", ua))
    self.assertTrue(not r.allowed("/foo/bar/test.gif", ua))
    self.assertTrue(r.allowed("/the-gif-extension-is-awesome.html", ua))
def test_bad_syntax(self):
    r = reppy.parse(
        """# robots.txt for http://www.example.com/
        # This is nonsense; UA must come first.
        Disallow: /
        User-agent: *

        # With apologies to Dr. Seuss, this syntax won't act as the author expects.
        # It will only match UA strings that contain "onebot twobot greenbot bluebot".
        # To match multiple UAs to a single rule, use multiple "User-agent:" lines.
        User-agent: onebot twobot greenbot bluebot
        Disallow: /

        # Blank lines indicate an end-of-record so the first UA listed here is ignored.
        User-agent: OneTwoFiveThreeSirBot

        # Note from Webmaster: add new user-agents below:
        User-agent: WotBehindTheRabbitBot
        User-agent: ItIsTheRabbitBot
        Disallow: /HolyHandGrenade/"""
    )
    self.assertTrue(r.allowed("/", "onebot"))
    self.assertTrue(r.allowed("/foo/bar.html", "onebot"))
    self.assertTrue(r.allowed("/", "bluebot"))
    self.assertTrue(r.allowed("/foo/bar.html", "bluebot"))
    self.assertTrue(r.allowed("/HolyHandGrenade/Antioch.html", "OneTwoFiveThreeSirBot"))
    self.assertTrue(not r.allowed("/HolyHandGrenade/Antioch.html", "WotBehindTheRabbitBot"))
def test_wildcard(self):
    r = reppy.parse('''
        User-agent: *
        Disallow: /hello/*/are/you''')
    testUrls = ['/hello/', '/hello/how/are/you', '/hi/how/are/you/']
    allowed = ['/hello/', '/hi/how/are/you/']
    self.assertEqual(r.allowed(testUrls, 't'), allowed)
def test_crawl_delay(self):
    r = reppy.parse('''
        User-agent: agent
        Crawl-delay: 5

        User-agent: *
        Crawl-delay: 1''')
    self.assertEqual(r.crawlDelay('agent'), 5)
    self.assertEqual(r.crawlDelay('testing'), 1)
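# The crawlDelay values exercised above are plain seconds: possibly
# fractional, possibly None when unset. A minimal politeness sketch,
# assuming a hypothetical fetch_page helper and 'mybot' agent name;
# only reppy.parse, allowed, and crawlDelay come from the tests above.
import time

import reppy

rules = reppy.parse('''
    User-agent: *
    Crawl-delay: 2''')

def fetch_page(path):
    print('fetching %s' % path)  # stand-in for a real HTTP request

for path in ['/a.html', '/b.html']:
    if rules.allowed(path, 'mybot'):
        fetch_page(path)
    delay = rules.crawlDelay('mybot')
    if delay is not None:
        time.sleep(delay)  # wait between requests to stay polite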
def test_case_insensitivity(self):
    r = reppy.parse('''# robots.txt for http://www.example.com/
        User-agent: Foobot
        Disallow: /''')
    self.assertTrue(not r.allowed('/', 'Foobot'))
    self.assertTrue(not r.allowed('/', 'FOOBOT'))
    self.assertTrue(not r.allowed('/', 'FoOBoT'))
    self.assertTrue(not r.allowed('/', 'foobot'))
def test_batch_queries(self):
    r = reppy.parse('''
        User-agent: *
        Disallow: /a
        Disallow: /b
        Allow: /b/''')
    testUrls = ['/a/hello', '/a/howdy', '/b', '/b/hello']
    allowed = ['/b/hello']
    self.assertEqual(r.allowed(testUrls, 't'), allowed)
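# allowed() is list-in, list-out: given a list of paths it returns the
# fetchable subset, in order, as the two batch tests above rely on.
# A sketch filtering a crawl frontier; the frontier contents and the
# 'mybot' agent name are made up for illustration.
import reppy

rules = reppy.parse('''
    User-agent: *
    Disallow: /private''')

frontier = ['/index.html', '/private/key.txt', '/about.html']
fetchable = rules.allowed(frontier, 'mybot')
assert fetchable == ['/index.html', '/about.html']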
def test_case_insensitivity(self):
    '''Make sure user agent matches are case insensitive'''
    r = reppy.parse('''
        User-agent: agent
        Disallow: /a''')
    self.assertTrue(r.disallowed('/a', 'Agent'))
    self.assertTrue(r.disallowed('/a', 'aGent'))
    self.assertTrue(r.disallowed('/a', 'AGeNt'))
    self.assertTrue(r.disallowed('/a', 'AGENT'))
def test_query(self):
    '''Make sure query strings are matched against Disallow rules'''
    r = reppy.parse('''
        User-agent: agent
        Disallow: /a?howdy''')
    self.assertTrue(    r.allowed('/a', 'agent'))
    self.assertTrue(not r.allowed('/a?howdy', 'agent'))
    self.assertTrue(not r.allowed('/a?howdy#fragment', 'agent'))
    self.assertTrue(    r.allowed('/a?heyall', 'agent'))
def test_relative(self):
    # Basically, we should treat 'hello' like '/hello'
    r = reppy.parse('''
        User-agent: *
        Disallow: hello
        Disallow: *goodbye''')
    ua = 'dotbot'
    self.assertTrue(not r.allowed('/hello', ua))
    self.assertTrue(not r.allowed('/hello/everyone', ua))
def test_unquoting(self):
    # Now test escaping entities
    r = reppy.parse('''
        User-agent: *
        Disallow: /a%3cd.html''')
    self.assertTrue(not r.allowed('/a%3cd.html', 't'))
    self.assertTrue(not r.allowed('/a%3Cd.html', 't'))
    # And case independent
    r = reppy.parse('''
        User-agent: *
        Disallow: /a%3Cd.html''')
    self.assertTrue(not r.allowed('/a%3cd.html', 't'))
    self.assertTrue(not r.allowed('/a%3Cd.html', 't'))
    # And escape the urls themselves
    r = reppy.parse('''
        User-agent: *
        Disallow: /%7ejoe/index.html''')
    self.assertTrue(not r.allowed('/~joe/index.html', 't'))
    self.assertTrue(not r.allowed('/%7ejoe/index.html', 't'))
def test_implicit(self):
    r = reppy.parse('''# robots.txt for http://www.example.com/
        User-agent: *
        Disallow: /

        User-agent: foobot
        Disallow:''')
    self.assertTrue(    r.allowed('/', 'foobot'))
    self.assertTrue(    r.allowed('/bar.html', 'foobot'))
    self.assertTrue(not r.allowed('/', 'SomeOtherBot'))
    self.assertTrue(not r.allowed('/blahblahblah', 'SomeOtherBot'))
def test_no_googlebot_file(self):
    r = reppy.parse(
        """
        User-agent: Googlebot
        Disallow: /no-google/blocked-page.html"""
    )
    ua = "googlebot"
    self.assertTrue(r.allowed("/", ua))
    self.assertTrue(r.allowed("/no-google/someotherfolder", ua))
    self.assertTrue(r.allowed("/no-google/someotherfolder/somefile", ua))
    self.assertTrue(not r.allowed("/no-google/blocked-page.html", ua))
def test_wildcards(self):
    r = reppy.parse(
        """# robots.txt for http://www.example.com/
        User-agent: Rule1TestBot
        Disallow: /foo*

        User-agent: Rule2TestBot
        Disallow: /foo*/bar.html

        # Disallows anything containing the letter m!
        User-agent: Rule3TestBot
        Disallow: *m

        User-agent: Rule4TestBot
        Allow: /foo/bar.html
        Disallow: *

        User-agent: Rule5TestBot
        Disallow: /foo*/*bar.html

        User-agent: Rule6TestBot
        Allow: /foo$
        Disallow: /foo"""
    )
    self.assertTrue(r.allowed("/fo.html", "Rule1TestBot"))
    self.assertTrue(not r.allowed("/foo.html", "Rule1TestBot"))
    self.assertTrue(not r.allowed("/food", "Rule1TestBot"))
    self.assertTrue(not r.allowed("/foo/bar.html", "Rule1TestBot"))
    self.assertTrue(r.allowed("/fo.html", "Rule2TestBot"))
    self.assertTrue(not r.allowed("/foo/bar.html", "Rule2TestBot"))
    self.assertTrue(not r.allowed("/food/bar.html", "Rule2TestBot"))
    self.assertTrue(not r.allowed("/foo/a/b/c/x/y/z/bar.html", "Rule2TestBot"))
    self.assertTrue(r.allowed("/food/xyz.html", "Rule2TestBot"))
    self.assertTrue(not r.allowed("/foo.htm", "Rule3TestBot"))
    self.assertTrue(not r.allowed("/foo.html", "Rule3TestBot"))
    self.assertTrue(r.allowed("/foo", "Rule3TestBot"))
    self.assertTrue(not r.allowed("/foom", "Rule3TestBot"))
    self.assertTrue(not r.allowed("/moo", "Rule3TestBot"))
    self.assertTrue(not r.allowed("/foo/bar.html", "Rule3TestBot"))
    self.assertTrue(r.allowed("/foo/bar.txt", "Rule3TestBot"))
    self.assertTrue(not r.allowed("/fo.html", "Rule4TestBot"))
    self.assertTrue(not r.allowed("/foo.html", "Rule4TestBot"))
    self.assertTrue(not r.allowed("/foo", "Rule4TestBot"))
    self.assertTrue(r.allowed("/foo/bar.html", "Rule4TestBot"))
    self.assertTrue(not r.allowed("/foo/bar.txt", "Rule4TestBot"))
    self.assertTrue(not r.allowed("/foo/bar.html", "Rule5TestBot"))
    self.assertTrue(not r.allowed("/food/rebar.html", "Rule5TestBot"))
    self.assertTrue(r.allowed("/food/rebarf.html", "Rule5TestBot"))
    self.assertTrue(not r.allowed("/foo/a/b/c/rebar.html", "Rule5TestBot"))
    self.assertTrue(not r.allowed("/foo/a/b/c/bar.html", "Rule5TestBot"))
    self.assertTrue(r.allowed("/foo", "Rule6TestBot"))
    self.assertTrue(not r.allowed("/foo/", "Rule6TestBot"))
    self.assertTrue(not r.allowed("/foo/bar.html", "Rule6TestBot"))
    self.assertTrue(not r.allowed("/fooey", "Rule6TestBot"))
def test_disallow_all(self):
    # A bare 'Disallow: /' blocks everything
    r = reppy.parse('''
        User-agent: *
        Disallow: /''')
    ua = 'dotbot'
    self.assertTrue(not r.allowed('/', ua))
    self.assertTrue(not r.allowed('/foo', ua))
    self.assertTrue(not r.allowed('/foo.html', ua))
    self.assertTrue(not r.allowed('/foo/bar', ua))
    self.assertTrue(not r.allowed('/foo/bar.html', ua))
def test_allow_all(self):
    # An empty Disallow allows everything
    r = reppy.parse('''
        User-agent: *
        Disallow: ''')
    ua = 'dotbot'
    self.assertTrue(    r.allowed('/', ua))
    self.assertTrue(    r.allowed('/foo', ua))
    self.assertTrue(    r.allowed('/foo.html', ua))
    self.assertTrue(    r.allowed('/foo/bar', ua))
    self.assertTrue(    r.allowed('/foo/bar.html', ua))
def test_no_googlebot_folder(self):
    r = reppy.parse(
        """
        User-agent: Googlebot
        Disallow: /no-google/"""
    )
    ua = "googlebot"
    self.assertTrue(not r.allowed("/no-google/", ua))
    self.assertTrue(not r.allowed("/no-google/something", ua))
    self.assertTrue(not r.allowed("/no-google/something.html", ua))
    self.assertTrue(r.allowed("/", ua))
    self.assertTrue(r.allowed("/somethingelse", ua))
def test_bom(self):
    utf8_byte_order_mark = chr(0xef) + chr(0xbb) + chr(0xbf)
    r = reppy.parse('''%sUSERAGENT: FOOBOT
        %suser-agent:%s%s%sbarbot%s
        disallow: /foo/
        ''' % (utf8_byte_order_mark, '\t', '\t', '\t', '\t', chr(0xb)))
    self.assertTrue(    r.allowed('/', 'foobot'))
    self.assertTrue(not r.allowed('/foo/bar.html', 'foobot'))
    self.assertTrue(    r.allowed('/foo/bar.html', 'AnotherBot'))
    self.assertTrue(not r.allowed('/foo/bar.html', 'Foobot Version 1.0'))
    self.assertTrue(not r.allowed('/foo/bar.html', 'Mozilla/5.0 (compatible; Foobot/2.1)'))
    self.assertTrue(not r.allowed('/foo/bar.html', 'barbot'))
    self.assertTrue(    r.allowed('/tmp/', 'barbot'))
def test_disallowed(self):
    '''Make sure disallowed is the opposite of allowed'''
    r = reppy.parse('''
        User-agent: *
        Disallow: /0xa
        Disallow: /0xb
        Disallow: /0xc
        Disallow: /0xd
        Disallow: /0xe
        Disallow: /0xf''')
    for i in range(1000):
        u = hex(random.randint(0, 16))
        self.assertNotEqual(r.allowed(u, 't'), r.disallowed(u, 't'))
def test_utf8(self):
    s = '''# robots.txt for http://www.example.com/
        UserAgent: Jävla-Foobot
        Disallow: /

        UserAgent: \u041b\u044c\u0432\u0456\u0432-bot
        Disallow: /totalitarianism/
        '''.decode('utf-8')
    r = reppy.parse(s)
    ua = 'jävla-fanbot'
    self.assertTrue(    r.allowed('/foo/bar.html', ua))
    self.assertTrue(not r.allowed('/foo/bar.html', ua.replace('fan', 'foo')))
    self.assertTrue(    r.allowed('/', 'foobot'))
    self.assertTrue(    r.allowed('/',
        'Mozilla/5.0 (compatible; \u041b\u044c\u0432\u0456\u0432-bot/1.1)'))
    self.assertTrue(not r.allowed('/totalitarianism/foo.html',
        'Mozilla/5.0 (compatible; \u041b\u044c\u0432\u0456\u0432-bot/1.1)'))
def test_rogerbot_only(self):
    r = reppy.parse(
        """
        User-agent: *
        Disallow: /no-bots/block-all-bots-except-rogerbot-page.html

        User-agent: rogerbot
        Allow: /no-bots/block-all-bots-except-rogerbot-page.html"""
    )
    ua = "notroger"
    self.assertTrue(not r.allowed("/no-bots/block-all-bots-except-rogerbot-page.html", ua))
    self.assertTrue(r.allowed("/", ua))
    ua = "rogerbot"
    self.assertTrue(r.allowed("/no-bots/block-all-bots-except-rogerbot-page.html", ua))
    self.assertTrue(r.allowed("/", ua))
def onError(self, *args, **kwargs):
    # On a failed fetch, cache an empty (allow-all) robots.txt
    reppy.parse('', url=self.url, autorefresh=False, ttl=self.ttl)
def onSuccess(self, text, fetcher):
    # Cache the fetched robots.txt body as-is
    reppy.parse(text, url=self.url, autorefresh=False, ttl=self.ttl)
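# Taken together, the callbacks above implement one policy: a 200 body is
# parsed as-is, 401/403 is treated as disallow-all, and any other status
# or transport error is treated as allow-all. A standalone sketch of that
# policy; the rules_for_response helper and 'mybot' agent are hypothetical,
# while the reppy.parse cache arguments mirror the callbacks above.
import reppy

def rules_for_response(url, status, body, ttl=3600):
    if status in (401, 403):
        # Forbidden: act as though everything is disallowed
        return reppy.parse('User-agent: *\nDisallow: /',
            url=url, autorefresh=False, ttl=ttl)
    if status != 200:
        # No usable robots.txt: act as though everything is allowed
        return reppy.parse('', url=url, autorefresh=False, ttl=ttl)
    return reppy.parse(body, url=url, autorefresh=False, ttl=ttl)

rules = rules_for_response('http://example.com/robots.txt', 403, '')
assert not rules.allowed('/anything', 'mybot')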