class ReppyWrapper(python_common.web.robots_txt.parser_base.RobotsTxtParser):
    """Adapter that exposes a reppy ``Rules`` parser behind the project's
    RobotsTxtParser interface.

    If no robots.txt content is supplied, every query falls back to the
    base-class implementation.
    """

    def __init__(self, content=None, expires=None):
        super(ReppyWrapper, self).__init__(content, expires)
        # Build a reppy parser only when robots.txt text was provided.
        if content:
            self.parser = Rules('robots.txt', 200, content, self.expires)
        else:
            self.parser = None
        # Cache a bound super-proxy for the delegating methods below.
        self.my_super = super(ReppyWrapper, self)

    def allowed(self, user_agent, url):
        """Return whether `user_agent` may fetch `url`.

        NOTE: reppy's Rules.allowed takes (url, agent) — the argument order
        is swapped relative to this wrapper's own signature.
        """
        if self.parser:
            return self.parser.allowed(url, user_agent)
        return self.my_super.allowed(user_agent, url)

    def delay(self, user_agent):
        """Crawl-delay for `user_agent`, from reppy when a parser exists."""
        if self.parser:
            return self.parser.delay(user_agent)
        return self.my_super.delay(user_agent)

    @property
    def expired(self):
        """Whether the cached robots.txt data has expired."""
        if self.parser:
            return self.parser.expired
        return self.my_super.expired

    @property
    def sitemaps(self):
        """Sitemap URLs advertised by the robots.txt."""
        if self.parser:
            return self.parser.sitemaps
        return self.my_super.sitemaps
def __init__(self, content=None, expires=None):
    """Initialize the wrapper, building a reppy Rules parser when content
    is given; otherwise leave ``self.parser`` as None so callers fall back
    to the base class."""
    super(ReppyWrapper, self).__init__(content, expires)
    self.parser = (Rules('robots.txt', 200, content, self.expires)
                   if content else None)
    # Bound super-proxy reused by the delegating methods.
    self.my_super = super(ReppyWrapper, self)
def test_status_forbidden_allow(self):
    '''Test that if the flag is given, we allow all sites when robots.txt
    is forbidden'''
    rules = Rules('http://example.com/robots.txt', 401, '', 0,
                  disallow_forbidden=False)
    # Both relative and absolute URLs should be permitted.
    for url in ('/foo', 'http://example.com/foo'):
        self.assertTrue(rules.allowed(url, 't'))
def test_status_allowed(self):
    '''If no robots.txt exists, we're given free range'''
    rules = Rules('http://example.com/robots.txt', 404, '', 0)
    # A 404 means no restrictions: every URL form is allowed.
    for url in ('/foo', 'http://example.com/foo'):
        self.assertTrue(rules.allowed(url, 't'))
def test_status_disallowed(self):
    '''Make sure that when we get a disallowed status, that we believe
    we're not allowed to crawl a site'''
    rules = Rules('http://example.com/robots.txt', 401, '', 0)
    # 401 without the allow flag means nothing may be crawled.
    for url in ('/foo', 'http://example.com/foo'):
        self.assertFalse(rules.allowed(url, 't'))
from __future__ import print_function from contextlib import contextmanager import time from reppy.cache import RobotsCache from reppy.parser import Rules content = ''' User-agent: '*' Allow: / ''' cache = RobotsCache() cache.add(Rules('http://example.com/', 200, content, float('inf'))) @contextmanager def timer(count): '''Time this block.''' start = time.time() try: yield count finally: duration = time.time() - start print('Total: %s' % duration) print(' Avg: %s' % (duration / count)) print(' Rate: %s' % (count / duration))
def test_status_forbidden(self):
    '''Make sure that when we get a forbidden status, that we believe
    we're not allowed to crawl a site'''
    rules = Rules('http://example.com/robots.txt', 401, '', 0)
    # Forbidden status blocks both relative and absolute URL forms.
    for url in ('/foo', 'http://example.com/foo'):
        self.assertFalse(rules.allowed(url, 't'))
def parse(strng):
    '''Helper to parse a string as a Rules object'''
    # Status 200 with a zero TTL: only the parsed rules matter here.
    rules = Rules('http://example.com/robots.txt', 200, strng, 0)
    return rules
def _parse_robots(self, response):
    # A lot of work to provide the expire time which we don't actually use
    ttl = max(self.min_ttl,
              Utility.get_ttl(response.headers, self.default_ttl))
    expires = time.time() + ttl
    rules = Rules(response.url, response.status, response.body, expires)
    rules.parse(response.body)
    # Cache the parser by network location for later lookups.
    self._parsers[urlparse_cached(response).netloc] = rules