def allowed_url(self):
    # FIXME: Should use the geturl address as it may have been redirected
    scheme, netloc, path, query, fragment = urlsplit(self.url)
    robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])
    # FIXME: Should cache robots.txt in a better persistent data structure
    if robot_url in ROBOT_CACHE:
        rp = ROBOT_CACHE[robot_url]
    else:
        rp = RobotExclusionRulesParser()
        try:
            rp.fetch(robot_url)
        # Currently if there's a problem we assume there is no robots.txt
        except IOError:  # Should be catching the urllib2.URLError exception
            logging.debug("Couldn't retrieve robots.txt for %s" % robot_url)
            rp = None
        except UnicodeDecodeError:
            logging.debug("Unicode decode error for robots.txt at %s" % robot_url)
            rp = None
        except httplib.HTTPException:
            logging.debug("Generic HTTPException for robots.txt at %s" % robot_url)
            rp = None
        ROBOT_CACHE[robot_url] = rp

    if rp is None or rp.is_allowed("*", self.url):
        base_url = urlunsplit([scheme, netloc, "", "", ""])
        # If there's a current delay on the site respect robots.txt and stall
        if self.db.exists(netloc):
            logging.debug("Obeying robot overlord for %s..." % netloc)
            URLHandler.add_to_busy(self.db, self.url)
            return False
        # Set a delay for any other requests to this site to respect robots.txt
        delay = rp.get_crawl_delay("*") if rp else None
        if delay:
            delay = int(math.ceil(float(delay)))
        else:
            delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
        self.db.setex(netloc, "1", delay)
        return True
    else:
        return False
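The method above depends on module state from its crawler (ROBOT_CACHE, SETTINGS, self.db, URLHandler). A minimal standalone sketch of the same fetch-and-cache idea, using only the RobotExclusionRulesParser API; the in-memory cache, user agent, and default delay are illustrative choices, not part of the original crawler:

from urlparse import urlsplit, urlunsplit  # urllib.parse on Python 3
from robotexclusionrulesparser import RobotExclusionRulesParser

_robot_cache = {}  # robots.txt URL -> parser, or None when the fetch failed

def can_fetch(url, user_agent="*", default_delay=5):
    # Sketch only: treat an unreadable robots.txt as "allow", like the code above.
    scheme, netloc, _, _, _ = urlsplit(url)
    robots_url = urlunsplit([scheme, netloc, "/robots.txt", "", ""])
    if robots_url not in _robot_cache:
        parser = RobotExclusionRulesParser()
        try:
            parser.fetch(robots_url)
        except Exception:
            parser = None
        _robot_cache[robots_url] = parser
    parser = _robot_cache[robots_url]
    if parser is None:
        return True, default_delay
    delay = parser.get_crawl_delay(user_agent) or default_delay
    return parser.is_allowed(user_agent, url), delay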
class RerpWrapper(python_common.web.robots_txt.parser_base.RobotsTxtParser):

    def __init__(self, content=None, expires=None):
        super(RerpWrapper, self).__init__(content, expires)
        if content:
            self.parser = RobotExclusionRulesParser()
            self.parser.use_local_time = False
            self.parser.expiration_date = self.expires
            self.parser.parse(content)
        else:
            self.parser = None
        self.my_super = super(RerpWrapper, self)

    def allowed(self, user_agent, url):
        return (self.parser.is_allowed(user_agent, url)
                if self.parser else self.my_super.allowed(user_agent, url))

    def delay(self, user_agent):
        return (self.parser.get_crawl_delay(user_agent)
                if self.parser else self.my_super.delay(user_agent))

    @property
    def expired(self):
        return self.parser.is_expired if self.parser else self.my_super.expired

    @property
    def sitemaps(self):
        return self.parser.sitemaps if self.parser else self.my_super.sitemaps
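An illustrative way to exercise the wrapper, assuming the parser_base.RobotsTxtParser base class accepts a raw robots.txt string plus an expiry timestamp and exposes it as self.expires (inferred from the constructor above, not from that module's documentation); the robots.txt content and user agent are placeholders:

import time

robots_txt = (
    "User-agent: *\n"
    "Disallow: /private/\n"
    "Crawl-delay: 5\n"
)
# Hypothetical usage; expires is assumed to be a Unix timestamp.
wrapper = RerpWrapper(robots_txt, expires=time.time() + 3600)
print(wrapper.allowed("mybot", "http://example.com/private/page"))  # False
print(wrapper.allowed("mybot", "http://example.com/public/page"))   # True
print(wrapper.delay("mybot"))                                       # 5.0
print(wrapper.expired)                                              # False while the expiry is in the future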
class Robot:

    def __init__(self, url):
        self.url = Url(urljoin(url, '/robots.txt'))
        self.rerp = RobotExclusionRulesParser()
        self.rerp.user_agent = 'Mozilla/5.0'
        self.rerp.fetch(self.url.url())

    def throttle_time(self):
        return self.rerp.get_crawl_delay('Mozilla/5.0')

    def should_block(self, url):
        return not self.rerp.is_allowed('Mozilla/5.0', url.url())
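A hypothetical caller for this class, assuming the surrounding project's Url type wraps an absolute URL and returns it from .url() (inferred from the constructor above); example.com is a placeholder:

robot = Robot('http://example.com/some/page.html')

page = Url('http://example.com/members/profile')
if robot.should_block(page):
    print("Disallowed by robots.txt, skipping %s" % page.url())
else:
    # get_crawl_delay() returns None when robots.txt sets no Crawl-delay
    delay = robot.throttle_time() or 0
    print("Fetching %s after a %s second pause" % (page.url(), delay))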