def is_url_allowed(self, url, syntax=GYM2008):
    """Return True if this ruleset permits fetching the given URL.

    Schemes and host names are not part of the robots.txt protocol,
    so they are ignored here. It is the caller's responsibility to
    make sure they match.
    """
    # Reduce the URL to path + params + query + fragment and decode any
    # percent-escapes before comparing it against the rules.
    _, _, path_part, params, query, fragment = urllib_urlparse(url)
    url = urllib_urlunparse(("", "", path_part, params, query, fragment))
    url = _unquote_path(url)

    # The robots.txt file itself is always retrievable.
    if url == '/robots.txt':
        return True

    verdict = True
    matched = False
    i = 0
    # NOTE: indexes into self.rules directly, so an empty ruleset raises
    # IndexError — preserved from the original implementation.
    while not matched:
        rule_type, rule_path = self.rules[i]

        if (syntax == GYM2008) and ("*" in rule_path or rule_path.endswith("$")):
            # GYM2008-specific syntax applies here
            # http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40360
            if rule_path.endswith("$"):
                suffix = "$"
                rule_path = rule_path[:-1]
            else:
                suffix = ""

            # Multiple wildcard characters mean the same as one wildcard, so
            # they're condensed into one. Otherwise there's a risk of
            # creating a pathological regex.
            # ref: https://bitbucket.org/philip_semanchuk/robotexclusionrulesparser/issues/1
            rule_path = re.sub(r'\*+', '*', rule_path)
            pieces = [re.escape(piece) for piece in rule_path.split("*")]
            if re.match(".*".join(pieces) + suffix, url):
                # Ding!
                matched = True
                verdict = (rule_type == self.ALLOW)
        elif url.startswith(rule_path):
            # Wildcards are either not present or are taken literally.
            # Ding!
            matched = True
            verdict = (rule_type == self.ALLOW)
            # A blank path means "nothing", so that effectively negates
            # the value above. e.g. "Disallow: " means allow everything.
            if not rule_path:
                verdict = not verdict

        i += 1
        if i == len(self.rules):
            matched = True

    return verdict
def is_url_allowed(self, url, syntax=GYM2008):
    """Return True if this ruleset permits fetching the given URL.

    Schemes and host names are not part of the robots.txt protocol,
    so they are ignored here. It is the caller's responsibility to
    make sure they match.
    """
    _, _, path, parameters, query, fragment = urllib_urlparse(url)
    url = urllib_urlunparse(("", "", path, parameters, query, fragment))
    url = _unquote_path(url)

    # FIX: the robots.txt file itself must always be retrievable,
    # otherwise a crawler could never learn the rules in the first place.
    if url == '/robots.txt':
        return True

    allowed = True
    # FIX: iterate instead of indexing — the original raised IndexError
    # on an empty ruleset; an empty ruleset now means "allow everything".
    for rule_type, path in self.rules:
        if (syntax == GYM2008) and ("*" in path or path.endswith("$")):
            # GYM2008-specific syntax applies here
            # http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40360
            if path.endswith("$"):
                appendix = "$"
                path = path[:-1]
            else:
                appendix = ""

            # FIX: multiple wildcard characters mean the same as one, so
            # condense runs of them into a single '*'. Without this, a rule
            # like "/a******b" escapes into ".*.*.*.*.*.*" — a pathological
            # regex with catastrophic backtracking potential.
            # ref: https://bitbucket.org/philip_semanchuk/robotexclusionrulesparser/issues/1
            path = re.sub(r'\*+', '*', path)
            parts = path.split("*")
            pattern = ".*".join([re.escape(p) for p in parts]) + appendix
            if re.match(pattern, url):
                # Ding!
                allowed = (rule_type == self.ALLOW)
                break
        else:
            # Wildcards are either not present or are taken literally.
            if url.startswith(path):
                # Ding!
                allowed = (rule_type == self.ALLOW)
                # A blank path means "nothing", so that effectively
                # negates the value above.
                # e.g. "Disallow: " means allow everything
                if not path:
                    allowed = not allowed
                break

    return allowed