def canVisit(self, url):
    """Return whether the cached robots.txt rules permit visiting *url*.

    Rules are cached per top domain in ``self.__rulesDict__``. On a cache
    miss the robots.txt location is derived from *url*, downloaded, and
    handed to ``self.parseRobotFile``; a falsy result from the parser
    denies the visit.

    Args:
        url: The URL to check.

    Returns:
        False when robots.txt could not be parsed, or when any disallow
        pattern matches the tested part of the URL; True otherwise.

    Raises:
        KeyError: If ``parseRobotFile`` reports success without storing
            rules for the domain, or the stored rules lack a
            ``'disallow'`` entry.
    """
    topDomain = utils.getTopDomain(url)
    rules = self.__rulesDict__.get(topDomain)
    if rules is None:
        # Cache miss: fetch and parse robots.txt, which is expected to
        # populate self.__rulesDict__[topDomain] on success.
        robotsUrl = utils.robotsTxt(url)
        roboFileBuf = utils.dlAndDecode(robotsUrl)
        if not self.parseRobotFile(topDomain, roboFileBuf):
            return False
        rules = self.__rulesDict__[topDomain]

    # Split on the first occurrence only: a URL that mentions its own
    # domain again (e.g. inside a query string) must not have the tested
    # part cut short at the second occurrence.
    pieces = [piece for piece in url.split(topDomain, 1) if piece]
    if not pieces:
        # Nothing besides the domain itself, so no rule can apply.
        return True
    target = pieces[0]

    firstCh = firstLetterCompile.search(target)
    if not firstCh:
        return True

    # Disallow patterns are bucketed by the value captured here.
    # NOTE(review): groups(1) uses 1 as the default for unmatched groups;
    # group(1) may have been intended -- confirm against firstLetterCompile.
    fCh = firstCh.groups(1)[0]
    compList = rules['disallow'].get(fCh)
    if not compList:
        return True

    # Any matching disallow pattern forbids the visit.
    return not any(comp.search(target) for comp in compList)
def addRobotRule(self, url): topDomain = utils.getTopDomain(url) if topDomain: robotPath = utils.robotsTxt(topDomain)