import os
import pickle
import StringIO

import pycurl
from robotexclusionrulesparser import RobotExclusionRulesParser


def get_robots(url):
    """Return a robots.txt parser for `url`, using a pickled on-disk cache."""
    robots_directory = 'robots'
    robots_file_path = robots_directory + '/' + url
    if os.path.isfile(robots_file_path):
        # A cached parser already exists for this host; load it from disk.
        robots_file = open(robots_file_path, "rb")
        robots_parser = pickle.load(robots_file)
        robots_file.close()
    else:
        # Fetch robots.txt over HTTP and build a fresh parser.
        buffer = StringIO.StringIO()
        c = pycurl.Curl()
        c.setopt(c.URL, 'http://' + url + '/robots.txt')
        c.setopt(c.REFERER, '')
        c.setopt(c.USERAGENT, 'Curl')
        c.setopt(c.FOLLOWLOCATION, 1)
        c.setopt(c.WRITEFUNCTION, buffer.write)
        try:
            c.perform()
        except pycurl.error as e:
            print "Error code: ", e[0]
            print "Error message: ", e[1]
            c.close()
            # On a fetch error, fall back to an empty rule set (everything allowed).
            robots_parser = RobotExclusionRulesParser()
            robots_parser.parse('')
            return robots_parser
        c.close()
        robots_parser = RobotExclusionRulesParser()
        robots_parser.parse(buffer.getvalue())
        # Cache the parser on disk for subsequent calls.
        robots_file = open(robots_file_path, "wb")
        pickle.dump(robots_parser, robots_file)
        robots_file.close()
    return robots_parser
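# Usage sketch for get_robots() above (illustrative only): it assumes the local
# 'robots' cache directory already exists and that 'example.com' is reachable;
# the host name and user agent string are placeholders, not part of the original.
robots = get_robots('example.com')
allowed = robots.is_allowed('Curl', 'http://example.com/some/page.html')
# A second call for the same host is answered from the pickled parser on disk.
robots_again = get_robots('example.com')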
import urlparse

from robotexclusionrulesparser import RobotExclusionRulesParser


def load_robot_rules():
    """
    Load rules from robots.txt.

    If the online version is not accessible, the local copy is loaded
    from disk instead.
    """
    rerp = RobotExclusionRulesParser()
    try:
        # BASE_URL is expected to be defined elsewhere in the module.
        rerp.fetch(urlparse.urljoin(BASE_URL, '/robots.txt'))
    except:
        rerp.parse(open('robots.txt', 'r').read())
    return rerp
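# Usage sketch for load_robot_rules() above: BASE_URL and a local robots.txt
# fallback file are assumed to exist; the crawler's user agent string below is
# a placeholder.
rules = load_robot_rules()
if rules.is_allowed('my-crawler', urlparse.urljoin(BASE_URL, '/private/')):
    pass  # safe to fetch the page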
class RerpRobotParser(RobotParser):
    """Robots.txt parser interface backed by robotexclusionrulesparser."""

    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser

        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
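# Usage sketch for RerpRobotParser above: the robots.txt body, URLs and agent
# name are illustrative. Passing crawler=None works because from_crawler()
# falls back to spider=None.
robots_body = b"User-agent: *\nDisallow: /private/\n"
parser = RerpRobotParser.from_crawler(None, robots_body)
assert parser.allowed("https://example.com/public/", "mybot")
assert not parser.allowed("https://example.com/private/page", "mybot")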
def check(self, hostkey, relurl):
    """
    Return True if allowed to fetch, False if not, None if we do not
    have robots.txt for this entry.
    """
    robotstxt, expiration = self.robots.get(hostkey, (None, None))
    if robotstxt is None:
        return None
    # FIXME: mtime? we need to let robots.txt expire.
    robotparser = RobotExclusionRulesParser()
    if robotparser.is_expired:
        return None
    robotparser.parse(robotstxt)
    # No per-agent identity is tracked here, so check against the wildcard agent.
    return robotparser.is_allowed('*', hostkey + relurl)
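# Sketch of the state check() above expects (hedged): self.robots maps a host
# key to a (robots.txt body, expiration) pair. The _RobotsState class, host
# key and expiration value are hypothetical stand-ins for the real crawler
# state; check() is called as a plain function with the state as `self`.
import time

class _RobotsState(object):
    def __init__(self):
        self.robots = {
            'http://example.com': ("User-agent: *\nDisallow: /private/\n",
                                   time.time() + 3600),
        }

state = _RobotsState()
check(state, 'http://example.com', '/private/page')   # False
check(state, 'http://example.com', '/public/page')     # True
check(state, 'http://unknown.org', '/')                # None: no robots.txt cached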
def benchmark_rerp_parser(website):
    from robotexclusionrulesparser import RobotExclusionRulesParser

    rp = RobotExclusionRulesParser()
    rp.parse(website['robotstxt'])
    for link in website['links']:
        rp.is_allowed('googlebot', link)
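# Example input for benchmark_rerp_parser() above: a website record with a raw
# robots.txt body and a list of links to test; the values are made up for
# illustration.
sample_website = {
    'robotstxt': "User-agent: *\nDisallow: /search\nAllow: /search/about\n",
    'links': [
        'http://example.com/',
        'http://example.com/search?q=robots',
        'http://example.com/search/about',
    ],
}
benchmark_rerp_parser(sample_website)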
def _parse_robots(self, response):
    rp = RobotExclusionRulesParser()
    rp.parse(response.body)
    # Cache the parser keyed by the response's network location.
    self._parsers[urlparse_cached(response).netloc] = rp
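# Sketch of how a parser cached by _parse_robots() above might later be
# consulted (hedged): the helper function, request URL and user agent below
# are hypothetical; only the per-netloc lookup and is_allowed() call mirror
# the code above.
from urlparse import urlparse

def _is_request_allowed(parsers, request_url, user_agent):
    netloc = urlparse(request_url).netloc
    rp = parsers.get(netloc)
    # No parser cached yet for this host: treat the request as allowed.
    if rp is None:
        return True
    return rp.is_allowed(user_agent, request_url)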