import os
import pickle
import StringIO

import pycurl
from robotexclusionrulesparser import RobotExclusionRulesParser


def get_robots(url):
    robots_directory = 'robots'
    robots_file_path = robots_directory + '/' + url
    if os.path.isfile(robots_file_path):
        # Reuse a previously pickled parser for this host.
        robots_file = open(robots_file_path, "rb")
        robots_parser = pickle.load(robots_file)
        robots_file.close()
    else:
        # Fetch robots.txt over HTTP and cache the parsed result on disk.
        buffer = StringIO.StringIO()
        c = pycurl.Curl()
        c.setopt(c.URL, 'http://' + url + '/robots.txt')
        c.setopt(c.REFERER, '')
        c.setopt(c.USERAGENT, 'Curl')
        c.setopt(c.FOLLOWLOCATION, 1)
        c.setopt(c.WRITEFUNCTION, buffer.write)
        try:
            c.perform()
        except pycurl.error, e:
            print "Error code: ", e[0]
            print "Error message: ", e[1]
            c.close()
            # On a fetch error, fall back to an empty rule set (allow all).
            robots_parser = RobotExclusionRulesParser()
            robots_parser.parse('')
            return robots_parser
        c.close()
        robots_parser = RobotExclusionRulesParser()
        robots_parser.parse(buffer.getvalue())
        robots_file = open(robots_file_path, "wb")
        pickle.dump(robots_parser, robots_file)
        robots_file.close()
    return robots_parser
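A brief usage sketch; the host name, path, and user agent below are only illustrative:

parser = get_robots('example.com')
allowed = parser.is_allowed('Curl', 'http://example.com/some/page.html')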
Example #2
import urlparse

from robotexclusionrulesparser import RobotExclusionRulesParser

# BASE_URL (the root URL of the site being crawled) is defined elsewhere
# in the module this snippet was taken from.


def load_robot_rules():
    """Load rules from robots.txt.

    If the online version is not accessible, the local copy is loaded
    from disk instead.
    """
    rerp = RobotExclusionRulesParser()
    try:
        rerp.fetch(urlparse.urljoin(BASE_URL, '/robots.txt'))
    except Exception:
        with open('robots.txt', 'r') as robots_file:
            rerp.parse(robots_file.read())
    return rerp
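A minimal usage sketch, assuming BASE_URL is the site root already defined in the module; the user agent and path are only illustrative:

rules = load_robot_rules()
can_fetch = rules.is_allowed('MyCrawler', urlparse.urljoin(BASE_URL, '/some/page.html'))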
Example #3
class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
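A standalone usage sketch, assuming the helpers the class relies on (RobotParser, decode_robotstxt, to_unicode) are importable from the surrounding Scrapy project; from_crawler normally receives a crawler, so passing None here is purely for illustration:

robots_body = b"User-agent: *\nDisallow: /private/\n"
parser = RerpRobotParser.from_crawler(None, robots_body)
parser.allowed('https://example.com/public/page', 'mybot')   # True
parser.allowed('https://example.com/private/page', 'mybot')  # False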
Example #4
    def check(self, hostkey, relurl):
        """ Return True if allowed to fetch, False if not, None
        if we do not have robots.txt for this entry. """

        robotstxt, expiration = self.robots.get(hostkey, (None, None))

        if robotstxt is None:
            return None

        # FIXME: mtime?  we need to let robots.txt expire.

        robotparser = RobotExclusionRulesParser()
        robotparser.parse(robotstxt)

        # is_expired is a property, not a method, on RobotExclusionRulesParser.
        if robotparser.is_expired:
            return None

        # is_allowed() requires a user agent; '*' is used here as a generic default.
        return robotparser.is_allowed('*', hostkey + relurl)
Example #5
def benchmark_rerp_parser(website):
    from robotexclusionrulesparser import RobotExclusionRulesParser
    rp = RobotExclusionRulesParser()
    rp.parse(website['robotstxt'])
    for link in website['links']:
        rp.is_allowed('googlebot', link)
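A minimal input sketch for the benchmark above; the robots.txt body and links are made up for illustration:

website = {
    'robotstxt': "User-agent: *\nDisallow: /private/\n",
    'links': [
        'https://example.com/',
        'https://example.com/private/secret.html',
    ],
}
benchmark_rerp_parser(website)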
Example #6
    def _parse_robots(self, response):
        # Parse the downloaded robots.txt body and cache the parser,
        # keyed by the netloc of the response URL.
        rp = RobotExclusionRulesParser()
        rp.parse(response.body)
        self._parsers[urlparse_cached(response).netloc] = rp