from urllib.robotparser import RobotFileParser

def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        # urlopen and MaxContentBytes are project-level helpers: the call
        # fetches the URL with a size cap and never raises on HTTP errors
        # (allow_errors=range(600)), only on transport failures.
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      allow_errors=range(600))
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
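# A self-contained sketch of the standard-library RobotFileParser calls the
# function above relies on; the user agent and rules here are invented for
# illustration.
from urllib.robotparser import RobotFileParser

def _demo_parse_and_check():
    rp = RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Disallow: /private/",
    ])
    # can_fetch() consults the allow_all/disallow_all flags first, then the
    # parsed rules, which is why the snippets here can set the flags directly.
    print(rp.can_fetch("ExampleBot", "https://example.com/index.html"))  # True
    print(rp.can_fetch("ExampleBot", "https://example.com/private/x"))   # False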
from urllib.robotparser import RobotFileParser

def get_robots(self):
    # robots_content is expected to be an iterable of lines (e.g. the result
    # of text.splitlines()); RobotFileParser.parse() iterates line by line,
    # so passing a raw string here would be consumed character by character.
    rp = RobotFileParser()
    if self.robots_content:
        rp.parse(self.robots_content)
    else:
        # no stored robots.txt: treat everything as allowed
        rp.allow_all = True
    return rp
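# Hedged usage sketch: the enclosing class is not shown, so this binds the
# method above to a hypothetical stand-in that stores robots.txt content as
# a list of lines.
class _RobotsHolder:
    get_robots = get_robots  # reuse the function above as a method

    def __init__(self, robots_content):
        self.robots_content = robots_content

holder = _RobotsHolder("User-agent: *\nDisallow: /admin/".splitlines())
print(holder.get_robots().can_fetch("*", "https://example.com/admin/x"))  # False
print(holder.get_robots().can_fetch("*", "https://example.com/"))         # True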
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

async def parse_robots(session, base):
    """Fetch and parse the robots.txt file for a given base URL.

    Returns an instance of RobotFileParser.
    """
    # robots.txt always lives at the site root, so join with an absolute
    # path; urljoin(base, "robots.txt") would resolve relative to the base
    # URL's own path instead.
    url = urljoin(base, "/robots.txt")
    async with session.get(url) as response:
        status = response.status
        text = await response.text()
    robot_parser = RobotFileParser()
    if status == 200:
        robot_parser.parse(text.splitlines())
    else:
        # missing or errored robots.txt: allow everything
        robot_parser.allow_all = True
    return robot_parser
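# Hedged usage sketch assuming aiohttp: the session.get(...) context manager,
# response.status, and await response.text() used above all match
# aiohttp.ClientSession's API. The target host is illustrative only.
import asyncio
import aiohttp

async def _demo_parse_robots():
    async with aiohttp.ClientSession() as session:
        rp = await parse_robots(session, "https://www.python.org/")
        print(rp.can_fetch("ExampleBot", "https://www.python.org/"))

# asyncio.run(_demo_parse_robots())  # uncomment to perform a real request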
from urllib.robotparser import RobotFileParser

def get_robots_parser(self, url: str):
    # self.store, download_page, and the ALLOW_ALL/DISALLOW_ALL sentinels are
    # defined elsewhere in this project; robots.txt bodies are handled as bytes.
    rp = RobotFileParser()
    if self.store.exists(url, 'txt'):
        # reuse a previously cached robots.txt body
        body = self.store.load_url(url, 'txt')
    else:
        page, status_code = download_page(url, 'Robot')
        body = page.body
        # Mirror the stdlib convention: 401/403 mean the whole site is
        # off-limits, any other 4xx (including 404) means no restrictions.
        if status_code in [401, 403]:
            body = self.DISALLOW_ALL
        elif 400 <= status_code < 500:
            body = self.ALLOW_ALL
        self.store.save_url(url, body, 'txt')
    if body.strip() == self.ALLOW_ALL:
        rp.allow_all = True
    elif body.strip() == self.DISALLOW_ALL:
        rp.disallow_all = True
    else:
        rp.parse(body.decode('utf-8').splitlines())
    return rp
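# A self-contained sketch of the two sentinel branches above: flipping the
# parser's internal allow_all/disallow_all flags short-circuits can_fetch()
# before any parsed rules are consulted.
from urllib.robotparser import RobotFileParser

rp_open = RobotFileParser()
rp_open.allow_all = True           # what the ALLOW_ALL cache marker restores
print(rp_open.can_fetch("*", "https://example.com/anything"))    # True

rp_closed = RobotFileParser()
rp_closed.disallow_all = True      # what the DISALLOW_ALL marker restores
print(rp_closed.can_fetch("*", "https://example.com/anything"))  # False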