import logging
from typing import Dict, Union
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser

logger = logging.getLogger(__name__)


def _get_request_delay(
    host: str,
    url: str,
    robots_parser: RobotFileParser,
    delay_mapping: Dict[str, Union[int, float]],
    default_delay: Union[int, float],
) -> Union[int, float]:
    """Resolve the per-request delay for ``host`` and cache it in ``delay_mapping``.

    Prefers the Crawl-delay directive from robots.txt, then a delay derived
    from the Request-rate directive, and finally ``default_delay``.
    """
    crawl_delay = robots_parser.crawl_delay('*')
    if crawl_delay is not None:
        delay_mapping[host] = crawl_delay
        logger.debug(
            'returning crawl delay value "%s" from robots.txt for url %s',
            crawl_delay, url)
        return crawl_delay

    request_rate = robots_parser.request_rate('*')
    if request_rate is not None:
        # Request-rate allows N requests per M seconds; spread them evenly.
        request_delay = request_rate.seconds / request_rate.requests
        delay_mapping[host] = request_delay
        logger.debug(
            'computing value "%s" from request delay info (%s/%s) from robots.txt for url %s',
            request_delay, request_rate.requests, request_rate.seconds, url,
        )
        return request_delay

    delay_mapping[host] = default_delay
    logger.debug('returning default delay value "%s" for url %s', default_delay, url)
    return default_delay
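# A minimal usage sketch for _get_request_delay(); the example host, URL, and
# default delay of 1.0 seconds are assumptions for illustration, and
# parser.read() issues a real HTTP request when run.
parser = RobotFileParser()
parser.set_url('https://example.com/robots.txt')
parser.read()

delay_mapping = {}
delay = _get_request_delay(
    host='example.com',
    url='https://example.com/some/page',
    robots_parser=parser,
    delay_mapping=delay_mapping,
    default_delay=1.0,
)
# Later requests to the same host can reuse delay_mapping['example.com']
# instead of calling the helper again.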
def robot_rules(self, user_agent: str):
    """Return the robots.txt rules that apply to ``user_agent`` for ``self.url``."""
    robot_parser = RobotFileParser(url=self.url_robots)
    robot_parser.read()
    return {
        'can_fetch': robot_parser.can_fetch(user_agent, self.url),
        'crawl_delay': robot_parser.crawl_delay(user_agent),
        'request_rate': robot_parser.request_rate(user_agent),
    }
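# robot_rules() above expects an object exposing `url` and `url_robots`.
# A hypothetical host class is sketched here for illustration; the class name
# and the way url_robots is derived from the page URL are assumptions, not
# part of the original snippet.
class PageRobots:
    def __init__(self, url: str):
        self.url = url
        parts = urlsplit(url)
        self.url_robots = f'{parts.scheme}://{parts.netloc}/robots.txt'

    # robot_rules() from above would be defined here as a method.


robots = PageRobots('https://example.com/some/page')
# robots.robot_rules('MyCrawler') would then return a dict like:
# {'can_fetch': True, 'crawl_delay': None, 'request_rate': None}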
def parse_robotstxt(url):
    """Parse robots.txt for the host of ``url`` and report its rules."""
    parsed = urlsplit(url)
    if parsed.scheme not in ['http', 'https']:
        return False
    if parsed.netloc == '':
        return False
    robot = RobotFileParser()
    robot.set_url(parsed.scheme + "://" + parsed.netloc + "/robots.txt")
    robot.read()
    return dict(
        allowed=robot.can_fetch('*', url),
        rate=robot.request_rate('*'),
        delay=robot.crawl_delay('*'),
    )
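# Hypothetical usage of parse_robotstxt(); the example URL is an assumption
# and robot.read() performs a real HTTP request when this runs. The function
# returns False for non-HTTP(S) or host-less URLs.
rules = parse_robotstxt('https://example.com/some/page')
if rules is not False and rules['allowed']:
    print('crawl delay:', rules['delay'])
    print('request rate:', rules['rate'])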