Example #1
    def _get_request_delay(
        host: str,
        url: str,
        robots_parser: RobotFileParser,
        delay_mapping: Dict[str, Union[int, float]],
        default_delay: Union[int, float],
    ) -> Union[int, float]:
        """Resolve the per-request delay for host from robots.txt,
        cache it in delay_mapping, and fall back to default_delay."""

        crawl_delay = robots_parser.crawl_delay('*')
        if crawl_delay is not None:
            delay_mapping[host] = crawl_delay
            logger.debug(
                'returning crawl delay value "%s" from robots.txt for url %s',
                crawl_delay, url)
            return crawl_delay

        request_rate = robots_parser.request_rate('*')
        if request_rate is not None:
            request_delay = request_rate.seconds / request_rate.requests
            delay_mapping[host] = request_delay
            logger.debug(
                'computing value "%s" from request delay info (%s/%s) from robots.txt for url %s',
                request_delay,
                request_rate.requests,
                request_rate.seconds,
                url,
            )
            return request_delay

        delay_mapping[host] = default_delay
        logger.debug('returning default delay value "%s" for url %s',
                     default_delay, url)
        return default_delay
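
A minimal call sketch for the helper above (the host, URL and default delay are hypothetical; it assumes the function and a configured logger are available at module level):

from urllib.robotparser import RobotFileParser

parser = RobotFileParser()
parser.set_url('https://example.com/robots.txt')
parser.read()

delays = {}  # shared host -> delay cache, reused across calls
delay = _get_request_delay(
    host='example.com',
    url='https://example.com/some/page',
    robots_parser=parser,
    delay_mapping=delays,
    default_delay=1.0,
)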
Example #2
    def robot_rules(self, user_agent: str):
        """Read robots.txt and report the can_fetch / crawl_delay /
        request_rate values that apply to user_agent for self.url."""
        robot_parser = RobotFileParser(url=self.url_robots)
        robot_parser.read()

        return {
            'can_fetch': robot_parser.can_fetch(user_agent, self.url),
            'crawl_delay': robot_parser.crawl_delay(user_agent),
            'request_rate': robot_parser.request_rate(user_agent),
        }
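
The method above relies on self.url_robots and self.url, which the surrounding class (not shown here) provides. A self-contained sketch with a hypothetical Site class, URLs and user agent:

from urllib.robotparser import RobotFileParser


class Site:
    def __init__(self, url: str, url_robots: str):
        self.url = url
        self.url_robots = url_robots

    def robot_rules(self, user_agent: str):
        robot_parser = RobotFileParser(url=self.url_robots)
        robot_parser.read()
        return {
            'can_fetch': robot_parser.can_fetch(user_agent, self.url),
            'crawl_delay': robot_parser.crawl_delay(user_agent),
            'request_rate': robot_parser.request_rate(user_agent),
        }


site = Site('https://example.com/', 'https://example.com/robots.txt')
rules = site.robot_rules('MyCrawler')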
Example #3
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser


def parse_robotstxt(url):
    """
    Fetch and parse robots.txt for the host of url; return False for
    non-HTTP(S) or host-less URLs, otherwise a dict of rules for '*'.
    """

    parsed = urlsplit(url)

    if parsed.scheme not in ['http', 'https']:
        return False

    if parsed.netloc == '':
        return False

    robot = RobotFileParser()
    robot.set_url(parsed.scheme + "://" + parsed.netloc + "/robots.txt")
    robot.read()

    return dict(
        allowed=robot.can_fetch('*', url),
        rate=robot.request_rate('*'),
        delay=robot.crawl_delay('*'),
    )
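
A short usage sketch (the URL is hypothetical): the function returns False for unsupported URLs and a dict of crawl rules otherwise:

info = parse_robotstxt('https://example.com/some/page')
if info:
    print(info['allowed'], info['delay'], info['rate'])
else:
    print('URL scheme or host not suitable for a robots.txt lookup')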