Example #1
    def _get_request_delay(
        host: str,
        url: str,
        robots_parser: RobotFileParser,
        delay_mapping: Dict[str, Union[int, float]],
        default_delay: Union[int, float],
    ) -> Union[int, float]:
        # Resolve a per-host politeness delay: prefer robots.txt Crawl-delay,
        # then Request-rate, then fall back to the configured default.
        crawl_delay = robots_parser.crawl_delay('*')
        if crawl_delay is not None:
            delay_mapping[host] = crawl_delay
            logger.debug(
                'returning crawl delay value "%s" from robots.txt for url %s',
                crawl_delay, url)
            return crawl_delay

        request_rate = robots_parser.request_rate('*')
        if request_rate is not None:
            request_delay = request_rate.seconds / request_rate.requests
            delay_mapping[host] = request_delay
            logger.debug(
                'computing value "%s" from request delay info (%s/%s) from robots.txt for url %s',
                request_delay,
                request_rate.requests,
                request_rate.seconds,
                url,
            )
            return request_delay

        delay_mapping[host] = default_delay
        logger.debug('returning default delay value "%s" for url %s',
                     default_delay, url)
        return default_delay
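
The helper above resolves a per-host delay in order of precedence: Crawl-delay, then Request-rate, then a configured default. A minimal, self-contained sketch of the same precedence logic, parsed from an in-memory robots.txt so it needs no network access; the rule values and the helper name resolve_delay are illustrative, not from the original project:

from urllib.robotparser import RobotFileParser

# Hypothetical robots.txt content, parsed in memory.
rules = [
    "User-agent: *",
    "Crawl-delay: 2",
    "Request-rate: 1/5",
]

parser = RobotFileParser()
parser.parse(rules)


def resolve_delay(parser, default=1.0):
    # Prefer an explicit Crawl-delay directive.
    crawl_delay = parser.crawl_delay("*")
    if crawl_delay is not None:
        return float(crawl_delay)
    # Otherwise derive a delay from Request-rate (requests per window of seconds).
    rate = parser.request_rate("*")
    if rate is not None:
        return rate.seconds / rate.requests
    return default


print(resolve_delay(parser))  # 2.0 with the rules above
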
Example #2
File: website.py Project: rebryk/spbau-ir
from collections import deque
from typing import Optional, Tuple
from urllib.robotparser import RobotFileParser


class Website:
    def __init__(self, scheme: str, hostname: str):
        self.scheme = scheme
        self.hostname = hostname
        self.last_time = 0

        self._urls = set()
        self._queue = deque()

        # parse robots.txt
        self._robot_parser = RobotFileParser()
        self._robot_parser.set_url("{}://{}/robots.txt".format(
            scheme, hostname))
        self._robot_parser.read()

    def can_fetch(self, user_agent: str, url: str) -> bool:
        return self._robot_parser.can_fetch(user_agent, url)

    def add_url(self, url: str, depth: int = 0):
        if url not in self._urls:
            self._urls.add(url)
            self._queue.append((url, depth))

    def get_url(self) -> Tuple[str, int]:
        return self._queue.popleft()

    def crawl_delay(self, user_agent: str) -> Optional[int]:
        delay = self._robot_parser.crawl_delay(user_agent)
        return delay * 300 if delay is not None else None

    def is_empty(self) -> bool:
        return len(self._queue) == 0
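
The Website class above pairs a set (for de-duplication) with a deque (for FIFO ordering) and reads robots.txt once at construction. A rough usage sketch, assuming the class above is in scope, that the placeholder host example.com is reachable, and that "my-crawler" is an arbitrary user-agent string:

site = Website("https", "example.com")  # __init__ fetches https://example.com/robots.txt

site.add_url("https://example.com/", depth=0)
site.add_url("https://example.com/", depth=1)  # duplicate URL, silently ignored

while not site.is_empty():
    url, depth = site.get_url()
    if site.can_fetch("my-crawler", url):
        delay = site.crawl_delay("my-crawler")  # None if robots.txt sets no Crawl-delay
        print(url, depth, delay)
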
Example #3
    def robot_rules(self, user_agent: str):
        robot_parser = RobotFileParser(url=self.url_robots)
        robot_parser.read()

        return {
            'can_fetch': robot_parser.can_fetch(user_agent, self.url),
            'crawl_delay': robot_parser.crawl_delay(user_agent),
            'request_rate': robot_parser.request_rate(user_agent),
        }
Example #4
import time
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

import requests


def robots_get(url, *args, **kwargs):
    """GET a URL with requests, honoring robots.txt permissions and Crawl-delay."""
    u = urlparse(url)
    robot_url = '{scm}://{loc}/robots.txt'.format(scm=u.scheme, loc=u.netloc)
    robot = RobotFileParser(robot_url)
    robot.read()
    ua = kwargs.get('headers', dict()).get('User-Agent', '*')
    if not robot.can_fetch(ua, url):
        return 'Not Allowed By robots.txt'
    delay = robot.crawl_delay(ua)
    if delay:
        time.sleep(delay)
    return requests.get(url, *args, **kwargs)
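
A hedged usage sketch for the wrapper above, assuming robots_get is in scope and the placeholder URL is reachable; note that it returns a plain string instead of a Response object when robots.txt disallows the fetch:

response = robots_get(
    "https://example.com/page.html",           # placeholder URL
    headers={"User-Agent": "my-crawler/1.0"},  # read by robots_get for the can_fetch check
    timeout=10,
)

if isinstance(response, str):
    print(response)  # "Not Allowed By robots.txt"
else:
    print(response.status_code)
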
Example #5
File: arana.py Project: eksop/arana
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser


def parse_robotstxt(url):
    """
    Parse robots.txt
    """

    parsed = urlsplit(url)

    if parsed.scheme not in ['http', 'https']:
        return False

    if parsed.netloc == '':
        return False

    robot = RobotFileParser()
    robot.set_url(parsed.scheme + "://" + parsed.netloc + "/robots.txt")
    robot.read()

    return dict(
        allowed=robot.can_fetch('*', url),
        rate=robot.request_rate('*'),
        delay=robot.crawl_delay('*'),
    )
Example #6
File: robots.py Project: ATB-K/Jeeves
import random
from urllib.robotparser import RobotFileParser


def check_robots(robots_path):
    """
    Check robots.txt and determine the crawl interval.

    Parameters
    ----------
    robots_path : str
        URL of robots.txt

    Returns
    -------
    int
        Interval between article fetches, in seconds
    """

    # If no path was given, return a reasonable random interval.
    if robots_path is None or robots_path == "":
        return random.randint(5, 10)

    rp = RobotFileParser()

    rp.set_url(robots_path)
    rp.read()

    delay = None

    try:
        delay = rp.crawl_delay('*')

        if delay is None:
            raise AttributeError

    except AttributeError:
        delay = random.randint(5, 10)

    return delay
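
Since crawl_delay simply returns None when robots.txt has no Crawl-delay directive, the try/except above can be avoided entirely. A minimal sketch of the same behavior (the function name fetch_interval is mine; the 5-10 second fallback mirrors the example):

import random
from urllib.robotparser import RobotFileParser


def fetch_interval(robots_url):
    # No robots.txt URL given: fall back to a random 5-10 second interval.
    if not robots_url:
        return random.randint(5, 10)

    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()  # network call: fetch and parse robots.txt

    delay = rp.crawl_delay('*')
    return delay if delay is not None else random.randint(5, 10)
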
Example #7
    def __crawl_url(self, url) -> tuple[str, str, int]:
        if url in self.blacklist:
            return (None, None, None)

        try:
            can_fetch = True
            parts = urlparse(url)
            rp = self.robot_files.get(parts.hostname, None)

            if rp is None:
                rp = RobotFileParser()
                # Parse the host's robots.txt, not the page URL itself.
                rp.set_url("{}://{}/robots.txt".format(parts.scheme, parts.netloc))
                rp.read()
                self.robot_files[parts.hostname] = rp

            can_fetch = rp.can_fetch("*", url)

            if not can_fetch:
                if self.verbose and self.debug:
                    self.thread_print(
                        "Not allowed to crawl, adding url to blacklist: {}".
                        format(url))
                self.blacklist.append(url)
                return (None, None, None)

            if not self.__is_html(url):
                if self.verbose and self.debug:
                    self.thread_print(
                        "Not html document, adding url to blacklist: {}".
                        format(url))
                self.blacklist.append(url)
                return (None, None, None)

            content = requests.get(url, timeout=1)
            if self.verbose and self.debug:
                self.thread_print("Statuscode: {}".format(content.status_code))
            if content.status_code != 200:
                if self.verbose and self.debug:
                    self.thread_print("Status not 200, returning")
                return (None, None, None)

        except Exception:
            if self.verbose and self.debug:
                self.thread_print(
                    "An error happened, adding url to blacklist: {}".format(
                        url))
            self.blacklist.append(url)
            return (None, None, None)

        parsed_content = BeautifulSoup(content.text, 'html.parser')

        links = parsed_content.find_all('a')
        links = filter(lambda x: x.has_attr('href'), links)
        self.__add_links_to_frontier(links)

        title_obj = parsed_content.find('title')

        title = title_obj.string if title_obj is not None else url

        crawl_delay = rp.crawl_delay("*")
        if crawl_delay is None:
            crawl_delay = 0

        return (title, url, crawl_delay)
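
Example #7 caches one RobotFileParser per hostname and points it at the host's /robots.txt. The same per-host cache as a standalone sketch (the function and variable names here are mine, not from the original project):

from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

_robot_cache = {}


def get_robot_parser(url):
    # One parser per hostname, fed the host's robots.txt.
    parts = urlparse(url)
    rp = _robot_cache.get(parts.hostname)
    if rp is None:
        rp = RobotFileParser()
        rp.set_url("{}://{}/robots.txt".format(parts.scheme, parts.netloc))
        rp.read()  # network call
        _robot_cache[parts.hostname] = rp
    return rp


# Usage: check permission and delay before fetching a page.
# rp = get_robot_parser("https://example.com/some/page")
# allowed = rp.can_fetch("*", "https://example.com/some/page")
# delay = rp.crawl_delay("*") or 0
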