예제 #1
0
            def response_done(trailers: RawHeaderListType) -> None:
                """
                Handle completion of the robots.txt fetch: load the result
                into a checker, cache it, and run any robots checks that
                were queued for this origin while the fetch was in flight.
                """
                # A non-2xx status is treated as an empty robots.txt.
                if not exchange.status.startswith(b"2"):
                    robots_txt = b""
                else:
                    robots_txt = exchange.res_body

                self._load_checker(origin, robots_txt)
                # Persist the fetched robots.txt so later runs can skip the
                # network (written with a freshness lifetime).
                if self.robot_cache_dir:
                    robot_fd = CacheFile(
                        path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, self.freshness_lifetime)

                # Drain the URLs that accumulated while the fetch was
                # outstanding; KeyError means the set is exhausted.
                while True:
                    try:
                        check_url = self.robot_lookups[origin].pop()
                    except KeyError:
                        break
                    self._robot_check(check_url, self.robot_checkers[origin])
                try:
                    del self.robot_lookups[origin]
                except KeyError:
                    pass
예제 #2
0
    def check_robots(self, url: str, sync: bool = False) -> Union[bool, None]:
        """
        Fetch the robots.txt for URL and check whether URL is allowed.

        When sync is True, the result is returned. Sync does not go to the
        network; if there is not a local (memory or cache) robots.txt, it
        will return True.

        When sync is False, None is returned and a "robot-%s" % url event
        will be emitted with one argument: True if the URL is allowed,
        False if not.
        """

        origin = url_to_origin(url)
        if origin is None:
            # No usable origin (e.g. unparseable URL): fail open.
            if sync:
                return True
            self.emit("robot-%s" % url, True)
            return None
        origin_hash = hashlib.sha1(origin.encode('ascii',
                                                 'replace')).hexdigest()

        # Fast path: a parsed checker for this origin is already in memory.
        if origin in self.robot_checkers:
            return self._robot_check(url, self.robot_checkers[origin], sync)

        # Next, try the on-disk cache before going to the network.
        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            cached_robots_txt = robot_fd.read()
            # "is not None": an empty cached robots.txt is still a valid hit.
            if cached_robots_txt is not None:
                self._load_checker(origin, cached_robots_txt)
                return self._robot_check(url, self.robot_checkers[origin],
                                         sync)

        if sync:
            # Sync mode never fetches; assume allowed.
            return True

        if origin in self.robot_lookups:
            # A fetch for this origin is already in flight; queue this URL
            # to be checked when it completes.
            self.robot_lookups[origin].add(url)
        else:
            self.robot_lookups[origin] = {url}
            exchange = self.client.exchange()

            @thor.on(exchange)
            def response_start(status: bytes, phrase: bytes,
                               headers: RawHeaderListType) -> None:
                # Remember the status for response_done.
                exchange.status = status

            exchange.res_body = b""

            @thor.on(exchange)
            def response_body(chunk: bytes) -> None:
                exchange.res_body += chunk

            @thor.on(exchange)
            def response_done(trailers: RawHeaderListType) -> None:
                # A non-2xx status is treated as an empty robots.txt.
                if not exchange.status.startswith(b"2"):
                    robots_txt = b""
                else:
                    robots_txt = exchange.res_body

                self._load_checker(origin, robots_txt)
                if self.robot_cache_dir:
                    robot_fd = CacheFile(
                        path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, self.freshness_lifetime)

                # Drain the URLs queued while the fetch was outstanding.
                while True:
                    try:
                        check_url = self.robot_lookups[origin].pop()
                    except KeyError:
                        break
                    self._robot_check(check_url, self.robot_checkers[origin])
                try:
                    del self.robot_lookups[origin]
                except KeyError:
                    pass

            @thor.on(exchange)
            def error(error: thor.http.error.HttpError) -> None:
                # Map transport errors to a synthetic 500 and finish up.
                exchange.status = b"500"
                response_done([])

            p_url = urlsplit(url)
            robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
            exchange.request_start(
                b"GET", robots_url.encode('ascii'),
                [(b'User-Agent', UA_STRING.encode('ascii'))])
            exchange.request_done([])
        return None
예제 #3
0
    def check_robots(self, url: str) -> None:
        """
        Fetch the robots.txt for URL.

        The 'robot' event will be emitted, with a (url, robot_ok) payload.
        """

        origin = url_to_origin(url)
        if origin is None:
            # No usable origin: report the URL as allowed.
            self.emit("robot", (url, True))
            return None
        origin_hash = hashlib.sha1(origin.encode("ascii", "replace")).hexdigest()

        # Fast path: a checker for this origin is already loaded in memory.
        if origin in self.robot_checkers:
            return self._robot_check(url, self.robot_checkers[origin])

        # Next, consult the on-disk cache before hitting the network.
        cache_dir = self.config.get("robot_cache_dir", "")
        if cache_dir:
            cached = CacheFile(path.join(cache_dir, origin_hash)).read()
            if cached is not None:
                self._load_checker(origin, cached)
                return self._robot_check(url, self.robot_checkers[origin])

        if origin in self.robot_lookups:
            # A fetch for this origin is already underway; just queue the URL.
            self.robot_lookups[origin].add(url)
            return None

        self.robot_lookups[origin] = {url}
        exchange = self.client.exchange()

        @thor.on(exchange)
        def response_start(
            status: bytes, phrase: bytes, headers: RawHeaderListType
        ) -> None:
            # Stash the status so response_done can inspect it.
            exchange.status = status

        exchange.res_body = b""

        @thor.on(exchange)
        def response_body(chunk: bytes) -> None:
            exchange.res_body += chunk

        @thor.on(exchange)
        def response_done(trailers: RawHeaderListType) -> None:
            # A non-2xx status counts as an empty robots.txt.
            robots_txt = (
                exchange.res_body if exchange.status.startswith(b"2") else b""
            )

            self._load_checker(origin, robots_txt)
            if self.config.get("robot_cache_dir", ""):
                CacheFile(
                    path.join(self.config["robot_cache_dir"], origin_hash)
                ).write(robots_txt, self.freshness_lifetime)

            # Run every check that queued up while the fetch was in flight,
            # then forget the pending-lookup entry for this origin.
            pending = self.robot_lookups.get(origin, set())
            while pending:
                self._robot_check(pending.pop(), self.robot_checkers[origin])
            self.robot_lookups.pop(origin, None)

        @thor.on(exchange)
        def error(error: thor.http.error.HttpError) -> None:
            # Treat transport errors as a synthetic 500 and finish up.
            exchange.status = b"500"
            response_done([])

        parts = urlsplit(url)
        robots_url = "%s://%s/robots.txt" % (parts.scheme, parts.netloc)
        exchange.request_start(
            b"GET",
            robots_url.encode("ascii"),
            [(b"User-Agent", UA_STRING.encode("ascii"))],
        )
        exchange.request_done([])
        return None