def check_robots(self, url: str, sync: bool = False) -> Union[bool, None]:
    """
    Fetch the robots.txt for URL.

    When sync is true, the result is returned. Sync does not go to network;
    if there is not a local (memory or cache) robots.txt, it will return True.

    When it's false, the "robot" event will be emitted, with two arguments:
      - the URL
      - True if it's allowed, False if not
    """
    origin = url_to_origin(url)
    if origin is None:
        if sync:
            return True
        else:
            self.emit("robot-%s" % url, True)
            return None
    origin_hash = hashlib.sha1(origin.encode('ascii', 'replace')).hexdigest()

    # A checker for this origin is already in memory.
    if origin in self.robot_checkers:
        return self._robot_check(url, self.robot_checkers[origin], sync)

    # Try the on-disk cache before going to the network.
    if self.robot_cache_dir:
        robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
        cached_robots_txt = robot_fd.read()
        if cached_robots_txt is not None:
            self._load_checker(origin, cached_robots_txt)
            return self._robot_check(url, self.robot_checkers[origin], sync)

    if sync:
        # Sync lookups never go to the network; default to allowed.
        return True

    if origin in self.robot_lookups:
        # A fetch for this origin is already in flight; queue the URL.
        self.robot_lookups[origin].add(url)
    else:
        self.robot_lookups[origin] = set([url])
        exchange = self.client.exchange()

        @thor.on(exchange)
        def response_start(status: bytes, phrase: bytes,
                           headers: RawHeaderListType) -> None:
            exchange.status = status
            exchange.res_body = b""

        @thor.on(exchange)
        def response_body(chunk: bytes) -> None:
            exchange.res_body += chunk

        @thor.on(exchange)
        def response_done(trailers: RawHeaderListType) -> None:
            # Anything other than a 2xx is treated as an empty robots.txt.
            if not exchange.status.startswith(b"2"):
                robots_txt = b""
            else:
                robots_txt = exchange.res_body
            self._load_checker(origin, robots_txt)
            if self.robot_cache_dir:
                robot_fd = CacheFile(
                    path.join(self.robot_cache_dir, origin_hash))
                robot_fd.write(robots_txt, self.freshness_lifetime)
            # Answer every URL that was waiting on this origin.
            while True:
                try:
                    check_url = self.robot_lookups[origin].pop()
                except KeyError:
                    break
                self._robot_check(check_url, self.robot_checkers[origin])
            try:
                del self.robot_lookups[origin]
            except KeyError:
                pass

        @thor.on(exchange)
        def error(error: thor.http.error.HttpError) -> None:
            # On a network error, fake a 500 so response_done() takes the
            # failure path.
            exchange.status = b"500"
            response_done([])

        p_url = urlsplit(url)
        robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
        exchange.request_start(
            b"GET", robots_url.encode('ascii'),
            [(b'User-Agent', UA_STRING.encode('ascii'))])
        exchange.request_done([])
    return None
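
# --- Usage sketch: not from the source. It assumes the class that defines
# check_robots() is a thor EventEmitter exposing .on(event, listener), which
# its calls to self.emit() suggest, and that `fetcher` is an instance of it.
def fetch_if_allowed(fetcher, url: str) -> None:
    # Cache/memory-only check: returns True when no local robots.txt is known.
    if not fetcher.check_robots(url, sync=True):
        print("disallowed by a locally known robots.txt:", url)
        return

    def robot_result(allowed: bool) -> None:
        print("robots.txt verdict for %s: %s" % (url, allowed))

    # Event-driven path: may fetch robots.txt, then emits "robot-<url>".
    fetcher.on("robot-%s" % url, robot_result)
    fetcher.check_robots(url)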

def check_robots(self, url: str) -> None:
    """
    Fetch the robots.txt for URL.

    The 'robot' event will be emitted, with a (url, robot_ok) payload.
    """
    origin = url_to_origin(url)
    if origin is None:
        self.emit("robot", (url, True))
        return None
    origin_hash = hashlib.sha1(origin.encode("ascii", "replace")).hexdigest()

    # A checker for this origin is already in memory.
    if origin in self.robot_checkers:
        return self._robot_check(url, self.robot_checkers[origin])

    # Try the on-disk cache before going to the network.
    if self.config.get("robot_cache_dir", ""):
        robot_fd = CacheFile(path.join(self.config["robot_cache_dir"], origin_hash))
        cached_robots_txt = robot_fd.read()
        if cached_robots_txt is not None:
            self._load_checker(origin, cached_robots_txt)
            return self._robot_check(url, self.robot_checkers[origin])

    if origin in self.robot_lookups:
        # A fetch for this origin is already in flight; queue the URL.
        self.robot_lookups[origin].add(url)
    else:
        self.robot_lookups[origin] = set([url])
        exchange = self.client.exchange()

        @thor.on(exchange)
        def response_start(
            status: bytes, phrase: bytes, headers: RawHeaderListType
        ) -> None:
            exchange.status = status
            exchange.res_body = b""

        @thor.on(exchange)
        def response_body(chunk: bytes) -> None:
            exchange.res_body += chunk

        @thor.on(exchange)
        def response_done(trailers: RawHeaderListType) -> None:
            # Anything other than a 2xx is treated as an empty robots.txt.
            if not exchange.status.startswith(b"2"):
                robots_txt = b""
            else:
                robots_txt = exchange.res_body
            self._load_checker(origin, robots_txt)
            if self.config.get("robot_cache_dir", ""):
                robot_fd = CacheFile(
                    path.join(self.config["robot_cache_dir"], origin_hash)
                )
                robot_fd.write(robots_txt, self.freshness_lifetime)
            # Answer every URL that was waiting on this origin.
            while True:
                try:
                    check_url = self.robot_lookups[origin].pop()
                except KeyError:
                    break
                self._robot_check(check_url, self.robot_checkers[origin])
            try:
                del self.robot_lookups[origin]
            except KeyError:
                pass

        @thor.on(exchange)
        def error(error: thor.http.error.HttpError) -> None:
            # On a network error, fake a 500 so response_done() takes the
            # failure path.
            exchange.status = b"500"
            response_done([])

        p_url = urlsplit(url)
        robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
        exchange.request_start(
            b"GET",
            robots_url.encode("ascii"),
            [(b"User-Agent", UA_STRING.encode("ascii"))],
        )
        exchange.request_done([])
    return None
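
# --- Illustrative sketch, not the project's actual helpers: one way
# _load_checker() and _robot_check() could be written on top of the standard
# library's urllib.robotparser, keyed by origin as in the methods above. The
# names used (robot_checkers, UA_STRING, self.emit) follow the code above;
# everything else is an assumption, and these would live on the same class
# as check_robots().
from urllib import robotparser

def _load_checker(self, origin: str, robots_txt: bytes) -> None:
    # Parse the fetched robots.txt and remember the checker for this origin.
    checker = robotparser.RobotFileParser()
    checker.parse(robots_txt.decode("ascii", "replace").splitlines())
    self.robot_checkers[origin] = checker

def _robot_check(self, url: str, checker: robotparser.RobotFileParser) -> None:
    # Evaluate the URL against the parsed rules and notify listeners.
    robot_ok = checker.can_fetch(UA_STRING, url)
    self.emit("robot", (url, robot_ok))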