import logging
from typing import Dict, Union
from urllib.robotparser import RobotFileParser

logger = logging.getLogger(__name__)


def _get_request_delay(
    host: str,
    url: str,
    robots_parser: RobotFileParser,
    delay_mapping: Dict[str, Union[int, float]],
    default_delay: Union[int, float],
) -> Union[int, float]:
    # Prefer an explicit Crawl-delay directive from robots.txt.
    crawl_delay = robots_parser.crawl_delay('*')
    if crawl_delay is not None:
        delay_mapping[host] = crawl_delay
        logger.debug(
            'returning crawl delay value "%s" from robots.txt for url %s',
            crawl_delay, url)
        return crawl_delay

    # Otherwise derive a delay from a Request-rate directive
    # (seconds divided by requests gives seconds per request).
    request_rate = robots_parser.request_rate('*')
    if request_rate is not None:
        request_delay = request_rate.seconds / request_rate.requests
        delay_mapping[host] = request_delay
        logger.debug(
            'computing value "%s" from request delay info (%s/%s) from robots.txt for url %s',
            request_delay,
            request_rate.requests,
            request_rate.seconds,
            url,
        )
        return request_delay

    # Neither directive is present: fall back to the caller's default.
    delay_mapping[host] = default_delay
    logger.debug('returning default delay value "%s" for url %s',
                 default_delay, url)
    return default_delay
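# A usage sketch for _get_request_delay. The host, URL, and default
# delay below are illustrative assumptions, not values from the
# snippet above.
parser = RobotFileParser()
parser.set_url('https://example.com/robots.txt')
parser.read()
delays: Dict[str, Union[int, float]] = {}
delay = _get_request_delay('example.com', 'https://example.com/page',
                           parser, delays, default_delay=1.0)
# delay_mapping now caches the per-host result, so later lookups for
# the same host can skip re-reading robots.txt.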
from collections import deque
from typing import Optional, Tuple
from urllib.robotparser import RobotFileParser


class Website:

    def __init__(self, scheme: str, hostname: str):
        self.scheme = scheme
        self.hostname = hostname
        self.last_time = 0
        self._urls = set()
        self._queue = deque()
        # Fetch and parse this host's robots.txt once, up front.
        self._robot_parser = RobotFileParser()
        self._robot_parser.set_url("{}://{}/robots.txt".format(
            scheme, hostname))
        self._robot_parser.read()

    def can_fetch(self, user_agent: str, url: str) -> bool:
        return self._robot_parser.can_fetch(user_agent, url)

    def add_url(self, url: str, depth: int = 0):
        # Deduplicate: enqueue each URL at most once.
        if url not in self._urls:
            self._urls.add(url)
            self._queue.append((url, depth))

    def get_url(self) -> Tuple[str, int]:
        return self._queue.popleft()

    def crawl_delay(self, user_agent: str) -> Optional[int]:
        # The robots.txt delay is scaled by a factor of 300 here.
        delay = self._robot_parser.crawl_delay(user_agent)
        return delay * 300 if delay is not None else None

    def is_empty(self) -> bool:
        return len(self._queue) == 0
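# A minimal driving loop for the Website queue; example.com and the
# URLs are placeholders.
site = Website('https', 'example.com')
site.add_url('https://example.com/')
site.add_url('https://example.com/')  # duplicate, enqueued only once
while not site.is_empty():
    url, depth = site.get_url()
    if site.can_fetch('*', url):
        pass  # fetch and parse the page here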
from urllib.robotparser import RobotFileParser


def robot_rules(self, user_agent: str):
    # Parse this site's robots.txt and summarize its rules for the
    # given user agent.
    robot_parser = RobotFileParser(url=self.url_robots)
    robot_parser.read()
    return {
        'can_fetch': robot_parser.can_fetch(user_agent, self.url),
        'crawl_delay': robot_parser.crawl_delay(user_agent),
        'request_rate': robot_parser.request_rate(user_agent),
    }
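# robot_rules is a method excerpt: it expects the enclosing class to
# expose `url` and `url_robots`. A hypothetical minimal wrapper that
# supplies them could look like this (Site is an assumption, not part
# of the original code):
from urllib.parse import urlparse


class Site:
    def __init__(self, url: str):
        parts = urlparse(url)
        self.url = url
        self.url_robots = '{}://{}/robots.txt'.format(parts.scheme,
                                                      parts.netloc)

    robot_rules = robot_rules  # bind the function above as a method


rules = Site('https://example.com/page').robot_rules('*')
# rules['can_fetch'] is a bool; the other two entries may be None.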
import time
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

import requests


def robots_get(url, *args, **kwargs):
    # Build the robots.txt URL for this host and parse it.
    u = urlparse(url)
    robot_url = '{scm}://{loc}/robots.txt'.format(scm=u.scheme, loc=u.netloc)
    robot = RobotFileParser(robot_url)
    robot.read()
    # Check the rules against the caller's User-Agent header, if any.
    ua = kwargs.get('headers', dict()).get('User-Agent', '*')
    if not robot.can_fetch(ua, url):
        return 'Not Allowed By robots.txt'
    # Honour a declared Crawl-delay before issuing the request.
    delay = robot.crawl_delay(ua)
    if delay:
        time.sleep(delay)
    return requests.get(url, *args, **kwargs)
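# Usage sketch: robots_get behaves like requests.get, except that it
# returns a plain string when robots.txt forbids the fetch, so the
# caller has to check the result's type. example.com is a placeholder.
resp = robots_get('https://example.com/', headers={'User-Agent': 'mybot'})
if isinstance(resp, str):
    print(resp)  # 'Not Allowed By robots.txt'
else:
    print(resp.status_code)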
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser


def parse_robotstxt(url):
    """ Parse robots.txt """
    parsed = urlsplit(url)
    # Only http(s) URLs with a host component can have a robots.txt.
    if parsed.scheme not in ['http', 'https']:
        return False
    if parsed.netloc == '':
        return False
    robot = RobotFileParser()
    robot.set_url(parsed.scheme + "://" + parsed.netloc + "/robots.txt")
    robot.read()
    return dict(
        allowed=robot.can_fetch('*', url),
        rate=robot.request_rate('*'),
        delay=robot.crawl_delay('*'),
    )
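# Usage sketch (the URL is a placeholder); note the mixed return type:
# False for unusable URLs, a dict otherwise.
info = parse_robotstxt('https://example.com/some/page')
if info is False:
    print('not a fetchable http(s) URL')
else:
    print(info['allowed'], info['delay'], info['rate'])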
import random
from urllib.robotparser import RobotFileParser


def check_robots(robots_path):
    """
    Check robots.txt and obtain the request interval.

    Parameters
    ----------
    robots_path : String
        URL of the robots.txt file.

    Returns
    -------
    int
        Interval between article fetches, in seconds.
    """
    # If no path is given, return a reasonable random interval.
    if robots_path is None or robots_path == "":
        return random.randint(5, 10)
    rp = RobotFileParser()
    rp.set_url(robots_path)
    rp.read()
    # Fall back to a random interval when no Crawl-delay is declared.
    delay = rp.crawl_delay('*')
    if delay is None:
        delay = random.randint(5, 10)
    return delay
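# Usage sketch; the URL is a placeholder.
import time

interval = check_robots('https://example.com/robots.txt')
time.sleep(interval)  # wait politely between article fetches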
from typing import Optional
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup


def __crawl_url(self, url) -> tuple[Optional[str], Optional[str], Optional[int]]:
    if url in self.blacklist:
        return (None, None, None)
    try:
        # Reuse the cached robots.txt parser for this host; fetch the
        # file only on first contact with the host.
        host = urlparse(url).hostname
        rp = self.robot_files.get(host, None)
        if rp is None:
            rp = RobotFileParser()
            rp.set_url("{}://{}/robots.txt".format(urlparse(url).scheme, host))
            rp.read()
            self.robot_files[host] = rp
        if not rp.can_fetch("*", url):
            if self.verbose and self.debug:
                self.thread_print(
                    "Not allowed to crawl, adding url to blacklist: {}".format(
                        url))
            self.blacklist.append(url)
            return (None, None, None)
        if not self.__is_html(url):
            if self.verbose and self.debug:
                self.thread_print(
                    "Not html document, adding url to blacklist: {}".format(
                        url))
            self.blacklist.append(url)
            return (None, None, None)
        content = requests.get(url, timeout=1)
        if self.verbose and self.debug:
            self.thread_print("Statuscode: {}".format(content.status_code))
        if content.status_code != 200:
            if self.verbose and self.debug:
                self.thread_print("Status not 200, returning")
            return (None, None, None)
    except Exception:
        if self.verbose and self.debug:
            self.thread_print(
                "An error happened, adding url to blacklist: {}".format(url))
        self.blacklist.append(url)
        return (None, None, None)
    # Extract outgoing links and push them onto the frontier.
    parsed_content = BeautifulSoup(content.text, 'html.parser')
    links = parsed_content.find_all('a')
    links = filter(lambda x: x.has_attr('href'), links)
    self.__add_links_to_frontier(links)
    title_obj = parsed_content.find('title')
    title = title_obj.string if title_obj is not None else url
    # Report the host's crawl delay (0 when none is declared).
    delay = rp.crawl_delay("*")
    return (title, url, delay if delay is not None else 0)
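# The per-host parser caching above generalizes into a small helper; a
# minimal sketch under the assumption that the caller owns the cache
# dict (get_robot_parser is hypothetical, not from the code above):
def get_robot_parser(url: str, cache: dict) -> RobotFileParser:
    parts = urlparse(url)
    rp = cache.get(parts.hostname)
    if rp is None:
        rp = RobotFileParser()
        rp.set_url('{}://{}/robots.txt'.format(parts.scheme, parts.hostname))
        rp.read()
        cache[parts.hostname] = rp
    return rp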