def robots(url):
    a = {}
    rp = robotparser.RobotFileParser()
    rp.set_url(url)
    rp.read()  # read() returns None, so call it separately instead of storing its result
    a['url'] = url
    a['good crawler'] = rp.can_fetch('GoodCrawler', url)
    a['bad crawler'] = rp.can_fetch('Bad', url)
    a['Google'] = rp.can_fetch('Googlebot', url)
    print(a)
def get_robots_parser(robots_url): " Return the robots parser object using the robots_url " try: rp = robotparser.RobotFileParser() rp.set_url(robots_url) rp.read() return rp except Exception as e: return 'Error finding robots_url:', robots_url, e
def parse_robots(robots_url):
    print(f"robots url {robots_url}")
    try:
        rp = robotparser.RobotFileParser(robots_url)
        rp.read()
        return rp
    except Exception as e:
        print(f"robots parse error {e}")
def process_robot_txt(self):
    """Parse the robots.txt file and return True or False depending on
    whether we are allowed to crawl the site."""
    rp = robotparser.RobotFileParser()
    try:
        rp.set_url(self + '/robots.txt')  # assumes self behaves like a URL string
        rp.read()
    except (URLError, UnicodeEncodeError, UnicodeDecodeError):
        return True
    return rp.can_fetch('*', self)
def get_robots_parser(robots_url): """return the robots parser object using the robots_url""" try: rp = robotparser.RobotFileParser() rp.set_url(robots_url) rp.read() return rp except Exception as e: print("Error finding robots url:", robots_url, e)
def get(self, url):
    print('Downloading:', url)
    # Prepare the header for the initial request, registering the user agent
    # to use for the operation.
    headers = {'User-agent': self.user_agent}
    # robots.txt files are small, so rather than keep a local copy it is
    # fetched and parsed each time a page of the site is requested.
    rp = robotparser.RobotFileParser()  # initialize the robots.txt parser
def checkRobots(robots, url):
    try:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots)
        rp.read()
        return rp.can_fetch("*", url)
    except:
        # no robots? allows everything
        return True
def __init__(self, scheme, hostname):
    self.hostname = hostname
    self.iter = 0
    self.cnt_urls = {0: scheme + '://' + self.hostname}
    self.urls_cnt = {scheme + '://' + self.hostname: 0}
    self.free_numbers = set()
    self.timestamps = {0: None}
    self._rp = urobot.RobotFileParser()
    self._rp.set_url(scheme + '://' + hostname + '/robots.txt')
def get_robots(url):
    try:
        url = url + "/robots.txt"
        rp = robotparser.RobotFileParser()
        rp.set_url(url)
        rp.read()
        return rp
    except Exception as e:
        print(str(e))
        return None
def checkrobot(self, u):
    try:
        robUrl = u if u.find("/", MAX_PROTO_LEN) == -1 else u[:u.find("/", MAX_PROTO_LEN)]
        robUrl = robUrl + ROBOTS_TXT
        rob = rp.RobotFileParser()
        rob.set_url(robUrl)
        rob.read()  # without read() the parser has no rules and can_fetch() always denies
        return rob.can_fetch("*", u)
    except:
        return True
def polite(robotcheckers, url):
    host = urlparse(url).netloc
    try:
        rc = robotcheckers[host]
    except KeyError:
        # first time we see this host: fetch and cache its robots.txt parser
        rc = robotparser.RobotFileParser()
        rc.set_url('http://' + host + '/robots.txt')
        rc.read()
        robotcheckers[host] = rc
    return rc.can_fetch('*', url)
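# A minimal usage sketch for polite() above; the URLs are hypothetical, and the
# shared robotcheckers dict is assumed to live for the whole crawl so each
# host's robots.txt is fetched only once (urlparse/robotparser imports assumed
# at module level).
robotcheckers = {}
for link in ['http://example.com/index.html', 'http://example.com/private/page.html']:
    if polite(robotcheckers, link):
        print('allowed:', link)
    else:
        print('blocked by robots.txt:', link)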
def get_robots(url):
    rp = robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
    try:
        rp.read()
    except urllib.error.URLError as e:
        print('robot get error:', e.reason)
        rp = None
    return rp
def get_robots(url): """ 为该链接初始化robots :param url: :return: """ rp = robotparser.RobotFileParser() rp.set_url(parse.urljoin(url, '/robots.txt')) rp.read() return rp
def my_robot():
    import urllib.robotparser as robot
    par = robot.RobotFileParser()
    par.set_url('https://www.samsclub.com/robots.txt')
    par.read()  # reading the URL content
    print('~' * 20)
    print(par)
    print('~' * 20)
    # note: the first URL is malformed ('https:/' with a single slash)
    print(par.can_fetch('*', 'https:/www.samsclub.com/friend'))
    print(par.can_fetch('*', 'https://www.samsclub.com/friend'))
def _check_robots(url):
    """Check that our crawler satisfies robot exclusion standard"""
    try:
        robot_url = Robots.robots_url(url)
        parse = robotparser.RobotFileParser()
        parse.set_url(robot_url)
        parse.read()
        return parse.can_fetch('*', url)
    except:
        return True
def robotsAllowed(url):
    robotUrl = baseUrl(url) + "/robots.txt"
    if robotUrl in robotsDict:
        return robotsDict[robotUrl].can_fetch("*", url)
    else:
        rp = robotparser.RobotFileParser()
        rp.set_url(robotUrl)
        rp.read()
        robotsDict[robotUrl] = rp
        return rp.can_fetch("*", url)
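# robotsAllowed() above relies on two module-level names not shown in the
# snippet; a minimal sketch of what they might look like (assumptions for
# illustration, not the original definitions):
from urllib.parse import urlparse

robotsDict = {}  # cache: robots.txt URL -> RobotFileParser

def baseUrl(url):
    # reduce a page URL to scheme://host, e.g. 'http://example.com'
    parts = urlparse(url)
    return parts.scheme + "://" + parts.netloc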
def init_robot_parser(self):
    robparser = robotparser.RobotFileParser()
    robparser.set_url(self.base_url + "/robots.txt")
    try:
        robparser.read()
        return robparser
    except Exception as e:
        print(e)
        print("could not find robots.txt on url: " + self.base_url + "/robots.txt")
        exit(1)  # non-zero status to signal failure
def wrapper(*args, **kwargs):
    parser = robotparser.RobotFileParser(url=kwargs['url'])
    parser.read()
    if parser.can_fetch(agent_name, kwargs['url']):
        return func(*args, **kwargs)
    else:
        raise PermissionError(
            f'The robots.txt does not permit crawling of the site {kwargs["url"]}'
        )
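# wrapper() above closes over agent_name and func, so it is presumably the
# inner function of a decorator factory. A minimal sketch of how such a factory
# might look (the name respects_robots_txt is an assumption for illustration):
import functools
from urllib import robotparser

def respects_robots_txt(agent_name):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            parser = robotparser.RobotFileParser(url=kwargs['url'])
            parser.read()
            if parser.can_fetch(agent_name, kwargs['url']):
                return func(*args, **kwargs)
            raise PermissionError(
                f'The robots.txt does not permit crawling of the site {kwargs["url"]}'
            )
        return wrapper
    return decorator

# usage: the decorated function must be called with url as a keyword argument
@respects_robots_txt('MyCrawler')
def fetch(*, url):
    ...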
def init_robot_parser(host):
    try:
        robot_parser = robotparser.RobotFileParser()
        robot_parser.set_url(urljoin(host, 'robots.txt'))
        robot_parser.read()
        return robot_parser
    except Exception as e:
        log(WARNING, e)
        return None
def get_site_maps(url):
    robot_checkers = {}  # note: local dict, so the lookup below always misses on a fresh call
    host = urlparse(url).netloc
    try:
        rc = robot_checkers[host]
    except KeyError:
        rc = robotparser.RobotFileParser()
        rc.set_url('http://' + host + '/robots.txt')
        rc.read()
        robot_checkers[host] = rc
    return rc.site_maps()
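# site_maps() (Python 3.8+) returns the Sitemap URLs listed in robots.txt, or
# None when the file declares none; a small usage sketch with a hypothetical
# starting URL:
maps = get_site_maps('https://example.com/some/page.html')
if maps:
    for sitemap_url in maps:
        print('sitemap:', sitemap_url)
else:
    print('no sitemaps declared in robots.txt')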
def get_robots_parser(robots_url):
    '''
    args:
        robots_url (str): url of the website's robots.txt,
            e.g. http://www.a.com/robots.txt
    returns:
        rp (robotparser.RobotFileParser)
    '''
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp
def robot_check(url_parts):
    url = ''.join([str(p) for p in url_parts])
    rp = robotparser.RobotFileParser()
    rp.set_url(url)
    rp.read()
    if rp.can_fetch("*", url):
        print("Robots.txt: User Allowed")
        return True
    else:
        print("Robots.txt: User Disallowed. Please abort.")
        return False
def get_robots_parser(robots_url): """ Return robot parser object using robots url """ try: robot_parser = robotparser.RobotFileParser() robot_parser.set_url(robots_url) robot_parser.read() return robot_parser except Exception as e: print('Error finding robots url:', robots_url, e)
def robot():
    mrp = rp.RobotFileParser()
    mrp.set_url('https://www.tmall.com/robots.txt')
    mrp.read()
    # note: the checked URL is on a different host than the robots.txt that was
    # loaded; can_fetch() only matches the path part of the URL against the rules
    url = 'https://www.baidu.com'
    user_agent = 'BadCrawler'
    flag = mrp.can_fetch(user_agent, url)
    print(flag)
    user_agent = ''
    flag = mrp.can_fetch(user_agent, url)
    print(flag)
def robot_allow(url):
    # parse the current url down to the domain, then add '/robots.txt' so the
    # parser can check whether the path the url points to is allowed
    url_parsed = urlparse(url)
    robots_url = url_parsed.scheme + "://" + url_parsed.netloc + "/robots.txt"
    x = RFP.RobotFileParser()
    x.set_url(robots_url)
    x.read()
    return x.can_fetch(useragent, url)
def __init__(self, root_url: str, user_agent: str):
    self.__parser = robotparser.RobotFileParser()
    # Parse the URL
    url_parse = parse.urlparse(root_url)
    robotsPath = "%s://%s/robots.txt" % (url_parse.scheme, url_parse.netloc)
    self.__parser.set_url(robotsPath)
    self.__user_agent = user_agent
    self.__parser.read()
def get_robots(url): """Initialize robots parser for this domain """ rp = robotparser.RobotFileParser() rp.set_url(urljoin(url, '/robots.txt')) html_ = urlopen(urljoin(url, '/robots.txt')).read().decode( 'utf-8', errors='ignore').split('\n') rp.parse( html_ ) #rp.read()解析出错UnicodeDecodeError: 'utf-8' codec can't decode byte return rp
def canuse(baseurl, path):
    parser = urobot.RobotFileParser()
    parser.set_url(urljoin(baseurl, 'robots.txt'))
    parser.read()
    canParse = False
    if parser.can_fetch(AGENT_NAME, path):
        canParse = True
    if parser.can_fetch(AGENT_NAME, urljoin(baseurl, path)):
        canParse = True
    return canParse
def locate_rules(self, root_url):
    try:
        robots_url = urlunparse(
            (root_url.scheme, root_url.netloc, "robots.txt", "", "", ""))
        robots = robotparser.RobotFileParser()
        robots.set_url(robots_url)
        robots.read()
        self.crawler_rules[root_url.netloc] = robots
    except Exception as e:
        custom_logger().log_message("Exception in robots:\n" + str(e),
                                    logger_handler.log_level_ERROR)
        self.crawler_rules[root_url.netloc] = None
def addrobot(self, root):
    # note: urlparse here is the Python 2 module name (urllib.parse in Python 3)
    root = urlparse.urljoin(root, "/")
    if root in self.robots:
        return
    url = urlparse.urljoin(root, "/robots.txt")
    self.robots[root] = rp = robotparser.RobotFileParser()
    self.note(2, "Parsing %s", url)
    rp.debug = self.verbose > 3
    rp.set_url(url)
    try:
        rp.read()
    except (OSError, IOError) as msg:
        self.note(1, "I/O error parsing %s: %s", url, msg)