def urlchecker(self, url):
    # Assumes: import urltools, urllib.robotparser; from urllib.parse import urlparse
    if url is None:
        return False
    normalized_url = urltools.normalize(url)
    robotparser = urllib.robotparser.RobotFileParser()
    try:
        url_comp = urlparse(normalized_url)
        base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
    except Exception:
        self.logger.error("Cannot parse: " + url)
        return False
    try:
        robotparser.set_url(base_url + "robots.txt")
        robotparser.read()
        if not robotparser.can_fetch("*", normalized_url):
            self.logger.error(url + " is excluded by the robots exclusion protocol")
            return False
    except Exception:
        self.logger.error("Cannot determine robots exclusion protocol: " + url)
    if normalized_url in self.visited_urls:
        self.logger.debug(url + " has been visited before")
        return False
    elif base_url in self.sites_times and self.sites_times[base_url] > int(self.limit):
        # self.logger.debug(url + " visits to this site have reached the limit")
        return False
    elif 'cgi' in normalized_url:
        return False
    else:
        return True
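# The method above assumes its object provides a logger, a set of already-visited
# URLs, a per-site visit counter, and a visit limit. Below is a minimal, hypothetical
# host class sketching those attributes; the attribute names come from the snippet,
# but the concrete types (set, dict, int) are assumptions inferred from how they are used.
import logging


class CrawlFilter:
    """Hypothetical host class for the urlchecker() method above."""

    def __init__(self, limit=10):
        self.logger = logging.getLogger(__name__)
        self.visited_urls = set()   # normalized URLs already crawled
        self.sites_times = {}       # base URL -> number of visits so far
        self.limit = limit          # maximum visits allowed per site

    # urlchecker(self, url) from the snippet above would be defined here.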
def validate_robots(self):
    """
    Checks the site's robots.txt file and verifies Disallow rules,
    adding two new keys to the corresponding city dict:
    - has_robotstxt (True/False): whether the site has a robots.txt file
    - can_crawling (True/False): whether crawling is allowed
    """
    print(f'{self.city["city_name"]}: Checking the robots.txt file.')
    try:
        self.city['timestamp'] = dt.timestamp(dt.now())
        self.city['has_robotstxt'] = False
        self.city['can_crawling'] = True
        city_url = self.city['url']
        city_url_robots = city_url + 'robots.txt'
        robotstxt = requests.get(city_url_robots, timeout=30,
                                 headers={'user-agent': 'uscs/0.0.1'})
        if robotstxt.status_code == 200:
            self.city['has_robotstxt'] = True
            robotparser = urllib.robotparser.RobotFileParser()
            robotparser.set_url(city_url_robots)
            robotparser.read()
            if not robotparser.can_fetch('*', city_url):
                self.city['can_crawling'] = False
                print(f'{self.city["city_name"]}: No permission to crawl.')
        if self.city['can_crawling']:
            self.sourcecode = self.get_sourcecode()
            self.validate_recommendations()
    except requests.exceptions.RequestException as error:
        print(dt.timestamp(dt.now()), self.city['_id'], self.city["city_name"], error)
def validate_robots(self):
    """
    Checks the site's robots.txt file and verifies Disallow rules,
    adding two new keys to the corresponding city dict:
    - has_robotstxt (True/False): whether the site has a robots.txt file
    - can_crawling (True/False): whether crawling is allowed
    """
    try:
        self.city['has_robotstxt'] = False
        self.city['can_crawling'] = False
        city_url = self.city['url']
        city_url_robots = city_url + 'robots.txt'
        robotstxt = requests.get(city_url_robots, timeout=30)
        if robotstxt.status_code == 404:
            self.city['can_crawling'] = False
        if robotstxt.status_code == 200:
            self.city['has_robotstxt'] = True
            robotparser = urllib.robotparser.RobotFileParser()
            robotparser.set_url(city_url_robots)
            robotparser.read()
            if robotparser.can_fetch('*', city_url):
                self.city['can_crawling'] = True
                self.validate_recommendations()
    except requests.exceptions.RequestException as error:
        print(dt.timestamp(dt.now()), self.city['city_name'], error)
def get_can_fetch(cls, node_url):
    user_agent = cls.get_user_agent()
    parsed_url = urlparse(node_url)
    robot_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    robotparser = urllib.robotparser.RobotFileParser()
    robotparser.set_url(robot_url)
    robotparser.read()
    return robotparser.can_fetch(user_agent, node_url)
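# get_can_fetch() takes cls as its first argument and calls cls.get_user_agent(),
# so it is presumably a classmethod on a class that knows its crawler's identity.
# A minimal, hypothetical wrapper is sketched below; the class name and the
# user-agent string are assumptions, and the method body mirrors the snippet above.
import urllib.robotparser
from urllib.parse import urlparse


class RobotsPolicy:
    @classmethod
    def get_user_agent(cls):
        # Assumed value; substitute the crawler's real user-agent string.
        return "example-bot/1.0"

    @classmethod
    def get_can_fetch(cls, node_url):
        user_agent = cls.get_user_agent()
        parsed_url = urlparse(node_url)
        robot_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        robotparser = urllib.robotparser.RobotFileParser()
        robotparser.set_url(robot_url)
        robotparser.read()
        return robotparser.can_fetch(user_agent, node_url)


# Usage sketch: RobotsPolicy.get_can_fetch("https://example.com/some/page")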
def checkRobotsForUrl(url):
    # Extract the base URL and point the parser at its robots.txt
    parsedURL = urllib.parse.urlparse(url)
    robotsURL = parsedURL.scheme + "://" + parsedURL.netloc + "/robots.txt"
    try:
        robotparser = urllib.robotparser.RobotFileParser()
        robotparser.set_url(robotsURL)
        robotparser.read()
        can_fetch = robotparser.can_fetch("*", url)
        return can_fetch
    except Exception:
        # If robots.txt cannot be read, default to allowing the fetch
        return True
import urllib.robotparser
from urllib.parse import urlparse

user_agent = 'unpackbot'
url = "https://buzzfeed.com/contests"

parsed_url = urlparse(url)
robot_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
print(url, robot_url)

robotparser = urllib.robotparser.RobotFileParser()
robotparser.set_url(robot_url)
robotparser.read()
can_fetch = robotparser.can_fetch(user_agent, url)
print(can_fetch)
def check_robot(url, user_agent):
    robotparser = urllib.robotparser.RobotFileParser()
    robotparser.set_url(url + '/robots.txt')
    robotparser.read()
    can_fetch = robotparser.can_fetch(user_agent, url)
    print(can_fetch)
    return can_fetch
def check_robot(url, **kwargs):
    # Note: url is passed straight to set_url(), so the caller is expected to
    # supply the robots.txt address itself; the user agent is hard-coded.
    user_agent = 'python-requests/2.18.4 (Compatible; John Doe)'
    robotparser = urllib.robotparser.RobotFileParser()
    robotparser.set_url(url)
    robotparser.read()
    return robotparser.can_fetch(user_agent, url)
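# The two check_robot() variants differ in what they expect as url: the first
# appends '/robots.txt' to a site root and takes the user agent as a parameter,
# while the second hands url straight to set_url() and hard-codes the user agent.
# Hedged usage sketch (example.com and the bot name are placeholders, and each
# call refers to its own definition above):

# Variant with (url, user_agent): pass the site root.
print(check_robot("https://example.com", "example-bot/1.0"))

# Variant with (url, **kwargs): pass the robots.txt URL directly.
print(check_robot("https://example.com/robots.txt"))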