def test_parse(self):
    from robotparser import RobotFileParser
    rules = RobotFileParser()
    rules.set_url("http://www.sogou.com/robots.txt")
    rules.read()
    self.assertEqual(
        rules.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt"),
        False)
def parse_robots(self, netloc, content):
    """
    Parse the given robots.txt content and store against the given domain.
    If content is None, any URL will be allowed.
    """
    robot = RobotFileParser()
    if content is not None:
        robot.parse(content.split("\n"))
    self._robots[netloc] = robot
def accessible(url):
    u = urlparse(url)
    if u.netloc not in robots_cache:
        resp = requests.get('http://%s/robots.txt' % u.netloc)
        rp = RobotFileParser()
        rp.parse(resp.content.splitlines())
        robots_cache[u.netloc] = rp
    return robots_cache[u.netloc].can_fetch('*', url)
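# Hedged usage sketch for accessible() above (not part of the original
# snippet): it assumes a module-level robots_cache dict plus the requests /
# urlparse / RobotFileParser imports shown here; the example URL is made up.
import requests
from urlparse import urlparse
from robotparser import RobotFileParser

robots_cache = {}

if accessible('http://example.com/some/page'):
    print 'allowed by robots.txt'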
def __init__(self, url):
    self.page_url = url
    self.parsed_url = urlparse.urlparse(url)
    self.lang = ""
    self.isDownload = False
    self.title = ""
    self.text = ""
    self.soup = None
    self.robot = RobotFileParser()
def parse_robots(self, netloc, content):
    """
    Parse the given robots.txt content and store against the given domain.
    If content is None, any URL will be allowed.
    """
    robot = RobotFileParser()
    if content is not None:
        robot.parse(content.split("\n"))
    self.execute("UPDATE domain SET robots=? WHERE netloc=?", dumps(robot), netloc)
def _get_robot_parser(self):
    if self.robot_parser_pickle is not None:
        return pickle.loads(base64.b64decode(self.robot_parser_pickle))
    else:
        parser = RobotFileParser()
        parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
        self.robot_parser = parser
        return parser
def get_robots(url):
    '''
    Initialize robots parser for this domain
    :param url:
    :return:
    '''
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
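# Hedged usage sketch for get_robots() above; the example URLs and the '*'
# user agent are assumptions for illustration only.
rp = get_robots('http://example.com/')
if rp.can_fetch('*', 'http://example.com/private/page.html'):
    pass  # robots.txt allows downloading this page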
def http_open(self, request):
    # request -- urllib2.Request
    url = request.get_full_url()
    host = urlsplit(url)[1]
    robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    rp.read()
    if not rp.can_fetch(self.agentname, url):
        raise RuntimeError('Forbidden by robots.txt')
    return urllib2.HTTPHandler.http_open(self, request)
def checkRobots(URL):
    time.sleep(1)
    parsed = urlparse(URL)
    robotsUrl = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    robotParser = RobotFileParser()
    robotParser.set_url(robotsUrl)
    robotParser.read()
    result = robotParser.can_fetch("*", URL)
    return result
def can_fetch(self, url):
    host, path = urlparse.urlparse(url)[1:3]
    if self.rules.has_key(host):
        return self.rules[host].can_fetch(self.agent, url)
    else:
        rp = RobotFileParser()
        robot_url = "http://" + host + "/robots.txt"
        rp.set_url(robot_url)
        rp.read()
        self.rules[host] = rp
        return rp.can_fetch(self.agent, url)
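# Minimal context sketch (an assumption, not from the original snippet): the
# object that owns can_fetch() above is expected to carry a user-agent string
# and a per-host parser cache. The class name and default agent are made up.
class RobotsAwareFetcher(object):
    def __init__(self, agent='*'):
        self.agent = agent  # user agent checked against robots.txt rules
        self.rules = {}     # host -> RobotFileParser cache reused across calls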
def try_add_robot(self, url):
    parsed_url = urlparse(url)
    if parsed_url.netloc not in self.robots:
        try:
            robot_url = parsed_url.scheme + '://' + parsed_url.netloc + \
                '/robots.txt'
            rp = RobotFileParser(robot_url)
            rp.read()
            self.robots[parsed_url.netloc] = rp
        except IOError as e:
            print str(e)
        except Exception as e:
            print str(e)
def check_robots(self, url):
    '''check the robots.txt in this url's domain'''
    hostname = urlparse(url).netloc
    if hostname not in self.domain_list.keys():  # no records in domain_list
        rp = RobotFileParser('http://%s/robots.txt' % hostname)
        print("%s: fetching %s" % (url, rp.url))
        try:
            rp.read()  # get new robots.txt
        except IOError, e:  # url's server not available (connection timeout)
            log.error(str(e))
            rp.disallow_all = True  # reject all requests
        self.domain_list[hostname] = rp  # add domain entry into domain_list
def disallow(self, url):
    """ TO BE DONE """
    robotFile = urljoin(url, "/robots.txt")
    key = hashlib.sha1(robotFile).hexdigest()
    if not self._dict.has_key(key):
        self._dict[key] = RobotFileParser(robotFile)
        try:
            self._dict[key].read()
        except:
            self._dict[key] = None
    result = self._dict[key] is None or not self._dict[key].can_fetch(
        self._userAgent, url)
    return result
def __init__(self):
    self.rp = RobotFileParser()
    self.rp.set_url('https://www.timeanddate.com/robots.txt')
    self.rp.read()
    if not self.rp.can_fetch('WasThereAHoliday', init_url):
        raise RuntimeError('Scraping forbidden due to robots.txt file')
    self.countries = self.get_countries(self.get_page(init_url))
    try:
        # removing entries which are not countries
        self.countries.remove('un')
    except ValueError:
        pass
    try:
        # removing entries which are not countries
        self.countries.remove('world')
    except ValueError:
        pass
def link_crawler(seed_url, link_regex):
    import re
    from robotparser import RobotFileParser
    crawler_queue = [seed_url]
    seen = {}
    # user_agent and max_try are expected to be defined at module level
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawler_queue:
        url = crawler_queue.pop()
        html = download(url, now=1)
        for link in get_links(url):
            depth = seen.get(link, 1)
            if (re.match(link_regex, link) and link not in seen
                    and rp.can_fetch(user_agent, link) and depth != max_try):
                seen[link] = depth + 1
                link = urlparse.urljoin(seed_url, link)
                crawler_queue.append(link)
            else:
                seen[link] = depth
def can_read(url):
    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin('http://' + domain, 'robots.txt'))
        try:
            rp.read()
        except:
            return False
        Permissions[domain] = rp
    res = False
    try:
        res = Permissions[domain].can_fetch("*", url)
    except:
        return False
    return res
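# Sketch of the module-level names can_read() above relies on (illustrative
# assumptions): a Permissions cache dict and a domain_name() helper that
# extracts the host from a URL.
from urlparse import urlparse, urljoin
from robotparser import RobotFileParser

Permissions = {}

def domain_name(url):
    # assumed helper: return the network location (host[:port]) of the URL
    return urlparse(url).netloc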
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""
    full_path = BASE_URL + path
    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}
    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")
    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()
    return bs(request.text)
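# Hedged usage sketch for _get_soup() above: BASE_URL, __version__, get
# (requests.get) and bs (BeautifulSoup) must already exist in the module;
# the '/schedule' path is a hypothetical example.
soup = _get_soup('/schedule')
print soup.title  # the page's <title> tag, if present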
def __init__(self, starturl, index_html='', maxlevel=1,
             cookie_file=None, acldb=None, urldb=None, default_charset=None,
             delay=0, timeout=300, debug=0):
    (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
    # assert proto == 'http'
    # Thread.__init__(self)
    self.debug = debug
    self.index_html = index_html
    if cookie_file:
        self.cookiejar = MozillaCookieJar(cookie_file)
        self.cookiejar.load()
    else:
        self.cookiejar = None
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    try:
        self.robotstxt.read()
    except IOError:
        pass
    self.conn = None
    self.urldb = urldb
    self.acldb = acldb
    self.curlevel = 0
    self.delay = delay
    self.timeout = timeout
    self.default_charset = default_charset
    if starturl.endswith('/'):
        starturl += self.index_html
    self.urls = [(starturl, maxlevel)]
    self.crawled = {}  # 1:injected, 2:crawled
    return
def __init__(self, main_page=None, robotrules=True):
    """
    Constructor method that initializes the members that are used during the
    crawling process
    :param main_page: The root page that needs to be crawled for generation
                      of sitemap
    """
    logging.info("Consider robots.txt ? ==> " + str(robotrules))
    self.robotrules = robotrules
    self.site_map = {}  # map that records the visits of urls, datemodified and assets
    self.network = {}   # map that maintains the network/graph of webpages visited;
                        # the intention of this map is for visual rendering using d3.js
    self.unvisited = set([])  # a set to keep the list of urls yet to be visited
    self.start_page = None    # the root page, used to avoid cycles and to keep the
                              # crawl process limited to a single domain
    self.robot_txt_rules = None
    if main_page:
        self.unvisited.add(main_page)
        try:
            self.start_page = urlparse(main_page).netloc
        except:
            logging.error("Improper URL, Please provide a Valid Url:" + main_page)
            exit(0)
    if self.robotrules == "True":
        try:
            logging.info("robots.txt respected")
            self.robot_txt_rules = RobotFileParser()
            self.robot_txt_rules.set_url(main_page + "/robots.txt")
            self.robot_txt_rules.read()
        except:
            logging.error("Unable to read the robots.txt file")
            self.robotrules = False  # error reading robots.txt, ignore it forever
def parse_robots(self, robots_text):
    # parse() expects the robots.txt body as a sequence of lines;
    # read() would instead try to fetch a URL, so it is not used here
    self.robots_parser = RobotFileParser()
    self.robots_parser.parse(robots_text.splitlines())
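# Hedged example of feeding parse_robots() above: fetch the robots.txt body
# with urllib2 and pass the text in. The URL and the 'crawler' instance are
# assumptions for illustration.
import urllib2
robots_text = urllib2.urlopen('http://example.com/robots.txt').read()
crawler.parse_robots(robots_text)
if crawler.robots_parser.can_fetch('*', 'http://example.com/page'):
    pass  # allowed by robots.txt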
def _create_robot_file_parser(self, url):
    host = urlparse.urlsplit(url)[1]
    robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    rp.read()
    return rp
def __init__(self, link):
    self.CurLink = link
    self.r = RobotFileParser()
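# Hedged sketch of how the parser created in __init__ above might be used
# later; the method name, the urljoin import and the '*' user agent are
# assumptions, not part of the original class.
def can_visit(self):
    from urlparse import urljoin
    self.r.set_url(urljoin(self.CurLink, '/robots.txt'))
    self.r.read()
    return self.r.can_fetch('*', self.CurLink)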