import requests
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

robots_cache = {}  # per-host cache of parsed robots.txt files

def accessible(url):
    u = urlparse(url)
    if u.netloc not in robots_cache:
        resp = requests.get('http://%s/robots.txt' % u.netloc)
        rp = RobotFileParser()
        # use resp.text (str), not resp.content (bytes): parse() expects text lines
        rp.parse(resp.text.splitlines())
        robots_cache[u.netloc] = rp
    return robots_cache[u.netloc].can_fetch('*', url)
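
# A minimal usage sketch for accessible() above; the URL is illustrative,
# not from the original source.
if accessible('http://example.com/some/page.html'):
    page = requests.get('http://example.com/some/page.html')
else:
    print('blocked by robots.txt')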

def parse_robots(self, netloc, content):
    """
    Parse the given robots.txt content and store against the given
    domain. If content is None, any URL will be allowed.
    """
    robot = RobotFileParser()
    if content is not None:
        robot.parse(content.split("\n"))
    else:
        # Under Python 3, a RobotFileParser that has never parsed anything
        # denies every URL, so explicitly allow everything to match the docstring.
        robot.allow_all = True
    self._robots[netloc] = robot
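
# A standalone sketch of the pitfall the comment above guards against,
# assuming the urllib.robotparser import from the first example.
rp = RobotFileParser()
print(rp.can_fetch('*', 'http://example.com/'))  # False: nothing parsed yet
rp.allow_all = True
print(rp.can_fetch('*', 'http://example.com/'))  # True: explicit allow-everything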

def parse_robots(self, netloc, content):
    """
    Parse the given robots.txt content and store against the given
    domain. If content is None, any URL will be allowed.
    """
    robot = RobotFileParser()
    if content is not None:
        robot.parse(content.split("\n"))
    else:
        robot.allow_all = True  # see the note above: unparsed parsers deny all
    # dumps() serialises the parser (presumably pickle.dumps) so the
    # per-domain rules survive in the database.
    self.execute("UPDATE domain SET robots=? WHERE netloc=?", dumps(robot), netloc)

def robots_precheck(self, url):
    """
    If we have the robots.txt file available, check it to see if the
    request is permissible.

    This does not fetch robots.txt.
    """
    fetcher = RedFetcher(url)
    robots_txt = fetcher.fetch_robots_txt(url, lambda a: a, network=False)
    if robots_txt == "":
        return True
    checker = RobotFileParser()
    checker.parse(robots_txt.splitlines())
    return checker.can_fetch(UA_STRING, url)

robots = {}  # cache of RobotFileParser objects, keyed by robots.txt URL

def getRobots(url):
    parsed = urlparse(url)
    robots_url = parsed.scheme + '://' + parsed.netloc + '/robots.txt'
    if robots_url not in robots:
        rp = RobotFileParser()
        try:
            r = requests.get(robots_url, verify=False, timeout=1)
            r.raise_for_status()
        except Exception:
            rp.parse([])  # unreachable robots.txt: no rules, everything allowed
        else:
            # parse() wants an iterable of lines, not one big string
            rp.parse(r.text.splitlines())
        robots[robots_url] = rp
    return robots[robots_url]
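
# A short usage sketch; the URL and user-agent string are illustrative,
# and the requests import from the first example is assumed. Repeated
# checks against the same host cost at most one HTTP request, since the
# parser is cached per robots.txt URL.
url = 'https://example.com/products/1'
if getRobots(url).can_fetch('my-crawler/0.1', url):
    page = requests.get(url, timeout=5)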

def crawl(self, seed_url, max_urls=30, max_depth=1, obey_robots=False,
          max_size=1000000, force_html=True, **kwargs):
    """Crawl website html and return list of URLs crawled

    seed_url: url to start crawling from
    max_urls: maximum number of URLs to crawl (use None for no limit)
    max_depth: maximum depth to follow links into website (use None for no limit)
    obey_robots: whether to obey robots.txt
    max_size is passed to get() and is limited to 1MB by default
    force_html is passed to get() and is set to True by default so only HTML content is crawled
    **kwargs is passed to get()
    """
    user_agent = kwargs.get("user_agent", self.user_agent)
    server = "http://" + extract_domain(seed_url)
    robots = RobotFileParser()
    if obey_robots:
        # load robots.txt
        robots.parse(self.get(server + "/robots.txt").splitlines())
    else:
        robots.allow_all = True  # unparsed parsers deny all under Python 3
    outstanding = [(seed_url, 0), (server, 0)]  # URLs that still need crawling
    crawled = []  # URLs that have been crawled
    link_re = re.compile("<a[^>]+href=[\"'](.*?)[\"']", re.IGNORECASE)
    while outstanding:  # more URLs to crawl
        if len(crawled) == max_urls:
            break
        url, cur_depth = outstanding.pop(0)
        if url not in crawled:
            html = self.get(url, max_size=max_size, force_html=force_html, **kwargs)
            crawled.append(url)
            if max_depth is None or cur_depth < max_depth:
                # continue crawling
                for scraped_url in link_re.findall(html):
                    if "#" in scraped_url:
                        # remove fragments to prevent duplicates
                        scraped_url = scraped_url[:scraped_url.index("#")]
                    # resolve relative links before the robots check, since
                    # can_fetch() expects an absolute URL
                    scraped_url = urljoin(server, scraped_url)
                    if (os.path.splitext(scraped_url)[-1].lower() not in Download.IGNORED_EXTENSIONS
                            and robots.can_fetch(user_agent, scraped_url)):
                        # check if same domain or sub-domain
                        this_server = extract_domain(scraped_url)
                        if this_server and (this_server in server or server in this_server):
                            outstanding.append((scraped_url, cur_depth + 1))
    return crawled
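
# A hedged usage sketch: the method above evidently lives on the class that
# defines get() and IGNORED_EXTENSIONS (referenced as Download); the
# constructor arguments and seed URL below are guesses for illustration.
d = Download()
for u in d.crawl('http://example.com', max_urls=10, max_depth=2, obey_robots=True):
    print(u)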

def run_continue(self, robots_txt):
    """
    Continue after getting the robots file.
    TODO: refactor callback style into events.
    """
    if robots_txt == "":  # empty or non-200
        pass
    else:
        checker = RobotFileParser()
        # normalise to ASCII before parsing
        checker.parse(
            robots_txt.decode('ascii', 'replace').encode('ascii', 'replace').splitlines()
        )
        if not checker.can_fetch(UA_STRING, self.request.uri):
            self.response.http_error = RobotsTxtError()
            self.finish_task()
            return  # TODO: show error?
    if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
        self.request.headers.append((u"User-Agent", UA_STRING))
    self.exchange = self.client.exchange()
    self.exchange.on('response_start', self._response_start)
    self.exchange.on('response_body', self._response_body)
    self.exchange.on('response_done', self._response_done)
    self.exchange.on('error', self._response_error)
    if self.status_cb and self.name:
        self.status_cb("fetching %s (%s)" % (self.request.uri, self.name))
    req_hdrs = [
        (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
        for (k, v) in self.request.headers
    ]
    self.exchange.request_start(self.request.method, self.request.uri, req_hdrs)
    self.request.start_time = thor.time()
    if self.request.payload is not None:
        self.exchange.request_body(self.request.payload)
        self.transfer_out += len(self.request.payload)
    self.exchange.request_done([])

def knock(self, user_agent, url, override, retries=0, debug_force_status=None):
    """
    Makes a request for '/robots.txt' and returns True if 'user_agent'
    can fetch 'url'; returns False otherwise.

    If we get an HTTP response code other than '200', or any request
    error occurs, this function will return True.

    If we get a gaierror (DNS lookup error), this function will return
    False, as everything else is doomed to fail.

    If 'override' is True, this function will automatically return True.
    The default value for 'override' is False.
    """
    if override:
        return True
    host = net.urlparse(url)[1]
    robot = RobotFileParser()
    clearance = False
    if retries > 0:
        time_mod.sleep(self.crawl_delay)
    try:
        # We try to get the resource /robots.txt
        connection = net.HTTPConnection(host, 80)
        connection.request(self.GET, "/robots.txt", None, {"User-Agent": user_agent})
        response = connection.getresponse()
        # decode so the line checks below also work under Python 3
        robot_lines = response.read().decode("utf-8", "replace").splitlines()
        connection.close()
        if debug_force_status:
            response.status = debug_force_status
        if response.status == 200 and any(robot_lines):  # at least one non-empty line
            # If everything went well, we feed the content of the resource to the parser
            robot.parse(robot_lines)
            # And resolve whether we have clearance to fetch the url
            clearance = robot.can_fetch(user_agent, url)
            # We try to honour the Crawl-delay directive, if it exists
            try:
                self.crawl_delay = int(
                    "".join(
                        directive for directive in robot_lines
                        if directive.lower().startswith("crawl-delay")
                    ).split(":")[1]
                )
            except IndexError:
                # If no 'Crawl-delay' is specified, we leave it at 1 second
                pass
        elif response.status in [408, 500, 503]:
            if retries < 3:
                try:
                    time_mod.sleep(self.current_headers["retry-after"] - self.crawl_delay)
                except (KeyError, TypeError):
                    pass
                clearance = self.knock(user_agent, url, False, retries + 1)
            else:
                clearance = True
        else:
            clearance = True
        if retries < 1:
            time_mod.sleep(self.crawl_delay)
        return clearance
    except net.HTTPException:
        # A request error occurred. We retry the request; if it fails again
        # we just ignore /robots.txt and proceed
        if retries < 3:
            return self.knock(user_agent, url, False, retries + 1)
        return True
    except net.timeout:
        # Request timed out. We retry the request; if it fails again we
        # just ignore /robots.txt and proceed
        if retries < 3:
            return self.knock(user_agent, url, False, retries + 1)
        return True
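
# A standalone sketch of the Crawl-delay extraction knock() performs,
# using made-up robots.txt lines.
robot_lines = ["User-agent: *", "Crawl-delay: 3", "Disallow: /private"]
try:
    delay = int("".join(l for l in robot_lines
                        if l.lower().startswith("crawl-delay")).split(":")[1])
except IndexError:
    delay = 1  # knock() defaults to a one-second delay
print(delay)  # 3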