def isDirectory(self, path):
    """Return whether ``path`` names a directory.

    A path with a trailing slash is taken to be a directory outright;
    otherwise a HEAD request is issued and a redirect to the same path
    with a slash appended is treated as the directory indicator.
    """
    if path.endswith("/"):
        return True
    # Squid's FTP listing pages already append the trailing slash to
    # every directory URI, so no probe is needed for the ftp scheme.
    if self.scheme == 'ftp':
        return False
    self.log.debug("Checking if %s is a directory" % path)
    try:
        response = self.request("HEAD", path)
    except (TimeoutError, requests.RequestException) as exc:
        raise HTTPWalkerError(str(exc))
    if not response.is_redirect or "location" not in response.headers:
        return False
    target = response.headers["location"]
    scheme, netloc, redirect_path, _, _ = urlsplit(
        target, self.scheme, self.FRAGMENTS)
    # The redirect only counts if it stays on the same scheme and host
    # and points at this very path with a slash appended.
    if scheme and scheme != self.scheme:
        return False
    if netloc and netloc != self.full_netloc:
        return False
    return redirect_path == as_dir(path)
def __init__(self, base, log_parent=None):
    """Set up the walker for ``base``.

    Splits the base URL into its components, validates the scheme
    against URL_SCHEMES, and unpacks any ``user:passwd@host``
    credentials embedded in the network location.
    """
    self.log = log.get_logger(type(self).__name__, log_parent)
    self.base = base
    scheme, netloc, path, query, fragment = urlsplit(
        base, self.URL_SCHEMES[0], self.FRAGMENTS)
    if scheme not in self.URL_SCHEMES:
        raise WalkerError("Can't handle %s scheme" % scheme)
    self.scheme = scheme
    self.full_netloc = netloc
    # netloc may look like "user:passwd@host", "user@host" or "host".
    credentials, at_sign, host_part = netloc.partition("@")
    if at_sign:
        self.host = unquote_plus(host_part)
        user, colon, passwd = credentials.partition(":")
        self.user = unquote_plus(user)
        self.passwd = unquote_plus(passwd) if colon else None
    else:
        self.host = unquote_plus(netloc)
        self.user = None
        self.passwd = None
    self.query = query
    self.fragment = fragment
    self.path = as_dir(path)
def list(self, dirname):
    """Fetch the HTML index for ``dirname`` and scrape it for URLs.

    Returns a (dirnames, filenames) pair of sorted lists: directory
    names are links ending with / (or that redirect to themselves
    with / appended); everything else under the path is a filename.
    """
    self.log.info("Listing %s" % dirname)
    try:
        response = self.request("GET", dirname)
        try:
            soup = BeautifulSoup(response.read())
        finally:
            response.close()
    except (IOError, socket.error) as exc:
        raise HTTPWalkerError(str(exc))
    base = URI(self.base).resolve(dirname)
    # Gather every linked URL that lies strictly below the base URL.
    candidates = set()
    for anchor in soup("a"):
        href = anchor.get("href")
        if href is None:
            continue
        try:
            resolved = base.resolve(href)
        except InvalidURIError:
            continue
        if base.contains(resolved) and not resolved.contains(base):
            candidates.add(resolved)
    dirnames = set()
    filenames = set()
    for candidate in candidates:
        if candidate.path.endswith((';type=a', ';type=i')):
            # These links come from Squid's FTP dir listing to force
            # either ASCII or binary download and can be ignored.
            continue
        name = subdir(base.path, candidate.path)
        if self.isDirectory(candidate.path):
            dirnames.add(as_dir(name))
        else:
            filenames.add(name)
    return (sorted(dirnames), sorted(filenames))
def isDirectory(self, path):
    """Return whether the path is a directory.

    Assumes any path ending in a slash is a directory, and any that
    redirects to a location ending in a slash is also a directory.

    :raises HTTPWalkerError: if the HEAD request fails with an IO
        or socket error.
    """
    if path.endswith("/"):
        return True
    # If the URI scheme is FTP, then the URI comes from a Squid
    # FTP listing page, which includes the trailing slash on all
    # URIs that need it.
    if self.scheme == 'ftp':
        return False
    self.log.debug("Checking if %s is a directory" % path)
    try:
        self.request("HEAD", path)
        # A response without an HTTPError carries no redirect, so
        # the path is treated as a plain file.
        return False
    except urllib2.HTTPError as exc:
        if exc.code != 301:
            return False
        # Extract the redirect target inside the handler: the bound
        # exception name is unset once the except block ends
        # (PEP 3110), so reading exc.hdrs after the try statement
        # would raise NameError under Python 3 semantics.
        url = exc.hdrs.getheader("location")
    except (IOError, socket.error) as exc:
        # Raise HTTPWalkerError for other IO or socket errors.
        raise HTTPWalkerError(str(exc))
    # We only reach here on a 301 redirect; the path is a directory
    # when the redirect points at the same path with a slash appended.
    (scheme, netloc, redirect_path, query, fragment) \
        = urlsplit(url, self.scheme, self.FRAGMENTS)
    if len(scheme) and scheme != self.scheme:
        return False
    elif len(netloc) and netloc != self.full_netloc:
        return False
    elif redirect_path != as_dir(path):
        return False
    else:
        return True
def walk(self):
    """Walk through the URL.

    Yields (dirpath, dirnames, filenames) for each path under the
    base; dirnames can be modified as with os.walk.
    """
    try:
        self.open()
    except (IOError, socket.error) as e:
        self.log.info("Could not connect to %s" % self.base)
        self.log.info("Failure: %s" % e)
        return
    # Breadth-first traversal starting from the base path.
    pending = [self.path]
    while pending:
        current = pending.pop(0)
        try:
            dirnames, filenames = self.list(current)
        except WalkerError:
            self.log.info('could not retrieve directory '
                          'listing for %s', current)
            continue
        except UnicodeEncodeError:
            # This page is unparsable.
            # XXX sinzui 2009-06-22 bug=70524:
            # This problem should be reported to the project drivers
            # so that they can attempt to get this fixed.
            self.log.info(
                "Unicode error parsing %s page '%s'"
                % (self.base, current))
            continue
        yield (current, dirnames, filenames)
        # The consumer may have pruned dirnames in place, as os.walk
        # allows; queue whatever remains.
        pending.extend(
            urljoin(current, as_dir(name)) for name in dirnames)
    self.close()
def walk(self):
    """Walk through the URL.

    Yields (dirpath, dirnames, filenames) for each path under the
    base; dirnames can be modified as with os.walk.
    """
    try:
        self.open()
    except (IOError, socket.error) as err:
        self.log.info("Could not connect to %s" % self.base)
        self.log.info("Failure: %s" % err)
        return
    # Process directories in FIFO order so the tree is visited
    # level by level, starting from the base path.
    queue = [self.path]
    while len(queue):
        directory = queue.pop(0)
        try:
            listing = self.list(directory)
        except WalkerError:
            self.log.info('could not retrieve directory '
                          'listing for %s', directory)
            continue
        except UnicodeEncodeError:
            # This page is unparsable.
            # XXX sinzui 2009-06-22 bug=70524:
            # This problem should be reported to the project drivers
            # so that they can attempt to get this fixed.
            self.log.info("Unicode error parsing %s page '%s'"
                          % (self.base, directory))
            continue
        dirnames, filenames = listing
        yield (directory, dirnames, filenames)
        # Callers may prune dirnames in place before we resume here.
        for child in dirnames:
            queue.append(urljoin(directory, as_dir(child)))
    self.close()