def _glob_find(self, path, processor, include_toplevel):
    '''Expand the first glob in ``path`` and yield the processed matches.

    The directory in front of the first glob element is listed and each
    child is matched against the path up to and including that element.
    What happens with a match depends on the remainder of the path:

    * the remainder still contains globs: recurse, with the match
      substituted for the glob that was just resolved;
    * the remainder is literal (no globs): it is resolved below the match
      and the match is skipped when that path does not exist;
    * there is no remainder: the match itself is the result.

    A result that is a directory is reported through its children
    (preceded by the directory itself when ``include_toplevel`` is set);
    any other result is reported as itself.

    :param path: Path that contains at least one glob element. If no
                 element contains glob characters the lookup of the first
                 glob fails with an UnboundLocalError, so callers are
                 expected to check ``glob.has_magic(path)`` first.
    :param processor: Callable invoked as processor(full_path, node); its
                      return value is what this generator yields.
    :param include_toplevel: When true, a resulting directory is passed to
                             the processor before its children.
    '''
    def emit(result_path, result_node):
        # Report one final result. A directory is reported through its
        # children; this is what the hadoop client does at least.
        if not self._is_dir(result_node):
            yield processor(result_path, result_node)
            return
        if include_toplevel:
            yield processor(result_path, result_node)
        dir_list = self._get_dir_listing(result_path)
        # The directory might have been removed since it was found
        if dir_list:
            for child in dir_list.dirList.partialListing:
                yield processor(self._get_full_path(result_path, child), child)

    # Split path elements and check where the first occurrence of magic is
    path_elements = path.split("/")
    for i, element in enumerate(path_elements):
        if glob.has_magic(element):
            first_magic = i
            break

    # Create the path that is listed to obtain the candidates. If the 2nd
    # path element is the glob we need to list "/", which is hardcoded
    # since "/".join(['']) doesn't return "/"
    if first_magic == 1:
        check_path = "/"
    else:
        check_path = "/".join(path_elements[:first_magic])

    # Path that the candidates are matched against
    match_path = "/".join(path_elements[:first_magic + 1])

    # Rest of the unmatched path, without a leading "/": the separator is
    # added when the rest is appended to a match, so prepending one here
    # would produce "//" in the resulting path
    rest = "/".join(path_elements[first_magic + 1:])

    # Check that the path exists and that it's a directory (which it should..)
    fileinfo = self._get_file_info(check_path)
    if not (fileinfo and self._is_dir(fileinfo.fs)):
        return

    # List all child nodes and match them against the glob
    listing = self._get_dir_listing(check_path)
    for node in listing.dirList.partialListing:
        full_path = self._get_full_path(check_path, node)
        if not fnmatch.fnmatch(full_path, match_path):
            continue

        if not rest:
            # The glob was the last element: the match is the result
            results = emit(full_path, node)
        elif glob.has_magic(rest):
            # We have a match, but need to go deeper
            traverse_path = "/".join([full_path, rest])
            results = self._glob_find(traverse_path, processor, include_toplevel)
        else:
            # Literal remainder: the result is whatever lives at that path
            # below the match, if anything
            literal_path = "/".join([full_path, rest])
            literal_info = self._get_file_info(literal_path)
            if not literal_info:
                continue
            results = emit(literal_path, literal_info.fs)

        for item in results:
            yield item
def _find_items(self, paths, processor, include_toplevel=False, include_children=False, recurse=False, check_nonexistence=False):
    ''' Request file info from the NameNode and call the processor on the node(s) returned

    This is a generator: it yields whatever the processor returns for
    each node that is processed.

    :param paths:
        A list of paths that need to be processed. When empty, "/user/<name>"
        of the user running the process is used. Paths that don't start
        with "/" are passed through self._join_user_path first.
    :param processor:
        Method that is called on an node. Method signature should be foo(path, node). For additional
        (static) params, use a lambda.
    :param include_toplevel:
        Boolean to enable the inclusion of the first node found.
        Example: listing a directory should not include the toplevel, but chmod should
        only operate on the path that is input, so it should include the toplevel.
    :param include_children:
        Include children (when the path is a directory) in processing. Recurse will always
        include children.
        Example: listing a directory should include children, but chmod shouldn't.
    :param recurse:
        Recurse into children if they are directories.
    :param check_nonexistence:
        When true, a path that doesn't exist is not an error: the processor
        is called as processor(path, None) for it and processing continues
        with the next path.
    :raises FileNotFoundException:
        When a path without globs doesn't exist and check_nonexistence is false.
    '''
    if not paths:
        paths = [os.path.join("/user", pwd.getpwuid(os.getuid())[0])]

    # Expand paths if necessary (/foo/{bar,baz} --> ['/foo/bar', '/foo/baz'])
    paths = glob.expand_paths(paths)

    for path in paths:
        if not path.startswith("/"):
            path = self._join_user_path(path)

        log.debug("Trying to find path %s" % path)

        if glob.has_magic(path):
            log.debug("Dealing with globs in %s" % path)
            for item in self._glob_find(path, processor, include_toplevel):
                yield item
            continue

        fileinfo = self._get_file_info(path)
        if not fileinfo:
            if not check_nonexistence:
                raise FileNotFoundException("`%s': No such file or directory" % path)
            yield processor(path, None)
            # Move on to the next path; returning here would end the
            # generator and silently drop all remaining paths
            continue

        is_directory = self._is_dir(fileinfo.fs)

        if include_toplevel or not is_directory:
            # Construct the full path before processing
            full_path = self._get_full_path(path, fileinfo.fs)
            log.debug("Added %s to to result set" % full_path)
            yield processor(full_path, fileinfo.fs)

        if is_directory and (include_children or recurse):
            for node in self._get_dir_listing(path):
                full_path = self._get_full_path(path, node)
                yield processor(full_path, node)

                # Recurse into directories
                if recurse and self._is_dir(node):
                    # Construct the full path before processing
                    full_path = os.path.join(path, node.path)
                    for item in self._find_items([full_path], processor,
                                                 include_toplevel=False,
                                                 include_children=False,
                                                 recurse=recurse):
                        yield item