class DFS(object):
    """Directory-bookkeeping helpers layered on an ``Hfilesystem`` client.

    Every remote operation is delegated to ``self.hfs``.  Marker files named
    with ``DONE_TAG``, ``FINISHED_TAG`` and ``STARTED_TAG`` (module-level
    constants) are used to classify sub-directories.

    NOTE(review): the sub-directory listers join ``dirname`` with each
    listing entry before querying the client, while the size methods pass
    listing entries to the client unchanged.  Which form is right depends on
    what ``Hfilesystem.listdir`` returns -- confirm against the client.
    """

    def __init__(self, hostname='speedy', port=8020):
        """Connect to ``hostname:port`` and remember the matching URL prefix."""
        self.hfs = Hfilesystem(hostname, port)
        # 'hdfs://host:port'; get_abs_subdirs() strips this many leading
        # characters from every listing entry.
        self.urlhead = 'hdfs://' + hostname + ':' + str(port)
        self.urlheadlen = len(self.urlhead)

    # ------------------------------------------------------------------
    # basic operations
    # ------------------------------------------------------------------

    def exists(self, pathname):
        """Return whatever the client reports for ``exists(pathname)``."""
        return self.hfs.exists(pathname)

    def mkdir(self, dirname):
        """Create ``dirname`` unless it already exists."""
        if not self.exists(dirname):
            self.hfs.mkdir(dirname)

    def rmdir(self, dirname):
        """Remove ``dirname`` if it exists; do nothing otherwise."""
        if self.exists(dirname):
            # NOTE(review): the previous code invoked ``self.hfs(dirname)``,
            # i.e. called the client object itself.  ``delete`` is assumed to
            # be the client's removal method -- confirm against Hfilesystem.
            self.hfs.delete(dirname)

    def rm_rf(self, d):
        """Remove ``d``; currently an alias for :meth:`rmdir`."""
        self.rmdir(d)

    def rename(self, srcpath, destpath):
        """Rename ``srcpath`` to ``destpath``, removing ``destpath`` first.

        The destination is checked through the client (``self.exists``), the
        same way every other method checks paths, rather than on the local
        filesystem.
        """
        if self.exists(destpath):
            self.rm_rf(destpath)
        self.hfs.rename(srcpath, destpath)

    def is_done(self, dirname):
        """Return whether ``dirname`` contains the ``DONE_TAG`` marker."""
        return self.exists(os.path.join(dirname, DONE_TAG))

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _dir_entries(self, dirname, checkdone):
        """Return the raw listing entries of ``dirname`` that are directories.

        When ``checkdone`` is true, only directories for which
        :meth:`is_done` holds are kept.  A missing ``dirname`` yields ``[]``.
        """
        if not self.exists(dirname):
            return []
        selected = []
        for entry in self.hfs.listdir(dirname):
            candidate = os.path.join(dirname, entry)
            if not self.hfs.isDir(candidate):
                continue
            if checkdone and not self.is_done(candidate):
                continue
            selected.append(entry)
        return selected

    def _lacks_markers(self, dirname, sdir, jobname, tags):
        """Return True when none of ``jobname + tag`` exists under the subdir."""
        return not any(
            self.exists(os.path.join(dirname, sdir, jobname + tag))
            for tag in tags
        )

    def _listing_size(self, dirname):
        """Sum the sizes of everything listed directly under ``dirname``.

        Files contribute their ``mSize``; any other entry is measured
        recursively through :meth:`get_dir_size`.
        """
        total = 0
        for node in self.hfs.listdir(dirname):
            if self.hfs.isFile(node):
                total += self.hfs.stat(node).mSize
            else:
                total += self.get_dir_size(node)
        return total

    # ------------------------------------------------------------------
    # sub-directory listings
    # ------------------------------------------------------------------

    def get_subdirs(self, dirname, checkdone=False):
        """Return the last path component of each sub-directory of ``dirname``.

        With ``checkdone`` true, only sub-directories holding ``DONE_TAG``
        are returned.
        """
        # rsplit keeps everything after the final '/', and returns the entry
        # unchanged when it contains no '/' at all.
        return [entry.rsplit('/', 1)[-1]
                for entry in self._dir_entries(dirname, checkdone)]

    def get_abs_subdirs(self, dirname, checkdone=False):
        """Return each sub-directory entry with the URL prefix stripped.

        The first ``self.urlheadlen`` characters of every listing entry are
        dropped.  With ``checkdone`` true, only sub-directories holding
        ``DONE_TAG`` are returned.
        """
        return [entry[self.urlheadlen:]
                for entry in self._dir_entries(dirname, checkdone)]

    def get_unfinished_subdirs(self, dirname, jobname='', checkdone=False):
        """Return sub-directory names lacking ``jobname + FINISHED_TAG``."""
        return [sdir for sdir in self.get_subdirs(dirname, checkdone)
                if self._lacks_markers(dirname, sdir, jobname,
                                       (FINISHED_TAG,))]

    def get_buffered_subdirs(self, dirname, jobname='', checkdone=False):
        """Return sub-directory names with neither a finished nor a started marker.

        ``checkdone`` is forwarded to :meth:`get_subdirs`.
        """
        return [sdir for sdir in self.get_subdirs(dirname, checkdone)
                if self._lacks_markers(dirname, sdir, jobname,
                                       (FINISHED_TAG, STARTED_TAG))]

    def get_unfinished_abs_subdirs(self, dirname, jobname='', checkdone=False):
        """Like :meth:`get_unfinished_subdirs`, built on :meth:`get_abs_subdirs`."""
        return [sdir for sdir in self.get_abs_subdirs(dirname, checkdone)
                if self._lacks_markers(dirname, sdir, jobname,
                                       (FINISHED_TAG,))]

    def get_buffered_abs_subdirs(self, dirname, jobname='', checkdone=False):
        """Like :meth:`get_buffered_subdirs`, built on :meth:`get_abs_subdirs`.

        Each result is ``os.path.join(dirname, sdir)``; ``checkdone`` is
        forwarded to :meth:`get_abs_subdirs`.
        """
        return [os.path.join(dirname, sdir)
                for sdir in self.get_abs_subdirs(dirname, checkdone)
                if self._lacks_markers(dirname, sdir, jobname,
                                       (FINISHED_TAG, STARTED_TAG))]

    # ------------------------------------------------------------------
    # file listings
    # ------------------------------------------------------------------

    def get_subfiles(self, dirname):
        """Return the listing entries of ``dirname`` that are files."""
        if not self.exists(dirname):
            return []
        return [sfile for sfile in self.hfs.listdir(dirname)
                if self.hfs.isFile(os.path.join(dirname, sfile))]

    def get_abs_subfiles(self, dirname):
        """Return :meth:`get_subfiles` entries joined onto ``dirname``."""
        return [os.path.join(dirname, sfile)
                for sfile in self.get_subfiles(dirname)]

    # ------------------------------------------------------------------
    # counts
    # ------------------------------------------------------------------

    def get_subdir_num(self, dirname):
        """Return the number of sub-directories of ``dirname``."""
        return len(self.get_subdirs(dirname))

    def get_unfinished_subdir_num(self, dirname, jobname='', checkdone=False):
        """Return ``len(get_unfinished_subdirs(dirname, jobname, checkdone))``."""
        return len(self.get_unfinished_subdirs(dirname, jobname, checkdone))

    def get_buffered_subdir_num(self, dirname, jobname='', checkdone=False):
        """Return ``len(get_buffered_subdirs(dirname, jobname, checkdone))``."""
        return len(self.get_buffered_subdirs(dirname, jobname, checkdone))

    def get_subfile_num(self, dirname):
        """Return the number of files listed directly under ``dirname``."""
        return len(self.get_subfiles(dirname))

    # ------------------------------------------------------------------
    # sizes
    # ------------------------------------------------------------------

    def get_dir_size(self, dirname):
        """Return the recursive total ``mSize`` under ``dirname`` (0 if missing)."""
        if not self.exists(dirname):
            return 0
        return self._listing_size(dirname)

    def get_unfinished_dir_size(self, dirname, jobname=''):
        """Return the size of ``dirname`` unless it is missing or finished.

        Yields 0 when ``dirname`` does not exist or already contains
        ``jobname + FINISHED_TAG``.
        """
        if not self.exists(dirname):
            return 0
        if self.exists(os.path.join(dirname, jobname + FINISHED_TAG)):
            return 0
        return self._listing_size(dirname)

    def get_buffered_dir_size(self, dirname, jobname=''):
        """Return the same value as :meth:`get_unfinished_dir_size`.

        NOTE(review): unlike the buffered sub-directory listers, this does
        not look at ``STARTED_TAG``; confirm whether that is intended.
        """
        return self.get_unfinished_dir_size(dirname, jobname)