def test_filesystem(self):
    """Check exists(), rename() and delete() on an Hfilesystem connection.

    A file is first opened for writing at ``path`` and closed again; the
    assertions below expect that to leave an entry at ``path``.
    """
    handle = Hfile(hostname, port, path, mode='w')
    handle.close()

    filesystem = Hfilesystem(hostname, port)
    missing_path = path + 'doesnotexist'
    renamed_path = path + 'renamed'

    # Existence checks: the fresh file is there, an unrelated name is not.
    self.assertTrue(filesystem.exists(path))
    self.assertFalse(filesystem.exists(missing_path))

    # Move the file, then remove it under its new name.
    self.assertTrue(filesystem.rename(path, renamed_path))
    self.assertTrue(filesystem.delete(renamed_path))

    # The original name is gone after the rename, so deleting it must fail.
    self.assertFalse(filesystem.delete(path))
def test_mkdir(self):
    """Check that a directory can be created at ``path`` and then deleted."""
    filesystem = Hfilesystem(hostname, port)

    created = filesystem.mkdir(path)
    self.assertTrue(created)

    # Clean up the directory again; deletion is expected to succeed.
    removed = filesystem.delete(path)
    self.assertTrue(removed)
# NOTE(review): this excerpt starts in the middle of a copy step; the
# statement below writes the current `line` to the HDFS handle, and whatever
# loop produces `line` lies before the visible text.
hfile.write(line)

# And close them.
fh.close()
hfile.close()

# Let's read local_path into memory for comparison.
# A `with` block is used so the local file handle is closed deterministically
# instead of being left for the garbage collector.
with open(local_path) as local_file:
    motd = local_file.read()

# Now let's read the data back
hfile = Hfile(hostname, port, hdfs_path)

# With an iterator
# Joining the iterated lines avoids repeated string concatenation.
data_read_from_hdfs = ''.join(hfile)
print(motd == data_read_from_hdfs)

# All at once
# NOTE(review): the same handle has just been iterated above; confirm that
# read() still returns the whole file here rather than only what is left.
data_read_from_hdfs = hfile.read()
print(motd == data_read_from_hdfs)
hfile.close()

# Hopefully you have enough info to get started!
from hdfs.hfilesystem import Hfilesystem
hfs = Hfilesystem(hostname, port)
print(hfs.getHosts(hdfs_path, 0, 1))
def __init__(self, hostname='speedy', port=8020):
    """Connect to the filesystem and remember its ``hdfs://host:port`` prefix.

    Args:
        hostname: host passed to ``Hfilesystem`` and used in the URL prefix.
        port: port passed to ``Hfilesystem`` and used in the URL prefix.
    """
    # Build the URL prefix and cache its length alongside it.
    url_prefix = ''.join(['hdfs://', hostname, ':', str(port)])
    self.hfs = Hfilesystem(hostname, port)
    self.urlhead = url_prefix
    self.urlheadlen = len(url_prefix)
class DFS(object):
    """Directory-oriented helper around an ``Hfilesystem`` connection.

    Besides thin wrappers for mkdir/delete/rename/exists, the class lists
    sub-directories and files and filters them by marker entries whose names
    are built from the module-level ``DONE_TAG``, ``STARTED_TAG`` and
    ``FINISHED_TAG`` constants.

    NOTE(review): the methods disagree about what ``hfs.listdir`` returns.
    ``get_abs_subdirs`` strips an ``hdfs://host:port`` prefix from each entry,
    the size methods pass entries straight to ``isFile``/``stat``, while the
    listing filters join each entry onto ``dirname`` first. That behaviour is
    kept as found; confirm the entry format against the Hfilesystem
    implementation.
    """

    def __init__(self, hostname='speedy', port=8020):
        """Open the filesystem connection and cache the URL prefix.

        Args:
            hostname: host passed to ``Hfilesystem``.
            port: port passed to ``Hfilesystem``.
        """
        self.hfs = Hfilesystem(hostname, port)
        # Prefix (and its length) removed from entries in get_abs_subdirs.
        self.urlhead = 'hdfs://' + hostname + ':' + str(port)
        self.urlheadlen = len(self.urlhead)

    def mkdir(self, dirname):
        """Create ``dirname`` unless it already exists."""
        if not self.exists(dirname):
            self.hfs.mkdir(dirname)

    def rmdir(self, dirname):
        """Delete ``dirname`` if it exists."""
        if self.exists(dirname):
            # The filesystem object itself is not callable; removal goes
            # through its delete() method.
            self.hfs.delete(dirname)

    def rm_rf(self, d):
        """Alias for :meth:`rmdir`."""
        self.rmdir(d)

    def rename(self, srcpath, destpath):
        """Rename ``srcpath`` to ``destpath``, removing ``destpath`` first.

        The destination is looked up on the wrapped filesystem, not on the
        local disk.
        """
        if self.exists(destpath):
            self.rm_rf(destpath)
        self.hfs.rename(srcpath, destpath)

    def is_done(self, dirname):
        """Return whether ``dirname`` contains the ``DONE_TAG`` entry."""
        return self.exists(os.path.join(dirname, DONE_TAG))

    def _list_dir_entries(self, dirname, checkdone):
        """Return the raw listdir entries of ``dirname`` that are directories.

        With ``checkdone`` true, only entries for which :meth:`is_done`
        holds are kept. A missing ``dirname`` yields an empty list.
        """
        if not self.exists(dirname):
            return []
        kept = []
        for entry in self.hfs.listdir(dirname):
            candidate = os.path.join(dirname, entry)
            if not self.hfs.isDir(candidate):
                continue
            if checkdone and not self.is_done(candidate):
                continue
            kept.append(entry)
        return kept

    def _has_tag(self, dirname, sdir, tagname):
        """Return whether the marker ``tagname`` exists under ``dirname/sdir``."""
        return self.exists(os.path.join(dirname, sdir, tagname))

    def get_subdirs(self, dirname, checkdone = False):
        """Return the last path component of each sub-directory of ``dirname``.

        Args:
            dirname: directory to list.
            checkdone: when true, keep only sub-directories that are done.

        Returns:
            A list of names; empty when ``dirname`` does not exist.
        """
        # rsplit keeps an entry without any '/' unchanged instead of raising.
        return [entry.rsplit('/', 1)[-1]
                for entry in self._list_dir_entries(dirname, checkdone)]

    def get_abs_subdirs(self, dirname, checkdone = False):
        """Return sub-directory entries with the first ``urlheadlen`` chars cut.

        Args:
            dirname: directory to list.
            checkdone: when true, keep only sub-directories that are done.

        Returns:
            A list of paths; empty when ``dirname`` does not exist.
        """
        return [entry[self.urlheadlen:]
                for entry in self._list_dir_entries(dirname, checkdone)]

    def get_unfinished_subdirs(self, dirname, jobname = '', checkdone = False):
        """Return sub-directory names lacking the ``jobname`` finished marker."""
        return [sdir for sdir in self.get_subdirs(dirname, checkdone)
                if not self._has_tag(dirname, sdir, jobname + FINISHED_TAG)]

    def get_buffered_subdirs(self, dirname, jobname = '', checkdone = False):
        """Return sub-directory names with neither finished nor started marker.

        ``checkdone`` is forwarded to :meth:`get_subdirs`; the default of
        ``False`` matches the previous hard-coded value.
        """
        return [sdir for sdir in self.get_subdirs(dirname, checkdone)
                if not self._has_tag(dirname, sdir, jobname + FINISHED_TAG)
                and not self._has_tag(dirname, sdir, jobname + STARTED_TAG)]

    def get_unfinished_abs_subdirs(self, dirname, jobname = '', checkdone = False):
        """Like :meth:`get_unfinished_subdirs`, based on :meth:`get_abs_subdirs`."""
        return [sdir for sdir in self.get_abs_subdirs(dirname, checkdone)
                if not self._has_tag(dirname, sdir, jobname + FINISHED_TAG)]

    def get_buffered_abs_subdirs(self, dirname, jobname = '', checkdone = False):
        """Like :meth:`get_buffered_subdirs`, based on :meth:`get_abs_subdirs`.

        Each result is ``os.path.join(dirname, sdir)``; ``checkdone`` is
        forwarded instead of being forced to ``False``.
        """
        return [os.path.join(dirname, sdir)
                for sdir in self.get_abs_subdirs(dirname, checkdone)
                if not self._has_tag(dirname, sdir, jobname + FINISHED_TAG)
                and not self._has_tag(dirname, sdir, jobname + STARTED_TAG)]

    def get_subfiles(self, dirname):
        """Return the listdir entries of ``dirname`` that are files."""
        if not self.exists(dirname):
            return []
        return [sfile for sfile in self.hfs.listdir(dirname)
                if self.hfs.isFile(os.path.join(dirname, sfile))]

    def get_abs_subfiles(self, dirname):
        """Return each file entry of ``dirname`` joined onto ``dirname``."""
        return [os.path.join(dirname, sfile)
                for sfile in self.get_subfiles(dirname)]

    def get_subdir_num(self, dirname):
        """Return the number of sub-directories of ``dirname``."""
        return len(self.get_subdirs(dirname))

    def get_unfinished_subdir_num(self, dirname, jobname = '', checkdone = False):
        """Return the number of unfinished sub-directories of ``dirname``."""
        return len(self.get_unfinished_subdirs(dirname, jobname, checkdone))

    def get_buffered_subdir_num(self, dirname, jobname = '', checkdone = False):
        """Return the number of buffered sub-directories of ``dirname``."""
        return len(self.get_buffered_subdirs(dirname, jobname, checkdone))

    def get_subfile_num(self, dirname):
        """Return the number of file entries in ``dirname``."""
        return len(self.get_subfiles(dirname))

    def _sum_entry_sizes(self, dirname):
        """Add up file sizes below ``dirname``, descending into non-files.

        The caller is responsible for checking that ``dirname`` exists.
        """
        total = 0
        for node in self.hfs.listdir(dirname):
            if self.hfs.isFile(node):
                total += self.hfs.stat(node).mSize
            else:
                total += self.get_dir_size(node)
        return total

    # recursive
    def get_dir_size(self, dirname):
        """Return the summed ``mSize`` of all files under ``dirname``.

        Returns 0 when ``dirname`` does not exist.
        """
        if not self.exists(dirname):
            return 0
        return self._sum_entry_sizes(dirname)

    # recursive
    def get_unfinished_dir_size(self, dirname, jobname = ''):
        """Return the size of ``dirname`` unless it carries the finished marker.

        Returns 0 when ``dirname`` is missing or the ``jobname`` finished
        marker exists directly inside it.
        """
        if not self.exists(dirname):
            return 0
        if self.exists(os.path.join(dirname, jobname + FINISHED_TAG)):
            return 0
        return self._sum_entry_sizes(dirname)

    def get_buffered_dir_size(self, dirname, jobname = ''):
        """Return the same value as :meth:`get_unfinished_dir_size`.

        NOTE(review): unlike get_buffered_subdirs this never consulted
        STARTED_TAG; behaviour is kept as found - confirm whether a started
        directory should count as 0 here.
        """
        return self.get_unfinished_dir_size(dirname, jobname)

    def exists(self, pathname):
        """Return whether ``pathname`` exists on the wrapped filesystem."""
        return self.hfs.exists(pathname)