def cp(fromPath, toPath, force=False):
    """
    Copy a file within HDFS by running 'hdfs dfs -cp [-f] fromPath toPath'.

    Parameters:
        fromPath : Source path.
        toPath   : Destination path.
        force    : If truthy, '-f' is passed right after '-cp'.
    """
    overwrite_flag = ['-f'] if force else []
    sh(['hdfs', 'dfs', '-cp'] + overwrite_flag + [fromPath, toPath])
def test_sh(self):
    """
    sh accepts a command given either as an argument list or as a
    single string, and returns the command's output.
    """
    cases = (
        (['/bin/echo', 'test-list'], 'test-list'),
        ('/bin/echo test-string', 'test-string'),
    )
    for command, expected in cases:
        output = sh(command)
        self.assertEqual(output, expected)
def cat(path):
    """
    Runs 'hdfs dfs -cat path' and returns the file contents decoded
    as UTF-8.

    The whole file is held in memory as a single string, so mind the
    size of the file being read.
    """
    raw_contents = sh(['hdfs', 'dfs', '-cat', path])
    return raw_contents.decode('utf-8')
def get_modified_datetime(path):
    """
    Runs 'hdfs dfs -stat path' and returns the modification datetime
    of the given path.

    The command output is expected to be exactly two whitespace
    separated fields (date, then time); they are joined into an
    ISO-like '<date>T<time>Z' string and handed to parser.parse.
    """
    stat_output = sh(['hdfs', 'dfs', '-stat', path]).decode('utf-8')
    # Unpacking fails loudly if the output is not exactly "<date> <time>".
    date_part, time_part = stat_output.strip().split()
    return parser.parse(date_part + 'T' + time_part + 'Z')
def rmdir(paths):
    """
    Runs 'hdfs dfs -rmdir' on paths.

    Parameters:
        paths : List of paths, or a single string that is split on
                whitespace into several paths.
    """
    path_list = paths.split() if isinstance(paths, str) else paths
    command = ['hdfs', 'dfs', '-rmdir']
    return sh(command + path_list)
def touchz(paths):
    """
    Runs 'hdfs dfs -touchz' on paths.

    Parameters:
        paths : List of paths, or a single string that is split on
                whitespace into several paths.
    """
    path_list = paths.split() if isinstance(paths, str) else paths
    command = ['hdfs', 'dfs', '-touchz']
    return sh(command + path_list)
def mkdir(paths, create_parent=True):
    """
    Runs 'hdfs dfs -mkdir' on paths.

    Parameters:
        paths         : List of paths, or a single string that is
                        split on whitespace into several paths.
        create_parent : If truthy (default), '-p' is passed to
                        'hdfs dfs -mkdir'.
    """
    flags = []
    if create_parent:
        flags.append('-p')

    path_list = paths.split() if isinstance(paths, str) else paths
    return sh(['hdfs', 'dfs', '-mkdir'] + flags + path_list)
def rm(paths, recurse=True, skip_trash=True):
    """
    Runs 'hdfs dfs -rm' on paths, optionally recursing and optionally
    skipping trash.

    Parameters:
        paths      : List of paths, or a single string that is split
                     on whitespace into several paths.
        recurse    : If truthy (default), '-R' is passed.
        skip_trash : If truthy (default), '-skipTrash' is passed.
    """
    path_list = paths.split() if isinstance(paths, str) else paths

    flags = []
    if recurse:
        flags.append('-R')
    if skip_trash:
        flags.append('-skipTrash')

    return sh(['hdfs', 'dfs', '-rm'] + flags + path_list)
def mv(from_paths, to_paths, inParent=True):
    """
    Runs 'hdfs dfs -mv' once for each (from_path, to_path) pair.

    For every pair, the parent folder of the destination is created
    (through Hdfs.mkdir) when Hdfs.ls reports nothing for it.

    Parameters:
        from_paths : List of source paths, or a single string that is
                     split on whitespace into several paths.
        to_paths   : List of destination paths (same length as
                     from_paths), or a whitespace separated string.
        inParent   : If True (default), the source is moved into the
                     parent folder of the matching destination path.
                     Set to False if the file/folder moved is also
                     renamed, in which case the destination path is
                     used as given.

    Raises:
        ValueError : If from_paths and to_paths differ in length.
    """
    if isinstance(from_paths, str):
        from_paths = from_paths.split()
    if isinstance(to_paths, str):
        to_paths = to_paths.split()

    if len(from_paths) != len(to_paths):
        raise ValueError("from_paths and to_paths size don't match in hdfs mv function")

    for from_path, to_path in zip(from_paths, to_paths):
        to_parent = '/'.join(to_path.split('/')[:-1])
        # A destination directly under the root (e.g. '/foo') leaves an
        # empty parent after dropping the last component; the parent is
        # the root itself, and an empty path is not usable by hdfs dfs.
        if not to_parent and to_path.startswith('/'):
            to_parent = '/'

        if not Hdfs.ls(to_parent, include_children=False):
            Hdfs.mkdir(to_parent)

        destination = to_parent if inParent else to_path
        sh(['hdfs', 'dfs', '-mv', from_path, destination])
def ls(paths, include_children=True, with_details=False):
    """
    Runs 'hdfs dfs -ls' on paths.

    Parameters:
        paths            : List or string paths to files to ls. Can
                           include shell globs. A string is split on
                           whitespace into several paths.
        include_children : If False, the '-d' flag is given to
                           'hdfs dfs -ls'.
        with_details     : If True, return one dict per listed entry
                           instead of only its path.

    Returns:
        Without details, a list holding the last whitespace separated
        field of every output line (the path). With details, a list of
        dicts with the keys file_type ('f' or 'd'), permission,
        replication, owner, group, file_size, modification_date,
        modification_time and path. Except for file_type, values are
        the raw fields of the command output, left undecoded.
    """
    if isinstance(paths, str):
        paths = paths.split()

    flags = [] if include_children else ['-d']

    # The return code is deliberately not checked so that listing
    # paths that do not exist does not fail.
    listing = sh(['hdfs', 'dfs', '-ls'] + flags + paths, check_return_code=False)

    # Drop the 'Found N items' summary lines and tokenize the rest.
    entries = []
    for line in listing.splitlines():
        if line.startswith(b'Found '):
            continue
        entries.append(line.split())

    if not with_details:
        return [fields[-1] for fields in entries]

    detailed = []
    for fields in entries:
        mode = fields[0]
        # The first character of the mode field is '-' for plain files.
        is_file = mode.decode('utf-8')[0] == '-'
        detailed.append({
            'file_type': 'f' if is_file else 'd',
            'permission': mode[1:],
            'replication': fields[1],
            'owner': fields[2],
            'group': fields[3],
            'file_size': fields[4],
            'modification_date': fields[5],
            'modification_time': fields[6],
            'path': fields[7]
        })
    return detailed
def dir_bytes_size(path):
    """
    Returns the size in bytes of a hdfs path, taken from the first
    field of the 'hdfs dfs -du -s path' output.
    """
    du_output = sh(['hdfs', 'dfs', '-du', '-s', path])
    size_field = du_output.split()[0]
    return int(size_field)
def test_sh_pipe(self):
    """
    sh runs a string command containing a shell pipe and returns the
    output of the whole pipeline.
    """
    piped_command = "/bin/echo hi_there | /usr/bin/env sed -e 's@_there@_you@'"
    self.assertEqual(sh(piped_command), 'hi_you')
def _command(self, args, check_return_code=True):
    """
    Runs the hive command (self.hivecmd) with the given args appended
    and returns whatever sh returns for it.

    check_return_code is forwarded to sh as its second positional
    argument.
    """
    full_command = self.hivecmd + args
    result = sh(full_command, check_return_code)
    return result
def put(local_path, hdfs_path, force=False):
    """
    Copy a local file over to hdfs by running
    'hdfs dfs -put [-f] local_path hdfs_path'.

    Parameters:
        local_path : Source path on the local filesystem.
        hdfs_path  : Destination path in hdfs.
        force      : If truthy, '-f' is passed to 'hdfs dfs -put'.
    """
    command = ['hdfs', 'dfs', '-put']
    if force:
        command.append('-f')
    command.append(local_path)
    command.append(hdfs_path)
    sh(command)
def get(hdfs_path, local_path, force=False):
    """
    Copy a file from hdfs to the local filesystem by running
    'hdfs dfs -get [-f] hdfs_path local_path'.

    Parameters:
        hdfs_path  : Source path in hdfs.
        local_path : Destination path on the local filesystem.
        force      : If truthy, '-f' is passed to 'hdfs dfs -get'.
    """
    command = ['hdfs', 'dfs', '-get']
    if force:
        command.append('-f')
    command.append(hdfs_path)
    command.append(local_path)
    sh(command)