def run(self):
    """Walk each path in self.path_list on HDFS and report files whose
    replication factor is 1, optionally re-setting their replication
    factor to self.replication_factor.

    Prints matching file paths to stdout; progress dots, errors and the
    final summary go to stderr. Counts directories (block_replication ==
    0) and files separately.
    """
    log.info('initiating snakebite hdfs client')
    try:
        client = AutoConfigClient()
    except krbV.Krb5Error as _:  # pylint: disable=no-member
        if self.verbose:
            print('', file=sys.stderr)
        print(_, file=sys.stderr)
        # BUG FIX: the original swallowed the Kerberos error and fell
        # through with 'client' unbound, crashing below with a NameError.
        # Re-raise so the failure is explicit at the point of cause.
        raise
    start_time = time.time()
    dir_count = 0
    file_count = 0
    repl1_count = 0
    for path in self.path_list:
        try:
            result_list = client.ls([path], recurse=True,
                                    include_toplevel=True,
                                    include_children=True)
            for result in result_list:
                # progress indicator every 100 entries when verbose
                if self.verbose and (dir_count + file_count) % 100 == 0:
                    print('.', file=sys.stderr, end='')
                # block_replication == 0 marks a directory entry
                if result['block_replication'] == 0:
                    dir_count += 1
                    continue
                file_count += 1
                if result['block_replication'] == 1:
                    file_path = result['path']
                    repl1_count += 1
                    if self.verbose:
                        print('', file=sys.stderr)
                    print(file_path)
                    if self.replication_factor:
                        log.info('setting replication factor to %s on %s',
                                 self.replication_factor, file_path)
                        # setrep returns a generator, so it must be
                        # evaluated in order to actually execute —
                        # otherwise there is no effect on the
                        # replication factor
                        for _ in client.setrep([file_path],
                                               self.replication_factor,
                                               recurse=False):
                            if 'result' not in _:
                                print('WARNING: result field not found in '
                                      'setrep result: {}'.format(_),
                                      file=sys.stderr)
                                continue
                            if not _['result']:
                                # CONSISTENCY FIX: route this warning to
                                # stderr like every other warning here
                                # (it previously went to stdout, polluting
                                # the file-path output stream)
                                print('WARNING: failed to setrep: {}'
                                      .format(_), file=sys.stderr)
        except (snakebite.errors.FileNotFoundException,
                snakebite.errors.RequestError) as _:
            # best-effort per-path handling: report and move on to the
            # next path rather than aborting the whole scan
            if self.verbose:
                print('', file=sys.stderr)
            print(_, file=sys.stderr)
    if self.verbose:
        print('', file=sys.stderr)
    secs = int(time.time() - start_time)
    print('\nCompleted in {} secs\n'.format(secs), file=sys.stderr)
    print('{} files with replication factor 1 out of {} files in {} dirs'
          .format(repl1_count, file_count, dir_count), file=sys.stderr)
class HdfsFileManager(FileManagerBase):
    """A wrapper of snakebite client."""

    def can_handle(self, path):
        """This manager handles only hdfs:// URIs."""
        return path.startswith('hdfs://')

    def __init__(self):
        # snakebite client configured from the environment's Hadoop config
        self._client = AutoConfigClient()

    def ls(self, path: str, recursive=False) -> List[File]:
        """List regular files (entries of type 'f') under *path* as File
        objects carrying each entry's path and size."""
        entries = self._client.ls([path], recurse=recursive)
        return [
            File(path=entry['path'], size=entry['length'])
            for entry in entries
            if entry['file_type'] == 'f'
        ]

    def move(self, source: str, destination: str) -> bool:
        """Rename *source* to *destination*; True when the client
        reported at least one renamed entry."""
        outcome = list(self._client.rename([source], destination))
        return len(outcome) > 0

    def remove(self, path: str) -> bool:
        """Delete *path*; True when the client reported at least one
        deleted entry."""
        outcome = list(self._client.delete([path]))
        return len(outcome) > 0

    def copy(self, source: str, destination: str) -> bool:
        # TODO
        raise NotImplementedError()

    def mkdir(self, path: str) -> bool:
        """Create *path*, including missing parents; return the client's
        'result' flag for the operation."""
        response = next(self._client.mkdir([path], create_parent=True))
        return response.get('result')
def main():
    """Point the Hadoop client at a local config directory and print
    every entry found at the HDFS root.

    Side effects: mutates os.environ['HADOOP_CONF_DIR'] and writes the
    listing to stdout.
    """
    # AutoConfigClient reads HADOOP_CONF_DIR to locate the cluster config
    hadoop_conf_dir = "/media/d2/code-sky/dockers/hadoop/etc/hadoop"
    os.environ['HADOOP_CONF_DIR'] = hadoop_conf_dir
    # FIX: dropped the unused 'file_dict = {}' local from the original
    cli = AutoConfigClient()
    target_hdfs_path = "/"
    for element in cli.ls([target_hdfs_path]):
        print("Result: " + str(element))
def ls(hdfs_path, recurse=False, include_toplevel=True, include_children=False):
    """List *hdfs_path* on HDFS using a snakebite AutoConfigClient.

    Parameters:
    hdfs_path (string)         : Path to list (the original docstring
                                 wrongly described a 'paths (list)'
                                 parameter; a single path is accepted
                                 and wrapped in a list internally)
    recurse (boolean)          : Recursive listing
    include_toplevel (boolean) : Include the given path in the listing.
                                 If the path is a file, include_toplevel
                                 is always True.
    include_children (boolean) : Include child nodes in the listing.

    Returns:
    LsObject wrapping the list of path listings with attributes
    """
    client = AutoConfigClient()
    path_info = list(client.ls([hdfs_path], recurse, include_toplevel, include_children))
    return LsObject(path_info)
class HdfsFileManager(FileManagerBase):
    """A wrapper of snakebite client."""

    def can_handle(self, path):
        """Only hdfs:// URIs are handled by this manager."""
        return path.startswith('hdfs://')

    def __init__(self):
        # snakebite client auto-configured from the Hadoop environment
        self._client = AutoConfigClient()

    def ls(self, path: str, recursive=False) -> List[str]:
        """Return the paths of regular files (type 'f') under *path*."""
        listing = self._client.ls([path], recurse=recursive)
        return [item['path'] for item in listing if item['file_type'] == 'f']

    def move(self, source: str, destination: str) -> bool:
        """Rename *source* to *destination*; True when the client
        reported at least one renamed entry."""
        moved = list(self._client.rename([source], destination))
        return len(moved) > 0

    def remove(self, path: str) -> bool:
        """Delete *path*; True when the client reported at least one
        deleted entry."""
        deleted = list(self._client.delete([path]))
        return len(deleted) > 0
# don't nuke this; hbase uses it for bulk loading. re.compile("^/tmp/hbase-staging/?"), # let's try to make sure we're not matching against a top-level path re.compile("^/[-_.a-zA-Z0-9]+/?$"), re.compile("cloudera_health_monitoring_canary_files"), # let's bail out explicitly on anything in our data path re.compile("^/data/production/?"), ] if client.test(args.path, exists=True): for x in client.ls([args.path], recurse=args.recurse_filesystem): if any(regex.search(x['path']) for regex in donotdelete_whitelist): logger.info("Matched banned thing, not attempting to delete it: %s", x['path']) else: f_timestamp = datetime.datetime.fromtimestamp(x['modification_time']/1000) if f_timestamp < older_than: logger.info("I might delete this: %s %s", x['path'], f_timestamp) if args.actually_delete: logger.info("Issuing delete of %s", list(client.delete([x['path']], recurse=True))) if client.test(x['path'], exists=True): logger.info("Removed %s", x['path']) else: logger.info( "I would have deleted this: %s ", x['path']) else: logger.warn("%s is not found on hdfs", args.path)