def build_hdfs_client():
    # pylint: disable=global-statement
    global _hdfs_client
    _hdfs_client = AutoConfigClient()
    # Warm up the connection to the namenode; otherwise the
    # first call can take 3+ minutes.
    _hdfs_client.df()
class HdfsFileManager(FileManagerBase):
    """A wrapper around the snakebite client."""

    def can_handle(self, path):
        return path.startswith('hdfs://')

    def __init__(self):
        self._client = AutoConfigClient()

    def ls(self, path: str, recursive=False) -> List[File]:
        files = []
        for file in self._client.ls([path], recurse=recursive):
            if file['file_type'] == 'f':
                files.append(File(path=file['path'],
                                  size=file['length']))
        return files

    def move(self, source: str, destination: str) -> bool:
        return len(list(self._client.rename([source], destination))) > 0

    def remove(self, path: str) -> bool:
        return len(list(self._client.delete([path]))) > 0

    def copy(self, source: str, destination: str) -> bool:
        # TODO
        raise NotImplementedError()

    def mkdir(self, path: str) -> bool:
        return next(self._client.mkdir([path], create_parent=True)) \
            .get('result')
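A minimal usage sketch for the manager above, assuming `FileManagerBase` and `File` come from the surrounding project and that `File` exposes `path` and `size` attributes; the HDFS path is purely illustrative:

# Hypothetical usage; the namenode and path are placeholders.
manager = HdfsFileManager()
if manager.can_handle('hdfs://namenode/data'):
    for f in manager.ls('hdfs://namenode/data', recursive=True):
        print(f.path, f.size)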
def run(self):
    log.info('initiating snakebite hdfs client')
    try:
        client = AutoConfigClient()
    except krbV.Krb5Error as _:  # pylint: disable=no-member
        if self.verbose:
            print('', file=sys.stderr)
        print(_, file=sys.stderr)
    start_time = time.time()
    dir_count = 0
    file_count = 0
    repl1_count = 0
    for path in self.path_list:
        try:
            result_list = client.ls([path],
                                    recurse=True,
                                    include_toplevel=True,
                                    include_children=True)
            for result in result_list:
                if self.verbose and (dir_count + file_count) % 100 == 0:
                    print('.', file=sys.stderr, end='')
                if result['block_replication'] == 0:
                    dir_count += 1
                    continue
                file_count += 1
                if result['block_replication'] == 1:
                    file_path = result['path']
                    repl1_count += 1
                    if self.verbose:
                        print('', file=sys.stderr)
                    print(file_path)
                    if self.replication_factor:
                        log.info('setting replication factor to %s on %s',
                                 self.replication_factor, file_path)
                        # setrep() returns a generator, so it must be evaluated
                        # in order to actually execute; otherwise the replication
                        # factor is never changed.
                        for _ in client.setrep([file_path],
                                               self.replication_factor,
                                               recurse=False):
                            if 'result' not in _:
                                print('WARNING: result field not found in '
                                      'setrep result: {}'.format(_),
                                      file=sys.stderr)
                                continue
                            if not _['result']:
                                print('WARNING: failed to setrep: {}'.format(_))
        except (snakebite.errors.FileNotFoundException,
                snakebite.errors.RequestError) as _:
            if self.verbose:
                print('', file=sys.stderr)
            print(_, file=sys.stderr)
    if self.verbose:
        print('', file=sys.stderr)
    secs = int(time.time() - start_time)
    print('\nCompleted in {} secs\n'.format(secs), file=sys.stderr)
    print('{} files with replication factor 1 out of {} files in {} dirs'
          .format(repl1_count, file_count, dir_count), file=sys.stderr)
def mv(src, dest, overwrite=False):
    """
    src (str) : Source path on HDFS
    dest (str) : Destination path on HDFS
    overwrite (boolean) : Overwrite dest if it exists
    """
    client = AutoConfigClient()
    list(client.rename2(src, dest, overwrite))
def mkdir(hdfs_path, create_parent=False, mode=0o755):
    """
    hdfs_path (str) : Path to create
    create_parent (boolean) : Also create the parent directories
    mode (int) : Mode the directory should be created with

    Returns: list of mkdir result dicts
    """
    client = AutoConfigClient()
    return list(client.mkdir([hdfs_path], create_parent, mode))
def rm(hdfs_path, recurse=False, force=False):
    """
    hdfs_path (str or list of strings) : HDFS files to delete
    recurse (boolean) : Recursively delete the folder
    force (boolean) : Force deletion (non-interactive)

    Returns: list of delete result dicts
    """
    client = AutoConfigClient()
    return list(client.delete([hdfs_path], recurse))
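A brief usage sketch of the three wrappers above; the paths are placeholders and assume HADOOP_CONF_DIR points at a valid client configuration:

# Illustrative paths only.
mkdir('/tmp/example_dir', create_parent=True)
mv('/tmp/example_dir', '/tmp/example_dir_renamed', overwrite=True)
rm('/tmp/example_dir_renamed', recurse=True)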
def main():
    hadoop_conf_dir = "/media/d2/code-sky/dockers/hadoop/etc/hadoop"
    os.environ['HADOOP_CONF_DIR'] = hadoop_conf_dir

    file_dict = {}
    cli = AutoConfigClient()
    target_hdfs_path = "/"
    for element in cli.ls([target_hdfs_path]):
        print("Result: " + str(element))
def ls(hdfs_path, recurse=False, include_toplevel=True, include_children=False):
    """
    Parameters:
        hdfs_path (str) : Path to list
        recurse (boolean) : Recursive listing
        include_toplevel (boolean) : Include the given path in the listing.
            If the path is a file, include_toplevel is always True.
        include_children (boolean) : Include child nodes in the listing.

    Returns: (list) path listings with attributes
    """
    client = AutoConfigClient()
    path_info = list(client.ls([hdfs_path], recurse, include_toplevel,
                               include_children))
    return LsObject(path_info)
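For reference, a minimal sketch of what the underlying snakebite listing yields: each entry is a plain dict with keys such as 'path', 'length', and 'file_type'. The path below is illustrative:

client = AutoConfigClient()
for entry in client.ls(['/user'], recurse=False):
    # One dict per file or directory.
    print(entry['file_type'], entry['path'], entry['length'])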
def get_bite(self):
    """
    If Luigi has forked, we have a different PID, and need to reconnect.
    """
    config = hdfs_config.hdfs()
    if self.pid != os.getpid() or not self._bite:
        client_kwargs = dict(filter(
            lambda k_v: k_v[1] is not None and k_v[1] != '',
            six.iteritems({
                'hadoop_version': config.client_version,
                'effective_user': config.effective_user,
            })
        ))
        if config.snakebite_autoconfig:
            """
            This is fully backwards compatible with the vanilla Client and can be
            used for a non HA cluster as well. This client tries to read
            ``${HADOOP_PATH}/conf/hdfs-site.xml`` to get the address of the
            namenode. The behaviour is the same as Client.
            """
            from snakebite.client import AutoConfigClient
            self._bite = AutoConfigClient(**client_kwargs)
        else:
            from snakebite.client import Client
            self._bite = Client(config.namenode_host, config.namenode_port,
                                **client_kwargs)
    return self._bite
def get_bite(self):
    """
    If Luigi has forked, we have a different PID, and need to reconnect.
    """
    if self.pid != os.getpid() or not self._bite:
        client_kwargs = dict(filter(
            lambda (k, v): v is not None and v != '',
            {
                'hadoop_version': self.config.getint("hdfs", "client_version", None),
                'effective_user': self.config.get("hdfs", "effective_user", None)
            }.iteritems()
        ))
        if self.config.getboolean("hdfs", "snakebite_autoconfig", False):
            """
            This is fully backwards compatible with the vanilla Client and can be
            used for a non HA cluster as well. This client tries to read
            ``${HADOOP_PATH}/conf/hdfs-site.xml`` to get the address of the
            namenode. The behaviour is the same as Client.
            """
            from snakebite.client import AutoConfigClient
            self._bite = AutoConfigClient(**client_kwargs)
        else:
            from snakebite.client import Client
            self._bite = Client(
                self.config.get("hdfs", "namenode_host"),
                self.config.getint("hdfs", "namenode_port"),
                **client_kwargs)
    return self._bite
def get_bite(self):
    """
    If Luigi has forked, we have a different PID, and need to reconnect.
    """
    if self.pid != os.getpid() or not self._bite:
        autoconfig_enabled = self.config.getboolean(
            "hdfs", "snakebite_autoconfig", False)
        if autoconfig_enabled is True:
            """
            This is fully backwards compatible with the vanilla Client and can be
            used for a non HA cluster as well. This client tries to read
            ``${HADOOP_PATH}/conf/hdfs-site.xml`` to get the address of the
            namenode. The behaviour is the same as Client.
            """
            from snakebite.client import AutoConfigClient
            self._bite = AutoConfigClient()
        else:
            from snakebite.client import Client
            try:
                ver = self.config.getint("hdfs", "client_version")
                if ver is None:
                    raise RuntimeError()
                self._bite = Client(
                    self.config.get("hdfs", "namenode_host"),
                    self.config.getint("hdfs", "namenode_port"),
                    hadoop_version=ver)
            except:
                self._bite = Client(
                    self.config.get("hdfs", "namenode_host"),
                    self.config.getint("hdfs", "namenode_port"))
    return self._bite
def get_conn(self):
    '''
    Returns a snakebite HDFSClient object.
    '''
    connections = self.get_connections(self.hdfs_conn_id)

    use_sasl = False
    if configuration.get('core', 'security') == 'kerberos':
        use_sasl = True

    client = None

    # When using HAClient, proxy_user must be the same, so it is OK to
    # always take the first connection's login.
    effective_user = self.proxy_user or connections[0].login
    if len(connections) == 1:
        autoconfig = connections[0].extra_dejson.get('autoconfig', False)
        if autoconfig:
            client = AutoConfigClient(effective_user=effective_user,
                                      use_sasl=use_sasl)
        else:
            client = Client(connections[0].host, connections[0].port,
                            effective_user=effective_user, use_sasl=use_sasl)
    elif len(connections) > 1:
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn, effective_user=effective_user, use_sasl=use_sasl)
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository")
    return client
def test_autoconfig_client_trash_false(self, environ_get):
    environ_get.return_value = False
    HDFSConfig.core_try_paths = (self.get_config_path('ha-core-site.xml'),)
    HDFSConfig.hdfs_try_paths = (self.get_config_path('ha-noport-hdfs-site.xml'),)
    client = AutoConfigClient()
    self.assertFalse(client.use_trash)
class HdfsFileManager(FileManagerBase):
    """A wrapper around the snakebite client."""

    def can_handle(self, path):
        return path.startswith('hdfs://')

    def __init__(self):
        self._client = AutoConfigClient()

    def ls(self, path: str, recursive=False) -> List[str]:
        files = []
        for file in self._client.ls([path], recurse=recursive):
            if file['file_type'] == 'f':
                files.append(file['path'])
        return files

    def move(self, source: str, destination: str) -> bool:
        return len(list(self._client.rename([source], destination))) > 0

    def remove(self, path: str) -> bool:
        return len(list(self._client.delete([path]))) > 0
class HDFSClient(object):

    def __init__(self):
        check_output("hadoop")
        self.fs = AutoConfigClient()

    def homedir(self):
        return "/user/%s/" % getuser()

    def exists(self, path):
        try:
            return self.fs.test(path)
        except Exception:
            return False
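A hedged usage sketch of the class above; it assumes the `hadoop` binary is on PATH and that a cluster is reachable through the auto-discovered configuration:

# Hypothetical usage; the path is a placeholder.
hdfs = HDFSClient()
print(hdfs.homedir())
print(hdfs.exists('/tmp'))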
def get_conn(self) -> Any:
    """
    Returns a snakebite HDFSClient object.
    """
    # When using HAClient, proxy_user must be the same, so it is OK to
    # always take the first.
    effective_user = self.proxy_user
    autoconfig = self.autoconfig
    use_sasl = conf.get('core', 'security') == 'kerberos'

    try:
        connections = self.get_connections(self.hdfs_conn_id)

        if not effective_user:
            effective_user = connections[0].login
        if not autoconfig:
            autoconfig = connections[0].extra_dejson.get('autoconfig', False)
        hdfs_namenode_principal = connections[0].extra_dejson.get(
            'hdfs_namenode_principal')
    except AirflowException:
        if not autoconfig:
            raise

    if autoconfig:
        # will read config info from $HADOOP_HOME conf files
        client = AutoConfigClient(effective_user=effective_user,
                                  use_sasl=use_sasl)
    elif len(connections) == 1:
        client = Client(
            connections[0].host,
            connections[0].port,
            effective_user=effective_user,
            use_sasl=use_sasl,
            hdfs_namenode_principal=hdfs_namenode_principal,
        )
    elif len(connections) > 1:
        name_node = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(
            name_node,
            effective_user=effective_user,
            use_sasl=use_sasl,
            hdfs_namenode_principal=hdfs_namenode_principal,
        )
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository "
                                "and autoconfig is not specified")
    return client
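A minimal usage sketch for the hook above, assuming an Airflow connection id of 'hdfs_default' (illustrative) with autoconfig enabled in its extras:

# Hypothetical: obtain the snakebite client through the hook and list a path.
hook = HDFSHook(hdfs_conn_id='hdfs_default')
client = hook.get_conn()
for entry in client.ls(['/tmp']):
    print(entry['path'])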
import urllib2, os, json, pytz, sys
import datetime, calendar, pprint
import argparse
import snakebite
import subprocess
from snakebite.client import AutoConfigClient

client = AutoConfigClient()

# download.py
# modified version of /home/zsb739/code/libs/ripe-measurement-downloader/experiment_launcher/download.py
# This script downloads data from RIPE Atlas and stores it in HDFS


def parse_args():
    parser = argparse.ArgumentParser(
        description='Download daily RIPE data for the provided '
                    'measurement ID number')
    parser.add_argument(
        'measurement', type=int, nargs="+",
        help="The integer identification number for the desired "
             "measurement")
    return parser.parse_args()


def days(start, stop=None):
    if stop is None:
        curr_time = datetime.datetime.utcnow()
        stop_time = datetime.datetime(curr_time.year, curr_time.month,
def __init__(self):
    self._client = AutoConfigClient()
def __init__(self):
    check_output("hadoop")
    self.fs = AutoConfigClient()
        active_namenode = which_active_namenode.communicate()[0].rstrip('\n').rsplit(':')[0]
        break
    else:
        logger.info(node + " is the standby node")
        continue

# bail out if the current node we're running on is not the active namenode.
if active_namenode != socket.getfqdn():
    logger.info("active node " + active_namenode + " is not the current host, so bailing out.")
    exit(-1)
else:
    logger.info("active node is " + active_namenode)

client = AutoConfigClient()
client.use_trash = False

donotdelete_whitelist = [
    # don't remove hadoop-mapred, this kills running jobs
    re.compile("hadoop-mapred"),
    # let's explicitly match hbase.
    re.compile("^/hbase/?"),
    # don't nuke this; hbase uses it for bulk loading.
    re.compile("^/tmp/hbase-staging/?"),
    # let's try to make sure we're not matching against a top-level path
    re.compile("^/[-_.a-zA-Z0-9]+/?$"),
parser.add_argument('-t', '--title', default='Cutflow Efficiency')
parser.add_argument(
    '-x', '--NoX', action='store_true',
    help='This argument suppresses showing plots via X-forwarding')
parser.add_argument(
    '-o', '--NoOutput', action='store_true',
    help='This argument suppresses the output of PDF plots')
args = parser.parse_args()

df_list = []
file_list = []

fs = AutoConfigClient()

HT_eff_tot = []
MHT_eff_tot = []
BDP_eff_tot = []
NBJet_eff_tot = []
NJet_eff_tot = []
NVeto_eff_tot = []
M_sq = []
M_lsp = []

for f in fs.text([args.file + '/*/ROOTCuts_output/ROOTCuts.txt']):
    df = pd.read_csv(StringIO(f), delimiter=r'\s+')
    df_HT = df.loc[(df['HT'] > 1200.)]
    df_MHT = df.loc[(df['MHT'] > 200.)]
    df_NBJet = df.loc[(df['NBJet'] > 1)]
logger.info(node + " is the active node") which_active_namenode = subprocess.Popen(['hdfs','getconf','-confKey','dfs.namenode.rpc-address.'+hdfs_cluster+'.'+node], stdout=subprocess.PIPE) active_namenode = which_active_namenode.communicate()[0].rstrip('\n').rsplit(':')[0] break else: logger.info(node + " is the standby node") continue if active_namenode != socket.getfqdn(): logger.info("active node " + active_namenode + " is not the current host, so bailing out.") exit(-1) else: logger.info("active node is " + active_namenode) client = AutoConfigClient() logger.info("Getting user list from /etc/passwd and ldap") # get a sorted user list from the passwd directory (file+ldap) user_list = sorted(pwd.getpwall(), key=lambda tup: tup[0]) for user in user_list: username = user.pw_name userdir = "/user/" + username if user.pw_uid <= 500: continue if user.pw_uid >= 65534: continue if client.test(userdir, exists=True):
from snakebite.client import AutoConfigClient
import pandas as pd

fs = AutoConfigClient()


def ls(paths=['/'], recursive=False):
    if not isinstance(paths, list) and not isinstance(paths, tuple):
        paths = [paths]
    data = [
        dict(size=p['length'], path=p['path'], ftype=p['file_type'],
             date=p['modification_time'])
        for p in fs.ls(paths, recursive)
    ]
    df = pd.DataFrame(data)
    df = df.reindex(['size', 'path', 'ftype', 'date'], axis=1)
    df['size'] = df['size'].astype(int)
    df['date'] = pd.to_datetime(df['date'], unit='ms')
    # ignore current uploads
    df = df[~df.path.str.endswith('.upload')]
    return df
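A short usage sketch of the helper above; the path is illustrative and assumes a reachable cluster:

# List a hypothetical /data prefix and show the ten largest files.
df = ls('/data', recursive=True)
print(df.sort_values('size', ascending=False).head(10))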