def get_bite(self):
    """
    Return the snakebite client, reconnecting first when necessary.

    If Luigi has forked, we have a different PID, and need to reconnect.
    """
    config = hdfs_config.hdfs()
    if self.pid != os.getpid() or not self._bite:
        # Drop unset options so snakebite falls back to its own defaults.
        candidates = {
            'hadoop_version': config.client_version,
            'effective_user': config.effective_user,
        }
        client_kwargs = {}
        for key, value in six.iteritems(candidates):
            if value is not None and value != '':
                client_kwargs[key] = value
        if config.snakebite_autoconfig:
            # AutoConfigClient reads ``${HADOOP_PATH}/conf/hdfs-site.xml``
            # to locate the namenode. It is fully backwards compatible
            # with the vanilla Client and can be used for a non-HA
            # cluster as well; the behaviour is the same as Client.
            from snakebite.client import AutoConfigClient
            self._bite = AutoConfigClient(**client_kwargs)
        else:
            from snakebite.client import Client
            self._bite = Client(config.namenode_host,
                                config.namenode_port,
                                **client_kwargs)
    return self._bite
def run(self):
    """
    Walk every path in ``self.path_list`` on HDFS and print (to stdout)
    each file whose replication factor is 1; when
    ``self.replication_factor`` is set, raise those files to it.

    Progress dots and diagnostics go to stderr so stdout stays a clean
    list of affected file paths.
    """
    log.info('initiating snakebite hdfs client')
    try:
        client = AutoConfigClient()
    except krbV.Krb5Error as _:  # pylint: disable=no-member
        if self.verbose:
            print('', file=sys.stderr)
            print(_, file=sys.stderr)
        # BUG FIX: the original swallowed this error and fell through to
        # a NameError on the unbound 'client'; re-raise so the real
        # Kerberos failure is reported instead.
        raise
    start_time = time.time()
    dir_count = 0
    file_count = 0
    repl1_count = 0
    for path in self.path_list:
        try:
            result_list = client.ls([path], recurse=True,
                                    include_toplevel=True,
                                    include_children=True)
            for result in result_list:
                if self.verbose and (dir_count + file_count) % 100 == 0:
                    print('.', file=sys.stderr, end='')
                # Directories report a block replication of 0.
                if result['block_replication'] == 0:
                    dir_count += 1
                    continue
                file_count += 1
                if result['block_replication'] == 1:
                    file_path = result['path']
                    repl1_count += 1
                    if self.verbose:
                        print('', file=sys.stderr)
                    print(file_path)
                    if self.replication_factor:
                        log.info('setting replication factor to %s on %s',
                                 self.replication_factor, file_path)
                        # setrep() returns a generator, so it must be
                        # evaluated in order to actually execute —
                        # otherwise there is no effect on the
                        # replication factor.
                        for _ in client.setrep([file_path],
                                               self.replication_factor,
                                               recurse=False):
                            if 'result' not in _:
                                print(
                                    'WARNING: result field not found in setrep result: {}'
                                    .format(_), file=sys.stderr)
                                continue
                            if not _['result']:
                                # BUG FIX: route this warning to stderr
                                # like every other diagnostic (it
                                # previously polluted the stdout file
                                # list).
                                print(
                                    'WARNING: failed to setrep: {}'.format(_),
                                    file=sys.stderr)
        except (snakebite.errors.FileNotFoundException,
                snakebite.errors.RequestError) as _:
            if self.verbose:
                print('', file=sys.stderr)
                print(_, file=sys.stderr)
    if self.verbose:
        print('', file=sys.stderr)
    secs = int(time.time() - start_time)
    print('\nCompleted in {} secs\n'.format(secs), file=sys.stderr)
    print('{} files with replication factor 1 out of {} files in {} dirs'
          .format(repl1_count, file_count, dir_count), file=sys.stderr)
def get_conn(self):
    '''
    Returns a snakebite HDFSClient object.

    One registered connection yields a plain ``Client`` (or an
    ``AutoConfigClient`` when its extra carries ``autoconfig``); several
    connections yield an ``HAClient``.

    :raises HDFSHookException: when no connection is configured for
        ``self.hdfs_conn_id``.
    '''
    connections = self.get_connections(self.hdfs_conn_id)
    # BUG FIX: check for an empty connection list *before* indexing
    # connections[0]; the original raised IndexError on the
    # effective_user line instead of the documented exception.
    if not connections:
        raise HDFSHookException("conn_id doesn't exist in the repository")
    use_sasl = configuration.get('core', 'security') == 'kerberos'
    # When using HAClient, proxy_user must be the same, so it is ok to
    # always take the first connection's login.
    effective_user = self.proxy_user or connections[0].login
    if len(connections) == 1:
        autoconfig = connections[0].extra_dejson.get('autoconfig', False)
        if autoconfig:
            client = AutoConfigClient(effective_user=effective_user,
                                      use_sasl=use_sasl)
        else:
            client = Client(connections[0].host, connections[0].port,
                            effective_user=effective_user,
                            use_sasl=use_sasl)
    else:
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn, effective_user=effective_user,
                          use_sasl=use_sasl)
    return client
def get_bite(self):
    """
    Return the snakebite client, reconnecting first when necessary.

    If Luigi has forked, we have a different PID, and need to reconnect.
    """
    if self.pid != os.getpid() or not self._bite:
        # Drop unset options so snakebite falls back to its own defaults.
        # BUG FIX: the original used a tuple-unpacking lambda
        # ``lambda (k, v):`` and ``.iteritems()`` — both Python-2-only
        # (PEP 3113 removed the former); this comprehension is valid on
        # Python 2 and 3 and behaves identically.
        client_kwargs = {
            k: v
            for k, v in {
                'hadoop_version': self.config.getint("hdfs", "client_version", None),
                'effective_user': self.config.get("hdfs", "effective_user", None),
            }.items()
            if v is not None and v != ''
        }
        if self.config.getboolean("hdfs", "snakebite_autoconfig", False):
            # AutoConfigClient reads ``${HADOOP_PATH}/conf/hdfs-site.xml``
            # to locate the namenode. It is fully backwards compatible
            # with the vanilla Client and can be used for a non-HA
            # cluster as well; the behaviour is the same as Client.
            from snakebite.client import AutoConfigClient
            self._bite = AutoConfigClient(**client_kwargs)
        else:
            from snakebite.client import Client
            self._bite = Client(
                self.config.get("hdfs", "namenode_host"),
                self.config.getint("hdfs", "namenode_port"),
                **client_kwargs)
    return self._bite
def build_hdfs_client():
    """Create the module-level snakebite client and warm it up."""
    # pylint: disable=global-statement
    global _hdfs_client
    _hdfs_client = AutoConfigClient()
    # Issue a cheap RPC to warm up the connection to the namenode;
    # otherwise the first real call may take 3+ minutes.
    _hdfs_client.df()
def get_bite(self):
    """
    Return the snakebite client, reconnecting first when necessary.

    If Luigi has forked, we have a different PID, and need to reconnect.
    """
    if self.pid != os.getpid() or not self._bite:
        autoconfig_enabled = self.config.getboolean(
            "hdfs", "snakebite_autoconfig", False)
        if autoconfig_enabled is True:
            # AutoConfigClient reads ``${HADOOP_PATH}/conf/hdfs-site.xml``
            # to locate the namenode. It is fully backwards compatible
            # with the vanilla Client and can be used for a non-HA
            # cluster as well; the behaviour is the same as Client.
            from snakebite.client import AutoConfigClient
            self._bite = AutoConfigClient()
        else:
            from snakebite.client import Client
            # BUG FIX: the original wrapped the Client construction in a
            # bare ``except:`` (which also traps SystemExit and
            # KeyboardInterrupt), so genuine connection errors were
            # silently retried without a version. Only the optional
            # config lookup is treated as best-effort now.
            try:
                ver = self.config.getint("hdfs", "client_version")
            except Exception:
                ver = None
            version_kwargs = {} if ver is None else {'hadoop_version': ver}
            self._bite = Client(
                self.config.get("hdfs", "namenode_host"),
                self.config.getint("hdfs", "namenode_port"),
                **version_kwargs)
    return self._bite
def test_autoconfig_client_trash_false(self, environ_get):
    """Trash stays disabled when the environment does not enable it."""
    environ_get.return_value = False
    # Point the config discovery at the HA fixture files for this test.
    core_site = self.get_config_path('ha-core-site.xml')
    hdfs_site = self.get_config_path('ha-noport-hdfs-site.xml')
    HDFSConfig.core_try_paths = (core_site,)
    HDFSConfig.hdfs_try_paths = (hdfs_site,)
    self.assertFalse(AutoConfigClient().use_trash)
def mv(src, dest, overwrite=False):
    """
    Move/rename a path on HDFS.

    src (str) : Source path on HDFS
    dest (str) : Destination path on HDFS
    overwrite (boolean) : Overwrite dest if exists
    """
    # rename2 returns a generator; drain it so the rename is executed.
    results = AutoConfigClient().rename2(src, dest, overwrite)
    list(results)
def mkdir(hdfs_path, create_parent=False, mode=0o755):
    """
    Create a directory on HDFS.

    hdfs_path (str) : Path to create
    create_parent (boolean) : Also create the parent directories
    mode (int) : Mode the directory should be created with

    Returns: list of mkdir results
    """
    # BUG FIX: the default was written as ``0755`` — Python-2-only octal
    # syntax that is a SyntaxError on Python 3; ``0o755`` is the same
    # value and is accepted by both.
    client = AutoConfigClient()
    return list(client.mkdir([hdfs_path], create_parent, mode))
def main():
    """List the HDFS root and print each entry, using a local Hadoop conf."""
    hadoop_conf_dir = "/media/d2/code-sky/dockers/hadoop/etc/hadoop"
    # AutoConfigClient discovers the namenode via $HADOOP_CONF_DIR.
    os.environ['HADOOP_CONF_DIR'] = hadoop_conf_dir
    cli = AutoConfigClient()
    target_hdfs_path = "/"
    # FIX: removed the unused ``file_dict`` local present in the original.
    for element in cli.ls([target_hdfs_path]):
        print("Result: " + str(element))
def rm(hdfs_path, recurse=False, force=False):
    """
    Delete a path on HDFS.

    hdfs_path (str or list of strings) : hdfs files to delete
    recurse (boolean) : recursively delete the folder
    force (boolean) : accepted for API compatibility but currently
        unused — snakebite's delete is already non-interactive

    Returns: list of delete results
    """
    # NOTE(review): the original docstring claimed a "mkdir result" — a
    # copy/paste slip from the sibling mkdir() helper; fixed above.
    client = AutoConfigClient()
    return list(client.delete([hdfs_path], recurse))
def get_conn(self) -> Any:
    """
    Returns a snakebite HDFSClient object.
    """
    # When using HAClient, proxy_user must be the same, so it is ok to
    # always take the first connection.
    effective_user = self.proxy_user
    autoconfig = self.autoconfig
    use_sasl = conf.get('core', 'security') == 'kerberos'
    try:
        connections = self.get_connections(self.hdfs_conn_id)
        if not effective_user:
            effective_user = connections[0].login
        if not autoconfig:
            autoconfig = connections[0].extra_dejson.get(
                'autoconfig', False)
        hdfs_namenode_principal = connections[0].extra_dejson.get(
            'hdfs_namenode_principal')
    except AirflowException:
        # No stored connection: we can still proceed, but only when
        # autoconfig was requested explicitly.
        if not autoconfig:
            raise
    if autoconfig:
        # Will read config info from the $HADOOP_HOME conf files.
        return AutoConfigClient(effective_user=effective_user,
                                use_sasl=use_sasl)
    if len(connections) == 1:
        return Client(
            connections[0].host,
            connections[0].port,
            effective_user=effective_user,
            use_sasl=use_sasl,
            hdfs_namenode_principal=hdfs_namenode_principal,
        )
    if len(connections) > 1:
        namenodes = [Namenode(conn.host, conn.port) for conn in connections]
        return HAClient(
            namenodes,
            effective_user=effective_user,
            use_sasl=use_sasl,
            hdfs_namenode_principal=hdfs_namenode_principal,
        )
    raise HDFSHookException("conn_id doesn't exist in the repository "
                            "and autoconfig is not specified")
def ls(hdfs_path, recurse=False, include_toplevel=True, include_children=False):
    """
    List a path on HDFS.

    Parameters:
    paths (list) : Paths to list
    recurse (boolean) : Recursive listing
    include_toplevel (boolean) : Include the given path in the listing.
        If the path is a file, include_toplevel is always True.
    include_children (boolean) : Include child nodes in the listing.

    Returns: (list) path listings with attributes
    """
    client = AutoConfigClient()
    listing = client.ls([hdfs_path], recurse, include_toplevel,
                        include_children)
    return LsObject(list(listing))
import urllib2, os, json, pytz, sys import datetime, calendar, pprint import argparse import snakebite import subprocess from snakebite.client import AutoConfigClient client = AutoConfigClient() # download.py # modified version of /home/zsb739/code/libs/ripe-measurement-downloader/experiment_launcher/download.py # This script downloads data from ripe atlas and stores it in the hdfs def parse_args(): parser = argparse.ArgumentParser( description='Download daily RIPE data for the provided ' 'measurement ID number') parser.add_argument( 'measurement', type=int, nargs="+", help="The integer identification number for the desired " "measurement") return parser.parse_args() def days(start, stop=None): if stop == None: curr_time = datetime.datetime.utcnow() stop_time = datetime.datetime(curr_time.year, curr_time.month,
def __init__(self):
    """Verify the hadoop binary is runnable, then build the HDFS client."""
    # Fails fast (raises) when 'hadoop' cannot be executed.
    check_output("hadoop")
    self.fs = AutoConfigClient()
def __init__(self):
    """Create the snakebite client from the local Hadoop configuration."""
    self._client = AutoConfigClient()
parser.add_argument('-t', '--title', default='Cutflow Efficiency') parser.add_argument( '-x', '--NoX', action='store_true', help='This argument suppresses showing plots via X-forwarding') parser.add_argument('-o', '--NoOutput', action='store_true', help='This argument suppresses the output of PDF plots') args = parser.parse_args() df_list = [] file_list = [] fs = AutoConfigClient() HT_eff_tot = [] MHT_eff_tot = [] BDP_eff_tot = [] NBJet_eff_tot = [] NJet_eff_tot = [] NVeto_eff_tot = [] M_sq = [] M_lsp = [] for f in fs.text([args.file + '/*/ROOTCuts_output/ROOTCuts.txt']): df = pd.read_csv(StringIO(f), delimiter=r'\s+') df_HT = df.loc[(df['HT'] > 1200.)] df_MHT = df.loc[(df['MHT'] > 200.)] df_NBJet = df.loc[(df['NBJet'] > 1)]