def __init__(self, url):
    """
    :param url: Hostname or IP address of the HDFS namenode, prefixed with
        the protocol and followed by the namenode's WebHDFS port. Multiple
        URLs may also be specified, separated by semicolons, for High
        Availability support.
    """
    # Instantiate the HDFS web client using Kerberos authentication.
    self.client = KerberosClient(url)
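# Usage sketch (hostnames hypothetical): the hdfs library accepts several
# semicolon-separated namenode URLs in one string, so a client built this
# way fails over between High Availability namenodes transparently.
from hdfs.ext.kerberos import KerberosClient

ha_client = KerberosClient(
    'http://nn1.example.com:50070;http://nn2.example.com:50070')
print(ha_client.status('/'))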
def get_conn(self):
    """
    Returns a hdfscli InsecureClient object.
    """
    nn_connections = self.get_connections(self.webhdfs_conn_id)
    for nn in nn_connections:
        try:
            self.log.debug('Trying namenode %s', nn.host)
            connection_str = 'http://{nn.host}:{nn.port}'.format(nn=nn)
            if _kerberos_security_mode:
                client = KerberosClient(connection_str)
            else:
                proxy_user = self.proxy_user or nn.login
                client = InsecureClient(connection_str, user=proxy_user)
            client.status('/')
            self.log.debug('Using namenode %s for hook', nn.host)
            return client
        except HdfsError as e:
            self.log.debug(
                "Read operation on namenode {nn.host} "
                "failed with error: {e}".format(**locals())
            )
    nn_hosts = [c.host for c in nn_connections]
    no_nn_error = ("Read operations failed "
                   "on the namenodes below:\n{}".format("\n".join(nn_hosts)))
    raise AirflowWebHDFSHookException(no_nn_error)
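# Hedged usage sketch for the hook above, assuming an Airflow connection
# named 'webhdfs_default' is configured; this import path matches older
# Airflow releases and may differ in newer provider packages.
from airflow.hooks.webhdfs_hook import WebHDFSHook

hook = WebHDFSHook(webhdfs_conn_id='webhdfs_default')
client = hook.get_conn()  # tries each configured namenode until one answers
print(client.status('/'))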
def transform(outDir, image, x, y, dt):
    plt.switch_backend('agg')
    plt.figure(figsize=(25, 15), dpi=100)
    p = Proj(proj='geos', h=satHeight, lon_0=satLongitude, sweep=satSweep)
    XX, YY = np.meshgrid(x, y)
    lons, lats = p(XX, YY, inverse=True)
    mH = Basemap(resolution='i', projection='lcc', area_thresh=1500,
                 width=1800 * 3000, height=1060 * 3000,
                 lat_1=38.5, lat_2=38.5, lat_0=38.5, lon_0=-97.5)
    xH, yH = mH(lons, lats)
    rgb = image[1][:, :-1, :]
    rgb = rgb / 256.0
    colorTuple = rgb.reshape((rgb.shape[0] * rgb.shape[1]), 3)
    colorTuple = np.insert(colorTuple, 3, 1.0, axis=1)
    newmap = mH.pcolormesh(xH, yH, image[1][:, :, 0], color=colorTuple,
                           linewidth=0)
    newmap.set_array(None)
    mH.drawstates()
    mH.drawcountries()
    mH.drawcoastlines()
    # plt.title('GOES-16 Pseudo Color\n%s' % dt.strftime('%B %d, %Y UTC'))
    buf = BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
    buf.seek(0)
    client = KerberosClient('http://hc.gps.stthomas.edu:50070')
    with client.write(outDir + '/TRANSFORM_' + image[0].split("/")[-1],
                      overwrite=True) as writer:
        writer.write(buf.getvalue())
    buf.close()
def execute_process(args):
    directory = args.directory
    linux_path = args.linuxPath
    file_name = args.fileName
    end_file_name = file_name.replace(".csv", "_done.csv")
    full_path = '{}{}/'.format(linux_path, directory)
    full_file_name_path = '{}{}'.format(full_path, end_file_name)
    # Open in text mode with newline='' as the csv module requires on
    # Python 3 (the original used binary mode, a Python 2 idiom).
    with open('{}{}'.format(full_path, file_name), 'r', newline='') as read:
        with open(full_file_name_path, 'w', newline='') as file_write:
            reader = csv.reader(read, delimiter=';', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            wr = csv.writer(file_write, delimiter=';')
            for row in reader:
                new_row = [data.replace("\n", " ").replace("\r", " ")
                           for data in row]
                wr.writerow(new_row)
    client = KerberosClient(args.webHdfs)
    client.upload(args.hdfsPath + directory, full_file_name_path,
                  n_threads=5, overwrite=True)
def get_conn(self):
    """
    Returns a hdfscli InsecureClient object.
    """
    nn_connections = self.get_connections(self.webhdfs_conn_id)
    for nn in nn_connections:
        try:
            logging.debug("Trying namenode {}".format(nn.host))
            connection_str = "http://{nn.host}:{nn.port}".format(nn=nn)
            if _kerberos_security_mode:
                client = KerberosClient(connection_str)
            else:
                proxy_user = self.proxy_user or nn.login
                client = InsecureClient(connection_str, user=proxy_user)
            client.status("/")
            logging.debug("Using namenode {} for hook".format(nn.host))
            return client
        except HdfsError as e:
            logging.debug(
                "Read operation on namenode {nn.host} failed with"
                " error: {e.message}".format(**locals())
            )
    nn_hosts = [c.host for c in nn_connections]
    no_nn_error = "Read operations failed on the namenodes below:\n{}".format(
        "\n".join(nn_hosts)
    )
    raise Exception(no_nn_error)
def process():
    for key in urlMaping.keys():
        client = KerberosClient(urlMaping[key], root=root, proxy=proxy)
        try:
            client.list("/")
            return client
        except Exception:  # narrowed from a bare except; try the next namenode
            continue
def save_file_hdfs(rdd, dir_files_pdf, server_hdfs, user_name_hdfs):
    n_file_id = int(rdd[0])
    n_info_tec = rdd[1].replace("/", "-")
    n_file = rdd[2]
    hdfsclient = KerberosClient(server_hdfs)
    hdfsclient.write(
        os.path.join(dir_files_pdf,
                     '{}_{}.pdf'.format(n_file_id, n_info_tec)),
        n_file, overwrite=True)
    return rdd
def hdfs_connect_demo():
    # NOTE: krbContext runs kinit under the hood.
    with krbContext(using_keytab=True, principal='*****@*****.**',
                    keytab_file='/houleilei.client.keytab'):
        client = KerberosClient('http://hadoop01.stor:50070',
                                hostname_override='hadoop01.stor')
        # client = InsecureClient('http://hadoop01.stor:50070', user='******')
        result = client.list('/home/holyzing/')
        print(type(result), result)
def testip(ip, root=None, proxy=None):
    print(ip)
    if ip == '':
        return process()
    else:
        client = KerberosClient(urlMaping[ip], root=root, proxy=proxy)
        try:
            print('test %s' % urlMaping[ip])
            client.list("/")
            return client
        except Exception:  # fall back to scanning all namenodes
            return process()
def get_client(host, use_kerberos):
    if use_kerberos:
        from hdfs.ext.kerberos import KerberosClient
        return KerberosClient(host)
    else:
        from hdfs.client import Client
        return Client(host)
def client(self):
    # type: () -> WebHDFS
    if self.client_type == WebHdfsClientType.KERBEROS:
        from hdfs.ext.kerberos import KerberosClient
        return KerberosClient(
            url=self.url,
            mutual_authentication=self.mutual_authentication,
            service=self.service,
            delegate=self.delegate,
            force_preemptive=self.force_preemptive,
            principal=self.principal,
            hostname_override=self.hostname_override,
            sanitize_mutual_error_response=self.sanitize_mutual_error_response,
            send_cbt=self.send_cbt,
        )
    elif self.client_type == WebHdfsClientType.INSECURE:
        from hdfs import InsecureClient
        return InsecureClient(url=self.url, user=self.user)
    elif self.client_type == WebHdfsClientType.TOKEN:
        from hdfs import TokenClient
        return TokenClient(url=self.url, token=self.token)
    else:
        raise Exception("WebHdfs client type %s does not exist"
                        % self.client_type)
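# For reference, a minimal sketch of the three hdfs client flavors the
# property above dispatches between, used directly (URL, user, and token
# are hypothetical placeholders):
from hdfs import InsecureClient, TokenClient
from hdfs.ext.kerberos import KerberosClient

url = 'http://namenode.example.com:50070'
insecure = InsecureClient(url, user='hdfs')
token = TokenClient(url, token='my-delegation-token')
kerberos = KerberosClient(url, mutual_auth='OPTIONAL')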
def hdfs_client_ini(conf):
    _conf = conf
    _url = ''
    _nodes = []
    for _node in _conf['namenodes']:
        _nodes.append('http://' + str(_node) + ':' + str(_conf['port']))
    _url = ';'.join(_nodes)
    if os.path.isfile(_conf['keytab']):
        _conf_keytab = _conf['keytab']
    else:
        _conf_keytab = (str(os.path.dirname(os.path.realpath(__file__)))
                        + os.sep + str(_conf['keytab']))
    try:
        os.environ["KRB5_CLIENT_KTNAME"] = _conf_keytab
    except Exception as _err:
        print('ERR: [initiator:hdfs_client_ini]', _err)
        return False
    try:
        # Note: this auth object is never passed to KerberosClient below;
        # the client builds its own HTTPKerberosAuth internally.
        _kerberos_auth = HTTPKerberosAuth(principal=_conf['principal'])
    except Exception as _err:
        print('ERR: [initiator:hdfs_client_ini]', _err)
        return False
    else:
        try:
            _client = KerberosClient(_url)
        except Exception as _err:
            print('ERR: [initiator:hdfs_client_ini]', _err)
            return False
        else:
            return _client
def run(self):
    # Pre-processing
    tc = BeforeHandler(self.__args, self.__col_info, self.__db_info,
                       self.__props)
    ret = tc.run()
    if ret != 0:
        LOG.error("Pre-load processing failed")
        return ret
    # Load step: upload the file to the target HDFS directory.
    HDFS_WORK_DIR = "{0}/{1}".format(self.__args.loaddir, self.__args.table)
    # put_cmd = "KRB5_CONFIG={0}" \
    #           " && kinit -kt {1} {2}" \
    #           " && hadoop fs -rm -r -f {3}" \
    #           " && hadoop fs -mkdir -p {3}" \
    #           " && hadoop fs -put {4} {5}" \
    #           .format(self.__args.krbfile, self.__args.ktfile,
    #                   self.__args.ktuser, HDFS_WORK_DIR,
    #                   self.__args.srcfile, HDFS_WORK_DIR)
    # LOG.info("HDFS PUT CMD[{0}]".format(put_cmd))
    # ret = os.system(put_cmd)
    # if ret != 0:
    #     LOG.error("Failed to upload the file to HDFS")
    #     return -1
    print("AAA:{0}".format(HDFS_WORK_DIR))
    try:
        # Open the connection
        hdfs_client = KerberosClient(
            self.__args.nnurl,
            principal="{0}".format(self.__args.ktuser))
        # Remove the old directory
        hdfs_client.delete(HDFS_WORK_DIR, recursive=True)
        # Create a fresh directory
        hdfs_client.makedirs(HDFS_WORK_DIR)
        # Upload the file to HDFS
        hdfs_client.upload(HDFS_WORK_DIR, self.__args.srcfile)
    except Exception:
        traceback.print_exc()
        LOG.error("Data load failed")
        return -1
    LOG.info("Data load succeeded")
    # Post-processing
    tc = AfterHandler(self.__args, self.__db_info, self.__props)
    ret = tc.run()
    if ret != 0:
        LOG.error("Post-load processing failed")
        return ret
    return 0
def __get_hdfs_client(self):
    # hdfs_host = "http://10.72.59.89:50070"
    # user = "******"
    if self.is_kerberos:
        cli = KerberosClient(url=hdfs_host)
    else:
        cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
def get_client(namenode_url: str) -> KerberosClient:
    """Thin wrapper around KerberosClient

    Parameters
    ----------
    namenode_url:
        The url of the namenode. Should include protocol (http/https)
        and port
    """
    return KerberosClient(namenode_url)
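# Hypothetical usage of the wrapper above (URL and path are placeholders):
client = get_client('https://namenode.example.com:9870')
print(client.list('/user'))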
def hdfs_connect(host='localhost', port=50070, protocol='webhdfs',
                 use_https='default', auth_mechanism='NOSASL',
                 verify=True, **kwds):
    """
    Connect to HDFS

    Parameters
    ----------
    host : string, Host name of the HDFS NameNode
    port : int, NameNode's WebHDFS port (default 50070)
    protocol : {'webhdfs'}
    use_https : boolean, default 'default'
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
        authentication, the default for this is True, otherwise False.
    auth_mechanism : string, Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify : boolean, Set to False to turn off verifying SSL certificates.
        (default True)

    Other keywords are forwarded to hdfs library classes.

    Returns
    -------
    client : WebHDFS
    """
    import requests
    session = kwds.setdefault('session', requests.Session())
    session.verify = verify
    if auth_mechanism in ['GSSAPI', 'LDAP']:
        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'
        try:
            import requests_kerberos
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`.")
        from hdfs.ext.kerberos import KerberosClient
        # note SSL
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        hdfs_client = InsecureClient(url, **kwds)
    return WebHDFS(hdfs_client)
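# Hedged usage sketch for hdfs_connect (hostname hypothetical): on a
# Kerberos-secured cluster, GSSAPI selects the KerberosClient branch and,
# with use_https left at 'default', an https:// URL.
con = hdfs_connect(host='namenode.example.com', port=50070,
                   auth_mechanism='GSSAPI', verify=False)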
def generate_temp_files(need_certificate=NEED_CERTIFICATE):
    if need_certificate:
        with krbcontext(using_keytab=True, keytab_file=KEYTAB_PATH,
                        principal=PRINCIPAL):
            for node in HDFS.NODES:
                try:
                    hdfs_client = KerberosClient(node)
                    hdfs_client.download(HDFS.REMOTE_PATH, HDFS.LOCAL_PATH,
                                         n_threads=HDFS.THREAD_NUM)
                except Exception as err:
                    logging.info(err)
                else:
                    return
            logging.error("Failed to download remote HDFS file.")
            raise Exception("Failed to download remote HDFS file.")
    else:
        for node in HDFS.NODES:
            try:
                hdfs_client = Client(node)
                hdfs_client.download(HDFS.REMOTE_PATH, HDFS.LOCAL_PATH,
                                     n_threads=HDFS.THREAD_NUM)
            except Exception as err:
                logging.info(err)
            else:
                return
        logging.error("Failed to download remote HDFS file.")
        raise Exception("Failed to download remote HDFS file.")
def initialize_hdfs_client(url):
    global client
    if not client:
        session = Session()
        session.verify = False
        if kerberos['enabled']:
            client = KerberosClient(url, session=session)
        else:
            client = InsecureClient(url, user=hdfs['user'], session=session)
def _get_client(self, connection):
    connection_str = 'http://{host}:{port}'.format(host=connection.host,
                                                   port=connection.port)
    if _kerberos_security_mode:
        client = KerberosClient(connection_str)
    else:
        proxy_user = self.proxy_user or connection.login
        client = InsecureClient(connection_str, user=proxy_user)
    return client
def addMap(outDir, image, satLongitude, xmin, xmax, ymin, ymax, dt):
    plt.switch_backend('agg')
    plt.figure(figsize=(25, 15), dpi=100)
    m = Basemap(projection='geos', lon_0=satLongitude, resolution='i',
                area_thresh=1000, llcrnrx=xmin, llcrnry=ymin,
                urcrnrx=xmax, urcrnry=ymax)
    m.imshow(np.flipud(image[1]))
    m.drawcoastlines()
    m.drawcountries()
    m.drawstates()
    # plt.title('GOES-16 Pseudo Color\n%s' % dt.strftime('%B %d, %Y UTC'))
    buf = BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
    buf.seek(0)
    client = KerberosClient('http://hc.gps.stthomas.edu:50070')
    with client.write(outDir + '/MAP_' + image[0].split("/")[-1],
                      overwrite=True) as writer:
        writer.write(buf.getvalue())
    buf.close()
def _get_client(self, connection: Connection) -> Any:
    connection_str = f'http://{connection.host}:{connection.port}'
    if _kerberos_security_mode:
        client = KerberosClient(connection_str)
    else:
        proxy_user = self.proxy_user or connection.login
        client = InsecureClient(connection_str, user=proxy_user)
    return client
def hdfs_client(self):
    url = 'http://{nn_host}:{webhdfs_port}'.format(
        nn_host=self._nn_host, webhdfs_port=self._webhdfs_port)
    if self._kerberized:
        from hdfs.ext.kerberos import KerberosClient
        client = KerberosClient(url, mutual_auth='REQUIRED')
    else:
        from hdfs.client import InsecureClient
        client = InsecureClient(url, user=self._hdfs_user)
    return client
def _get_client(self, namenode: str, port: int, login: str,
                extra_dejson: dict) -> Any:
    connection_str = f'http://{namenode}:{port}'
    session = requests.Session()
    if extra_dejson.get('use_ssl', False):
        connection_str = f'https://{namenode}:{port}'
        session.verify = extra_dejson.get('verify', True)
    if _kerberos_security_mode:
        return KerberosClient(connection_str, session=session)
    proxy_user = self.proxy_user or login
    return InsecureClient(connection_str, user=proxy_user, session=session)
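# Sketch of the extras dict this method reads (values hypothetical): with
# use_ssl set, the connection string becomes https://<namenode>:<port>, and
# session.verify takes either a boolean or a CA-bundle path, as the
# requests library allows.
extra_dejson = {'use_ssl': True, 'verify': '/etc/ssl/certs/ca-bundle.pem'}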
def client(self):
    # A naive benchmark showed that 1000 existence checks took 2.5 secs
    # when not recreating the client, and 4.0 secs when recreating it. So
    # it is not urgent to memoize it. Note that there *might* be issues
    # with process forking and whatnot (as with the snakebite client) if
    # we memoize it too trivially.
    if self.client_type == 'kerberos':
        from hdfs.ext.kerberos import KerberosClient
        return KerberosClient(url=self.url)
    else:
        import hdfs
        return hdfs.InsecureClient(url=self.url, user=self.user)
def get_model(self):
    client = KerberosClient(settings.DUNANT_HDFS_PATH)
    MODEL_DIR = settings.DUNANT_MODEL_DIR
    MOST_RECENT_MODEL = sorted(client.list(MODEL_DIR))[-1]
    MODEL_PARAMETERS_PATH = f'{MODEL_DIR}/{MOST_RECENT_MODEL}/model'
    MLB_PATH = f'{MODEL_PARAMETERS_PATH}/mlb_binarizer.pkl'
    VECTORIZER_PATH = f'{MODEL_PARAMETERS_PATH}/vectorizer.pkl'
    CLASSIFIER_PATH = f'{MODEL_PARAMETERS_PATH}/model.pkl'
    # For pickle to be able to unpickle, the class must be present in the
    # same import structure as when it was pickled.
    # Manually setting sys.modules to mimic the expected import structure.
    sys.modules['models'] = classifiers
    # Latin-1 encoding is required to load a Python 2 pickle in Python 3.
    with client.read(MLB_PATH) as r:
        mlb = pickle.loads(r.read(), encoding="latin1")
    with client.read(VECTORIZER_PATH) as r:
        vectorizer = pickle.loads(r.read(), encoding="latin1")
    with client.read(CLASSIFIER_PATH) as r:
        clf = pickle.loads(r.read(), encoding="latin1")
    del sys.modules['models']
    return mlb, vectorizer, clf
def create(self):
    """
    Creates a webhdfs client instance. The concrete implementation depends
    on the client_type parameter: if it is 'kerberos', a KerberosClient is
    created; otherwise an InsecureClient.

    :return hdfs client:
    """
    if self.client_type == 'kerberos':
        from hdfs.ext.kerberos import KerberosClient
        return KerberosClient(url=self.url)
    else:
        return hdfs.InsecureClient(url=self.url, user=self.user)
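# Hypothetical usage of the factory above, assuming the enclosing class is
# named WebHdfsClientFactory and stores url, user, and client_type (both
# the class name and the URL are placeholders):
factory = WebHdfsClientFactory(url='http://namenode.example.com:50070',
                               user='hdfs', client_type='kerberos')
client = factory.create()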
def get_hdfs_client(is_kerberos=False):
    """
    :return: client of hdfs
    """
    # hdfs_host = "http://10.18.0.28:50070"
    # user = "******"
    if is_kerberos:
        cli = KerberosClient(url=hdfs_host)
    else:
        cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
def __init__(self, hdfs_urls, path_hdfs='./', max_file_size=MAX_FILE_SIZE,
             max_process=4, log_level='INFO'):
    """
    :param hdfs_urls list[str]: hdfs urls (ex: ['X'])
    :param path_hdfs str: path to write files to in HDFS
    :param max_file_size int: size limit before creating a new file and
        saving the current one to HDFS (compressed)
    :param max_process int: number of subprocesses for compressing and
        writing files to HDFS (max_process > 0)
    :param log_level str: logger level
    """
    # Configure the logger
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    self.logger = logging.getLogger('WriteHdfs')
    self.logger.addHandler(stream_handler)
    self.logger.setLevel(log_level)
    # Configure the exit signal handler
    signal.signal(signal.SIGINT, self.__signal_handler)
    # Try to find the active namenode in the list. Note: if no namenode
    # responds, self.hdfs_url is left unset.
    for hdfs_url in hdfs_urls:
        try:
            hdfs_client = KerberosClient(hdfs_url)
            hdfs_client.list(path_hdfs)
            self.hdfs_url = hdfs_url
            self.logger.info('identified namenode: %s' % hdfs_url)
            break
        except hdfs.util.HdfsError:
            continue
    self.path_hdfs = path_hdfs
    self.max_process = max_process
    # File settings
    self.file_size = 0
    self.file_name = self.__generate_file_name()
    self.max_file_size = max_file_size
def get_conn(self):
    """
    Returns a hdfscli InsecureClient object.
    """
    nn_connections = self.get_connections(self.webhdfs_conn_id)
    for nn in nn_connections:
        try:
            logging.debug('Trying namenode {}'.format(nn.host))
            connection_str = 'http://{nn.host}:{nn.port}'.format(nn=nn)
            if _kerberos_security_mode:
                client = KerberosClient(connection_str)
            else:
                client = InsecureClient(connection_str)
            client.content('/')
            logging.debug('Using namenode {} for hook'.format(nn.host))
            return client
        except HdfsError as e:
            logging.debug("Read operation on namenode {nn.host} failed with"
                          " error: {e.message}".format(**locals()))
    nn_hosts = [c.host for c in nn_connections]
    no_nn_error = "Read operations failed on the namenodes below:\n{}".format(
        "\n".join(nn_hosts))
    raise AirflowWebHDFSHookException(no_nn_error)
def __write_to_hdfs(hdfs_url, path_hdfs, file_name, logger):
    """
    - Compress the local file with zlib
    - Put the compressed file on HDFS
    - Remove the local files

    :param path_hdfs str:
    :param file_name str:
    """
    logger.debug('Start process __write_to_hdfs for file: %s' % file_name)
    # Compress the file. Note: zlib.compress emits a raw zlib stream, not a
    # gzip container, even though the file gets a .gz suffix.
    file_name_zlib = '%s.gz' % file_name
    with open(file_name, 'rb') as f_in:
        with open(file_name_zlib, 'wb') as f_out:
            f_out.write(zlib.compress(f_in.read()))
    # Write the file to HDFS
    try:
        hdfs_client = KerberosClient(hdfs_url)
    except hdfs.util.HdfsError as e:
        logger.error('Error during HDFS connection, wait...: %s' % e)
        time.sleep(10)
        WriteHdfs.__write_to_hdfs(hdfs_url, path_hdfs, file_name, logger)
        return
    file_name_hdfs = file_name_zlib.replace('.tmp', '')
    file_path_hdfs = '%s/%s' % (path_hdfs, file_name_hdfs)
    try:
        hdfs_client.upload(file_path_hdfs, file_name_zlib)
    except hdfs.util.HdfsError as e:
        logger.error('Error during HDFS write, wait...: %s' % e)
        time.sleep(10)
        WriteHdfs.__write_to_hdfs(hdfs_url, path_hdfs, file_name, logger)
        return
    # Remove tmp files
    os.remove(file_name)
    os.remove(file_name_zlib)
    logger.debug('End process __write_to_hdfs for file: %s' % file_name)
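# If a genuine gzip file is wanted, a sketch using the gzip module instead
# of zlib.compress: gzip.open writes an RFC 1952 container that gunzip and
# Hadoop's gzip codec can read, unlike a raw zlib stream. Function name and
# paths are hypothetical.
import gzip

def compress_for_hdfs(src_path, dst_path):
    # Stream the source file into a real gzip container.
    with open(src_path, 'rb') as f_in, gzip.open(dst_path, 'wb') as f_out:
        f_out.write(f_in.read())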
def _get_client(self, connection: Connection) -> Any:
    connection_str = f'http://{connection.host}:{connection.port}'
    session = requests.Session()
    if connection.extra_dejson.get('use_ssl', False):
        connection_str = f'https://{connection.host}:{connection.port}'
        session.verify = connection.extra_dejson.get('verify', True)
    if _kerberos_security_mode:
        client = KerberosClient(connection_str, session=session)
    else:
        proxy_user = self.proxy_user or connection.login
        client = InsecureClient(connection_str, user=proxy_user,
                                session=session)
    return client
from hdfs.ext.kerberos import KerberosClient

if __name__ == "__main__":
    client = KerberosClient("http://10.214.208.11:9000")
    client.list("/")