def process():
    # Probe every configured namenode and return the first client that responds.
    for key in urlMaping.keys():
        client = KerberosClient(urlMaping[key], root=root, proxy=proxy)
        try:
            client.list("/")
            return client
        except Exception:
            continue
def testip(ip, root=None, proxy=None):
    print(ip)
    if ip == '':
        return process()
    else:
        client = KerberosClient(urlMaping[ip], root=root, proxy=proxy)
        try:
            print('test %s' % urlMaping[ip])
            client.list("/")
            return client
        except Exception:
            return process()
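# Hedged sketch (not part of the original snippets): the two helpers above assume a
# module-level `urlMaping` dict mapping a key (e.g. an IP string) to a WebHDFS URL,
# plus `root` and `proxy` variables. The hostnames below are placeholders.
from hdfs.ext.kerberos import KerberosClient

urlMaping = {
    '10.0.0.1': 'http://namenode1.example.com:50070',
    '10.0.0.2': 'http://namenode2.example.com:50070',
}
root = None
proxy = None

client = testip('10.0.0.1')  # testip('') falls back to probing every entry via process()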
def get_model(self):
    client = KerberosClient(settings.DUNANT_HDFS_PATH)
    MODEL_DIR = settings.DUNANT_MODEL_DIR
    MOST_RECENT_MODEL = sorted(client.list(MODEL_DIR))[-1]
    MODEL_PARAMETERS_PATH = f'{MODEL_DIR}/{MOST_RECENT_MODEL}/model'
    MLB_PATH = f'{MODEL_PARAMETERS_PATH}/mlb_binarizer.pkl'
    VECTORIZER_PATH = f'{MODEL_PARAMETERS_PATH}/vectorizer.pkl'
    CLASSIFIER_PATH = f'{MODEL_PARAMETERS_PATH}/model.pkl'

    # For pickle to be able to unpickle, the class must be present in the
    # same import structure as when it was pickled.
    # Manually setting sys.modules to mimic the expected import structure
    sys.modules['models'] = classifiers

    # Latin-1 encoding required to convert Python 2 pickles to Python 3
    with client.read(MLB_PATH) as r:
        mlb = pickle.loads(r.read(), encoding="latin1")
    with client.read(VECTORIZER_PATH) as r:
        vectorizer = pickle.loads(r.read(), encoding="latin1")
    with client.read(CLASSIFIER_PATH) as r:
        clf = pickle.loads(r.read(), encoding="latin1")

    del sys.modules['models']

    return mlb, vectorizer, clf
def hdfs_connect_demo():
    # NOTE: kinit is invoked under the hood by krbContext
    with krbContext(using_keytab=True,
                    principal='*****@*****.**',
                    keytab_file='/houleilei.client.keytab'):
        client = KerberosClient('http://hadoop01.stor:50070',
                                hostname_override='hadoop01.stor')
        # client = InsecureClient('http://hadoop01.stor:50070', user='******')
        result = client.list('/home/holyzing/')
        print(type(result), result)
def get_client(self, block_params=None, connection_params=None):
    try:
        kerb_auth = False
        method = "https"
        if "https" in connection_params:
            if connection_params["https"]:
                method = "https"
            else:
                method = "http"
        host_name = connection_params["hostName"]
        port = connection_params["port"]
        if 'kerberos' in connection_params:
            kerb_auth = bool(connection_params['kerberos'])
        if kerb_auth:
            principal = generate_ticket_granting_ticket(
                block_params, connection_params["authName"])
            session = requests.Session()
            session.verify = False
            full_host = "%s://%s:%s" % (method, host_name, port)
            client = KerberosClient(url=full_host,
                                    session=session,
                                    mutual_auth='OPTIONAL',
                                    principal=principal)
            client.list('/')
            return client
        else:
            hadoop_host = host_name + ":" + port
            client = InsecureClient("http://" + hadoop_host)
            client.list('/')
            return client
    except Exception as e:
        self.logger.error(
            "Error Occurred While Connecting to HDFS With Given Connection Details")
        raise e
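# Hedged sketch (assumed, not from the original source): the shape of the
# connection_params dict that get_client() reads above. All values are placeholders.
example_connection_params = {
    "hostName": "namenode.example.com",
    "port": "50070",
    "https": False,
    "kerberos": True,
    "authName": "user@EXAMPLE.COM",
}
# client = self.get_client(block_params=None, connection_params=example_connection_params)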
def __init__(self, hdfs_urls, path_hdfs='./', max_file_size=MAX_FILE_SIZE,
             max_process=4, log_level='INFO'):
    """
    :param hdfs_urls list[str]: HDFS namenode URLs (ex: ['X'])
    :param path_hdfs str: path to write files to in HDFS
    :param max_file_size int: size limit before creating a new file and saving
        the current file to HDFS (compressed)
    :param max_process int: number of subprocesses used to compress and write
        files to HDFS (max_process > 0)
    :param log_level str: logger level
    """
    # Configure logger
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    self.logger = logging.getLogger('WriteHdfs')
    self.logger.addHandler(stream_handler)
    self.logger.setLevel(log_level)

    # Configure signal handler for a clean exit
    signal.signal(signal.SIGINT, self.__signal_handler)

    # Try to find the active namenode in the list
    for hdfs_url in hdfs_urls:
        try:
            hdfs_client = KerberosClient(hdfs_url)
            hdfs_client.list(path_hdfs)
            self.hdfs_url = hdfs_url
            self.logger.info('identified namenode: %s' % hdfs_url)
            break
        except hdfs.util.HdfsError:
            continue

    self.path_hdfs = path_hdfs
    self.max_process = max_process

    # File settings
    self.file_size = 0
    self.file_name = self.__generate_file_name()
    self.max_file_size = max_file_size
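# Hedged usage sketch: assuming the constructor above belongs to a class named WriteHdfs
# (inferred from its logger name, not confirmed by the source); the URLs and path below
# are placeholders for the cluster's WebHDFS namenodes.
writer = WriteHdfs(
    hdfs_urls=['http://namenode1.example.com:50070',
               'http://namenode2.example.com:50070'],
    path_hdfs='/data/incoming/',
    max_process=4,
    log_level='INFO',
)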
HDFS_USER = options['hdfs_user']
HDFS_MODEL_DIR = options['hdfs_model_dir']
EVALUATE_SAVE_GSPREAD = options['evaluate_save_gspread']

FORMATTED_HDFS_PATH = "/".join(HDFS_MODEL_DIR.split('/')[5:])

print('Running Evaluate script:')
print('Connecting to HDFS and Oracle database...')
client = KerberosClient(HDFS_URL)
conn = jdbc.connect("oracle.jdbc.driver.OracleDriver",
                    URL_ORACLE_SERVER,
                    [USER_ORACLE, PASSWD_ORACLE],
                    ORACLE_DRIVER_PATH)
curs = conn.cursor()

MOTIVOS_DICT = get_motivos_declarados(curs)

model_dates = sorted(client.list(FORMATTED_HDFS_PATH))

validated_datasets = []
classified_datasets = []
for model_date in model_dates:
    try:
        data_hdfs = get_results_from_hdfs(client,
                                          FORMATTED_HDFS_PATH,
                                          model_date=model_date)
    except BaseException:
        continue

    # Results are stored as a tuple represented as a string
    data_hdfs['MDEC_DK'] = data_hdfs['MDEC_DK'].apply(
        lambda x: ast.literal_eval(x))
    keys = get_keys(data_hdfs, 'SNCA_DK')
        principal=config['kerberos_principal'],
        keytab_file=config['keytab_file'],
        ccache_file=config['kerberos_cache_file']):

    # hive.Connection()
    # host is the node running HiveServer2; port defaults to 10000, the HS2 port
    con = hive.connect(host='uatnd02.csdntest.com.local',
                       port=10000,
                       auth='KERBEROS',
                       kerberos_service_name="hive")
    cursor = con.cursor()
    cursor.execute('select * from dl_nccp.account limit 5')  # no trailing semicolon allowed!
    # cursor.execute('desc dl_nccp.account')  # no trailing semicolon allowed!
    datas = cursor.fetchall()
    print(datas)
    cursor.close()
    con.close()

    conn = dbapi.connect(host='uatnd02.csdntest.com.local',
                         port=10000,
                         auth_mechanism='GSSAPI',
                         kerberos_service_name="hive")
    cursor = conn.cursor()

    # HDFS with Kerberos
    client = KerberosClient('http://hdfs_ip:50070', hostname_override="hdfs hostname")
    # Other commonly used client methods:
    # client._list_status('/path'), client.list('/path'), client.delete('/path'),
    # client.upload('/hdfs/path', 'local/path'), client.download('/hdfs/path', 'local/path')
    client.makedirs('test')
from hdfs.ext.kerberos import KerberosClient

if __name__ == "__main__":
    client = KerberosClient("http://10.214.208.11:9000")
    client.list("/")
def execute_process(args):
    app_name = "criar_tabela_tce"
    spark = pyspark.sql.session.SparkSession \
        .builder \
        .appName(app_name) \
        .config("hive.exec.dynamic.partition.mode", "nonstrict") \
        .enableHiveSupport() \
        .getOrCreate()

    client = KerberosClient(args.webHdfs)
    hdfs_files = client.list(args.pathDirectoryBase)

    for directory in hdfs_files:
        try:
            actual_directory = args.pathDirectoryBase + directory
            df = spark.read.text(actual_directory)
            if not df.rdd.isEmpty():
                # df = spark.read.load(actual_directory, format="csv", multiLine=True,
                #                      sep=args.delimiter, inferSchema=True, header=True)
                columns_types = params_table.table_columns_type[directory]
                df = spark.read.option("quote", "\"") \
                    .option("escape", "\"") \
                    .load(actual_directory, format="csv", sep=args.delimiter,
                          header=True)
                columns = [
                    trait_columns_name(column_name) for column_name in df.columns
                ]
                df = df.toDF(*columns)
                df = reduce(check_type, columns_types, df)
                # df = reduce(remove_break_lines, df.dtypes, df)

                table_hive = "{}.{}".format(args.schemaHive, directory)
                table_postgres = "{}.{}".format(args.schemaPostgres, directory)

                df.write.mode("overwrite").format("parquet").saveAsTable(table_hive)
                spark.sql("ANALYZE TABLE {} COMPUTE STATISTICS".format(table_hive))
                execute_compute_stats(table_hive)

                export_to_postgres(df, args, table_postgres)
                send_log(SUCCESS_MESSAGE.format(directory), app_name, SUCCESS,
                         args.solrServer, args.source)
        except Exception as message:
            send_log(ERROR_MESSAGE.format(directory, message), app_name, ERROR,
                     args.solrServer, args.source)
df = df.groupby([ID_COLUMN, TEXT_COLUMN])\
       .agg(lambda x: set(x))\
       .reset_index()

nb_new_documents = len(df)
if nb_new_documents == 0:
    print('No new data to predict!')
    sys.exit()
else:
    print('{} new documents to predict.\n'.format(nb_new_documents))

X = np.array(df[TEXT_COLUMN])

print('Loading models...')
formatted_hdfs_path = "/".join(HDFS_MODEL_DIR.split('/')[5:])
most_recent_date = sorted(client.list(formatted_hdfs_path))[-1]

with client.read('{}/{}/model/mlb_binarizer.pkl'.format(
        formatted_hdfs_path, most_recent_date)) as mlb_reader:
    mlb = pickle.loads(mlb_reader.read())
with client.read('{}/{}/model/vectorizer.pkl'.format(
        formatted_hdfs_path, most_recent_date)) as vectorizer_reader:
    vectorizer = pickle.loads(vectorizer_reader.read())
with client.read('{}/{}/model/model.pkl'.format(
        formatted_hdfs_path, most_recent_date)) as clf_reader:
    clf = pickle.loads(clf_reader.read())

print('Predicting...')
reg_clf = RegexClassifier(RULES)
y_regex = reg_clf.predict(X)
y_regex = mlb.transform(y_regex)
class OperateHDFS:
    def __init__(self, url):
        '''
        :param url: hostname or IP address of the HDFS namenode, prefixed with the
            protocol and followed by the namenode's WebHDFS port; several URLs may
            also be given, separated by semicolons, for High Availability support.
        '''
        # Instantiate an HDFS web client using Kerberos authentication
        self.client = KerberosClient(url)

    def file_list(self, file_path):
        '''
        :param file_path: remote HDFS directory path
        :return: all files contained in the remote directory
        '''
        file_detail = self.client.list(hdfs_path=file_path)
        return file_detail

    def file_read(self, file_path):
        '''
        Read a file from HDFS
        :param file_path: remote HDFS file path
        :return:
        '''
        lines = []
        with self.client.read(hdfs_path=file_path, encoding='utf-8',
                              delimiter='\n') as reader:
            # content = file.read()
            # print(content)
            for item in reader:
                lines.append(item.strip())
        return lines

    def file_create_write(self, file_path, data_write):
        '''
        Create a new file in HDFS and write content to it
        :param file_path: remote HDFS file path
        :param data_write: data to write to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path, data=data_write, encoding='utf-8')

    def file_append_write(self, file_path, data_append):
        '''
        Append content to a file already present in HDFS; the file must exist
        :param file_path: remote HDFS file path
        :param data_append: data to append to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path, data=data_append,
                          encoding='utf-8', append=True)

    def file_rename(self, src_file_path, dst_file_path):
        '''
        Rename/move a file or folder
        :param src_file_path: source file path
        :param dst_file_path: destination file path
        :return:
        '''
        self.client.rename(hdfs_src_path=src_file_path, hdfs_dst_path=dst_file_path)

    def mkdir(self, file_path):
        '''
        Create a remote directory in HDFS, recursively if necessary
        :param file_path: path of the folder to create (including its name)
        :return:
        '''
        self.client.makedirs(hdfs_path=file_path)

    def upload_files(self, file_path, local_path):
        '''
        Upload a file or directory to HDFS
        :param file_path: target HDFS path. If it already exists and is a directory,
            the files will be uploaded into it.
        :param local_path: local path of the file or folder. If it is a folder, all
            files inside it will be uploaded (note that this means folders containing
            no files will not be created remotely)
        :return: hdfs_path_return: on success, this method returns the remote upload path
        '''
        hdfs_path_return = self.client.upload(hdfs_path=file_path, local_path=local_path)
        return hdfs_path_return

    def download_files(self, file_path, local_path):
        '''
        Download a file or folder from HDFS and save it locally
        :param file_path: path of the file or folder to download from HDFS.
            If it is a folder, all files under it will be downloaded
        :param local_path: local path. If it already exists and is a directory,
            the files will be downloaded into it
        :return: local_path_return: on success, this method returns the local download path
        '''
        local_path_return = self.client.download(hdfs_path=file_path, local_path=local_path)
        return local_path_return

    def delete_files(self, file_path):
        '''
        Delete a file or directory from HDFS
        :param file_path: path of the file or directory to delete in HDFS
        :return: this function returns True if the deletion succeeded, and False if no
            file or directory previously existed at `hdfs_path`
        '''
        # recursive: recursively delete files and directories. By default, this method
        # raises an HdfsError when attempting to delete a non-empty directory.
        # skip_trash: when set to False, the deleted path is moved to the corresponding
        # trash folder instead of being deleted. This requires Hadoop 2.9+ and trash
        # enabled on the cluster.
        return self.client.delete(hdfs_path=file_path, recursive=False, skip_trash=True)

    def set_files_permission(self, file_path):
        '''
        Change a file's permissions
        :param file_path: path of the file whose permissions need to be changed
        :return:
        '''
        # permission: the file's new octal permission string
        self.client.set_permission(hdfs_path=file_path, permission=None)
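# Hedged usage sketch for the OperateHDFS wrapper above (the URL and paths are
# placeholders; a semicolon-separated URL pair could be passed instead for HA,
# as noted in __init__).
hdfs_ops = OperateHDFS('http://namenode1.example.com:50070')
hdfs_ops.mkdir('/tmp/demo')
hdfs_ops.file_create_write('/tmp/demo/hello.txt', 'hello hdfs\n')
hdfs_ops.file_append_write('/tmp/demo/hello.txt', 'second line\n')
print(hdfs_ops.file_list('/tmp/demo'))
print(hdfs_ops.file_read('/tmp/demo/hello.txt'))
hdfs_ops.delete_files('/tmp/demo/hello.txt')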
import os

from requests_kerberos import HTTPKerberosAuth
from hdfs.ext.kerberos import KerberosClient

url = "http://alderamin.sdab.sn:50070;http://fomalgaut.sdab.sn:50070"
os.environ["KRB5_CLIENT_KTNAME"] = \
    "/home/doopy/pyprojects/csp-ba-bas_logs_mapreduce/doopy.keytab"
kerberos_auth = HTTPKerberosAuth(principal="*****@*****.**")

client = KerberosClient(url)
print(client.list('/tmp/'))
""" ******************* *Copyright 2017, MapleLabs, All Rights Reserved. * ******************** """ import sys from hdfs.ext.kerberos import KerberosClient from hdfs.client import InsecureClient from requests import Session from requests_kerberos import HTTPKerberosAuth, DISABLED session = Session() session.verify = False kerberos_auth = HTTPKerberosAuth(mutual_authentication=DISABLED, force_preemptive=True, principal='') session.auth = kerberos_auth client = KerberosClient("", session=session) #client = InsecureClient("", session=session) file = sys.argv[1] destfile = sys.argv[2] print client.list('/mr-history/done') client.download(file, destfile, overwrite=True)
from hdfs.ext.kerberos import KerberosClient

client = KerberosClient('http://X:50070')

# Listing all files inside a directory.
fnames = client.list('.')
print(fnames)