class HDFSTensorboardManager(base.TensorboardManager):
    """
    Store tfevents files to HDFS.
    """

    @util.preserve_random_state
    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)

        self.hdfs_url = hdfs_url
        self.hdfs_path = hdfs_path
        self.user = user

        self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)
        self.client.makedirs(str(self.sync_path))

    @util.preserve_random_state
    def sync(self) -> None:
        for path in self.to_sync():
            file_name = str(self.sync_path.joinpath(path.name))

            logging.debug(f"Uploading {path} to {self.hdfs_path}")
            self.client.upload(file_name, str(path))

            self._synced_event_sizes[path] = path.stat().st_size
def run_conv_file_local_to_hdfs(configData: ConfigData):
    """
    # client.upload('/shouyinbao/', "/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv", cleanup=True)
    # dat = client.list('/shouyinbao/', status=False)
    # print(dat)

    # root_path = "/home/bd/桌面/201811_flow/zc_shouyinbao/UNZIP/"
    # dest_dir1 = "/home/bd/桌面/201811_flow/zc_shouyinbao/UTF8/"
    # dest_dir2 = "/shouyinbao/zc_shouyinbao/UTF8/"

    # root_path = "/home/testFolder/logflow/bl_shouyinbao/UNZIP/"
    # dest_dir1 = "/home/testFolder/logflow/bl_shouyinbao/UTF8/"
    # dest_dir2 = "/shouyinbao/zc_shouyinbao/UTF8/"

    # i_file = '/home/testFolder/logflow/bl_shouyinbao/20181101/9999100000/t1_trxrecord_20181101_V2.csv'
    # o_file = '/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv'

    :param configData:
    :return:
    """
    f_date_str = configData.get_f_date()  # "20181101"

    a_client = InsecureClient(configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"
    # WebHDFS uses the static user dr.who by default and cannot impersonate other users;
    # this can be changed in the Hadoop configuration via hadoop.http.staticuser.user=dr.who
    # https://www.cnblogs.com/peizhe123/p/5540845.html

    root_path = os.path.join(configData.get_data_path(), f_date_str)
    dest_dir1 = os.path.join(configData.get_utf8_path(), f_date_str)
    dest_dir2 = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))

    print("Start\n")

    f_name = configData.get_file_name(f_date_str)  # "t1_trxrecord_" + the_date + "_V2.csv"

    branches = MyLocalFile.get_child_dir(root_path)
    for aBranch in branches:
        if MyLocalFile.check_branch(aBranch):
            files = MyLocalFile.get_child_file(aBranch)
            f_a_branch = os.path.basename(aBranch)
            for aFile in files:
                if MyLocalFile.check_file(aFile, f_name):
                    to_file1 = str(pathlib.PurePath(dest_dir1).joinpath(f_a_branch, f_name))
                    to_file2 = str(pathlib.PurePosixPath(dest_dir2).joinpath(f_a_branch, f_name))
                    f_add_head = configData.get_hive_add_date(f_a_branch)
                    f_add_end = configData.get_hive_add_date("789")
                    f_need_head = not configData.get_hive_head()  # False
                    MyLocalFile.conv_file_local(aFile, to_file1, need_first_line=f_need_head,
                                                p_add_head=f_add_head, p_add_tail=f_add_end, quoting="")
                    MyHdfsFile.safe_make_dir(a_client, to_file2)
                    # client.newupload(to_file2, to_file1, encoding='utf-8')
                    the_file = a_client.status(to_file2, strict=False)
                    if the_file is None:
                        a_client.upload(to_file2, to_file1)  # , encoding='utf-8')
                        a_client.set_permission(to_file2, 777)
                        # a_client.set_owner(thePath, owner='hdfs', group='supergroup')
                    elif the_file['type'].lower() == 'file':  # 'directory'
                        a_client.set_permission(to_file2, 777)
class HDFSStorageManager(StorageManager):
    """
    Store and load checkpoints from HDFS.
    """

    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        temp_dir: Optional[str] = None,
    ) -> None:
        super().__init__(temp_dir if temp_dir is not None else tempfile.gettempdir())

        self.hdfs_url = hdfs_url
        self.hdfs_path = hdfs_path
        self.user = user

        self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)

    def post_store_path(self, storage_id: str, storage_dir: str, metadata: StorageMetadata) -> None:
        """post_store_path uploads the checkpoint to HDFS and deletes the original files."""
        try:
            logging.info("Uploading storage {} to HDFS".format(storage_id))
            result = self.client.upload(metadata, storage_dir)

            logging.info("Uploaded storage {} to HDFS path {}".format(storage_id, result))
        finally:
            self._remove_checkpoint_directory(metadata.storage_id)

    @contextlib.contextmanager
    def restore_path(self, metadata: StorageMetadata) -> Iterator[str]:
        logging.info("Downloading storage {} from HDFS".format(metadata.storage_id))

        self.client.download(metadata.storage_id, self._base_path, overwrite=True)

        try:
            yield os.path.join(self._base_path, metadata.storage_id)
        finally:
            self._remove_checkpoint_directory(metadata.storage_id)

    def delete(self, metadata: StorageMetadata) -> None:
        logging.info("Deleting storage {} from HDFS".format(metadata.storage_id))

        self.client.delete(metadata.storage_id, recursive=True)
def run_remove_files(configData: ConfigData):
    f_date_str = configData.get_f_date()  # StrTool.get_the_date_str(the_date, delta_day)  # "20181101"

    data_path = os.path.join(configData.get_data_path(), f_date_str)  # allinpay_data_bl
    utf8_path = os.path.join(configData.get_utf8_path(), f_date_str)  # allinpay_utf8_bl
    hdfs_path = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))  # hdfs_dir_bl

    a_client = InsecureClient(configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"

    shutil.rmtree(data_path, ignore_errors=True)
    shutil.rmtree(utf8_path, ignore_errors=True)

    try:
        a_client.delete(hdfs_path, recursive=True)
    except Exception:
        # ignore failures, e.g. when the HDFS path does not exist
        pass
def __init__(
    self,
    hdfs_url: str,
    hdfs_path: str,
    user: Optional[str] = None,
    temp_dir: Optional[str] = None,
) -> None:
    super().__init__(temp_dir if temp_dir is not None else tempfile.gettempdir())

    self.hdfs_url = hdfs_url
    self.hdfs_path = hdfs_path
    self.user = user

    self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)
def __init__(
    self,
    hdfs_url: str,
    hdfs_path: str,
    user: Optional[str] = None,
    *args: Any,
    **kwargs: Any,
) -> None:
    super().__init__(*args, **kwargs)

    self.hdfs_url = hdfs_url
    self.hdfs_path = hdfs_path
    self.user = user

    self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)
    self.client.makedirs(str(self.sync_path))
def run_hive(configData: ConfigData):
    a_client = InsecureClient(url=configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"
    conn = connect(host=configData.hive_ip(), port=configData.hive_port(),
                   auth_mechanism=configData.hive_auth(), user=configData.hive_user())
    cur = conn.cursor()

    f_date_str = configData.get_f_date()  # "20181101"
    p_date_str = configData.get_p_date()  # "2018-11-01"

    # hdfs_dir_bl
    root_path = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))
    file_name = str(pathlib.PurePosixPath(root_path).joinpath(configData.get_file_name(f_date_str)))
    # "/data/posflow/allinpay_utf8_zc/20181101/"
    # 20181101_loginfo_rsp_bl_new.csv
    # 20181101_rsp_agt_bl_new.del
    # 20181101_rxinfo_rsp_bl.txt

    table_name = configData.get_table_name()

    print("Start\n")

    if MyHdfsFile.isfile(a_client, file_name):
        if not configData.get_has_partition():
            sql = "LOAD DATA INPATH '{}' INTO TABLE {}".format(file_name, table_name)  # 'test.t1_trxrecprd_v2_zc'
            # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
        else:
            sql = "LOAD DATA INPATH '{}' INTO TABLE {} PARTITION ( p_date='{}' )".format(
                file_name, table_name, p_date_str)  # 'test.t1_trxrecprd_v2_zc'
        print("OK" + " " + sql + "\n")
        cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
def run_hive(configData: ConfigData):
    a_client = InsecureClient(url=configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"
    conn = connect(host=configData.hive_ip(), port=configData.hive_port(),
                   auth_mechanism=configData.hive_auth(), user=configData.hive_user())
    cur = conn.cursor()

    f_date_str = configData.get_f_date()  # "20181101"
    p_date_str = configData.get_p_date()  # "2018-11-01"

    root_path = configData.get_hdfs_path()  # "/shouyinbao/bl_shouyinbao/UTF8/"
    file_name = configData.get_file_name(f_date_str)  # "t1_trxrecord_" + the_date + "_V2.csv"
    table_name = configData.get_table_name()

    print("Start\n")

    idn = 0
    branches = MyHdfsFile.get_child(a_client, str(pathlib.PurePosixPath(root_path).joinpath(f_date_str)))
    for aBranch in branches:
        if MyHdfsFile.check_branch(a_client, aBranch):
            files = MyHdfsFile.get_child(a_client, aBranch)
            f_a_branch = MyHdfsFile.get_name(aBranch)
            for aFile in files:
                if MyHdfsFile.check_file(a_client, aFile, file_name):
                    # '/shouyinbao/bl_shouyinbao/UTF8/20181101/9999997900/t1_trxrecord_20181101_V2.csv'
                    to_file2 = str(pathlib.PurePosixPath(root_path).joinpath(f_date_str, f_a_branch, file_name))
                    if not configData.get_has_partition():
                        sql = "LOAD DATA INPATH '{}' INTO TABLE {}".format(to_file2, table_name)  # 'test.t1_trxrecprd_v2_zc'
                        # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
                    else:
                        sql = "LOAD DATA INPATH '{}' INTO TABLE {} PARTITION ( p_date='{}' )".format(
                            to_file2, table_name, p_date_str)  # 'test.t1_trxrecprd_v2_zc'
                    idn += 1
                    print(str(idn) + " " + sql + "\n")
                    cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
def upload_and_delete_file(file_path, upload_path):
    for file_name in os.listdir(file_path):
        try:
            # session = Session()
            # session.auth = HTTPBasicAuth('username', 'password')
            # client = InsecureClient(url=HDFS_HOSTS, user="******", session=session)
            client = InsecureClient(url=HDFS_HOSTS, user=HDFS_USER)
            # client.delete("/data/scan/sm_word_log_message-2018-06-27-20")
            # print(client.list("/data/scan/sm/", status=False))
            path = client.upload(upload_path + file_name, file_path + file_name, cleanup=True)
            if path:
                os.remove(file_path + file_name)
        except Exception as e:
            logger.error("upload_and_delete_file error = %s", str(e))
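A minimal usage sketch for the helper above, assuming its module-level HDFS_HOSTS and HDFS_USER constants; the values and directory names below are placeholders, not taken from the original source.

# Hypothetical configuration mirroring the constants the helper expects.
HDFS_HOSTS = "http://namenode.example.com:50070"
HDFS_USER = "hdfs"

# Upload every file in a local spool directory to HDFS, then remove the local copies.
# Both arguments end with a trailing slash because the helper concatenates paths as strings.
upload_and_delete_file("/var/spool/app-logs/", "/data/scan/incoming/")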
def hdfs_connect(host='localhost', port=50070, protocol='webhdfs',
                 use_https='default', auth_mechanism='NOSASL',
                 verify=True, **kwds):
    """
    Connect to HDFS

    Parameters
    ----------
    host : string, Host name of the HDFS NameNode
    port : int, NameNode's WebHDFS port (default 50070)
    protocol : {'webhdfs'}
    use_https : boolean, default 'default'
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
        authentication, the default for this is True, otherwise False
    auth_mechanism : string, Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify : boolean, Set to False to turn off verifying SSL certificates.
        (default True)

    Other keywords are forwarded to hdfs library classes

    Returns
    -------
    client : WebHDFS
    """
    import requests
    session = kwds.setdefault('session', requests.Session())
    session.verify = verify
    if auth_mechanism in ['GSSAPI', 'LDAP']:
        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'
        try:
            import requests_kerberos
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`.")
        from hdfs.ext.kerberos import KerberosClient
        # note SSL
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        hdfs_client = InsecureClient(url, **kwds)
    return WebHDFS(hdfs_client)
def initialize_hdfs_client(url):
    global client
    if not client:
        session = Session()
        session.verify = False
        if kerberos['enabled']:
            client = KerberosClient(url, session=session)
        else:
            client = InsecureClient(url, user=hdfs['user'], session=session)
def get_pd_DF(cli: InsecureClient, file_path, header):
    """
    Read a CSV file from HDFS and return it as a pandas DataFrame.

    :param cli: the hdfs InsecureClient
    :param file_path: path of the file on HDFS, relative to the root configured on the InsecureClient
    :param header: row number to use as the column names, forwarded to pandas.read_csv
    :return: pandas DataFrame
    """
    with cli.read(file_path) as reader:
        df_pd = pd.read_csv(reader, header=header)
    return df_pd
def hdfs_client(self):
    url = 'http://{nn_host}:{webhdfs_port}'.format(
        nn_host=self._nn_host, webhdfs_port=self._webhdfs_port)
    if self._kerberized:
        from hdfs.ext.kerberos import KerberosClient
        client = KerberosClient(url, mutual_auth='REQUIRED')
    else:
        from hdfs.client import InsecureClient
        client = InsecureClient(url, user=self._hdfs_user)
    return client
def save_pd_DF(df_pd: pd.DataFrame, cli: InsecureClient, file_path):
    """
    Write a pandas DataFrame to a CSV file on HDFS.

    :param df_pd: the pandas DataFrame
    :param cli: the hdfs InsecureClient
    :param file_path: path of the file on HDFS, relative to the root configured on the InsecureClient
    """
    with cli.write(hdfs_path=file_path, encoding='utf-8', overwrite=True) as writer:
        df_pd.to_csv(writer)
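A short round-trip sketch using the two pandas helpers above; the NameNode URL, HDFS root, file names and column names are assumptions for illustration only.

import pandas as pd
from hdfs import InsecureClient

# Placeholder cluster details; paths below are relative to root="/data".
cli = InsecureClient("http://namenode.example.com:50070", root="/data", user="hdfs")

df = get_pd_DF(cli, "raw/sales.csv", header=0)        # reads /data/raw/sales.csv
df["total"] = df["price"] * df["quantity"]            # hypothetical columns
save_pd_DF(df, cli, "clean/sales_with_total.csv")     # overwrites /data/clean/sales_with_total.csv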
def __init__(self, host, port=50070, use_https=False, secure=False, **kwds):
    import requests
    session = kwds.setdefault('session', requests.Session())
    session.verify = True
    prefix = 'https' if use_https else 'http'
    url = '{0}://{1}:{2}'.format(prefix, host, port)
    if secure:
        import requests_kerberos
        from hdfs.ext.kerberos import KerberosClient
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        self.client = KerberosClient(url, **kwds)
    else:
        from hdfs.client import InsecureClient
        self.client = InsecureClient(url, **kwds)
def save_driver_data(spark, broadcast_variable, train_x, train_y, valid_x, valid_y, save_method):
    if broadcast_variable:
        train_x = spark.sparkContext.broadcast(train_x) if train_x is not None else train_x
        train_y = spark.sparkContext.broadcast(train_y) if train_y is not None else train_y
        valid_x = spark.sparkContext.broadcast(valid_x) if valid_x is not None else valid_x
        valid_y = spark.sparkContext.broadcast(valid_y) if valid_y is not None else valid_y
        hdfs_path_dic = None
    else:
        hdfs = InsecureClient(WEB_HDFS_URL, user=WEB_USER)
        hdfs_path_dic = save_train_valid_to_hdfs(hdfs=hdfs, method=save_method,
                                                 tmp_hdfs_path=TEMP_HDFS_PATH,
                                                 train_x=train_x, train_y=train_y,
                                                 valid_x=valid_x, valid_y=valid_y)
        train_x, train_y, valid_x, valid_y = (None,) * 4
    return train_x, train_y, valid_x, valid_y, hdfs_path_dic
def load_driver_data(hdfs_path, train_x, train_y, valid_x, valid_y, save_method):
    if hdfs_path is None:
        from pyspark.broadcast import Broadcast
        train_x = train_x.value if isinstance(train_x, Broadcast) else train_x
        train_y = train_y.value if isinstance(train_y, Broadcast) else train_y
        valid_x = valid_x.value if isinstance(valid_x, Broadcast) else valid_x
        valid_y = valid_y.value if isinstance(valid_y, Broadcast) else valid_y
    else:
        hdfs = InsecureClient(WEB_HDFS_URL, user=WEB_USER)
        data_dic = load_train_valid_to_local(hdfs=hdfs, method=save_method, ret_dic=hdfs_path)
        train_x, train_y, valid_x, valid_y = (data_dic["train_x"], data_dic["train_y"],
                                              data_dic["valid_x"], data_dic["valid_y"])
        del data_dic
    return train_x, train_y, valid_x, valid_y
def run_conv_file_local_to_hdfs(configData: ConfigData):
    """
    :param configData:
    :return:
    """
    f_date_str = configData.get_f_date()  # "20181101"

    a_client = InsecureClient(configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"

    root_path = os.path.join(configData.get_data_path(), f_date_str)  # allinpay_data_bl
    dest_dir1 = os.path.join(configData.get_utf8_path(), f_date_str)  # allinpay_utf8_bl
    dest_dir2 = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))  # hdfs_dir_bl

    # file_ext7 = configData.get_data("file_ext7")  # _loginfo_rsp_bl_new.csv  # 20181101_loginfo_rsp_bl_new.csv
    # file_ext8 = configData.get_data("file_ext8")  # _rsp_agt_bl_new.del      # 20181101_rsp_agt_bl_new.del
    # file_ext9 = configData.get_data("file_ext9")  # _rxinfo_rsp_bl.txt       # 20181101_rxinfo_rsp_bl.txt
    # f_list = [file_ext7, file_ext8, file_ext9]

    print("Start\n")

    # "file_ext" + str(configData.the_id)
    file_name = configData.get_file_name(f_date_str).lower()

    files = MyLocalFile.get_child_file(root_path)
    for aFile in files:
        short_name = os.path.basename(aFile).lower()
        f_name = pathlib.PurePath(aFile).name
        if short_name == file_name:
            to_file1 = str(pathlib.PurePath(dest_dir1).joinpath(f_name))
            to_file2 = str(pathlib.PurePosixPath(dest_dir2).joinpath(f_name))
            f_add_date = configData.get_hive_add_date(f_date_str)
            f_need_head = not configData.get_hive_head()
            MyLocalFile.conv_file_local(aFile, to_file1, need_first_line=f_need_head, p_add_head=f_add_date)
            MyHdfsFile.safe_make_dir(a_client, to_file2)
            # a_client.newupload(to_file2, to_file1, encoding='utf-8')
            the_file = a_client.status(to_file2, strict=False)
            if the_file is None:
                a_client.upload(to_file2, to_file1)
                a_client.set_permission(to_file2, 777)
                # a_client.set_owner(thePath, owner='hdfs', group='supergroup')
            elif the_file['type'].lower() == 'file':  # 'directory'
                a_client.set_permission(to_file2, 777)
def connect_test(env, with_hdfs=True):
    con = ibis.impala_connect(host=env.impala_host,
                              protocol=env.impala_protocol,
                              database=env.test_data_db,
                              port=env.impala_port,
                              use_kerberos=env.use_kerberos,
                              pool_size=2)
    if with_hdfs:
        if env.use_kerberos:
            from hdfs.ext.kerberos import KerberosClient
            hdfs_client = KerberosClient(env.hdfs_url, mutual_auth='REQUIRED')
        else:
            from hdfs.client import InsecureClient
            hdfs_client = InsecureClient(env.hdfs_url)
        return ibis.make_client(con, hdfs_client)
    else:
        return ibis.make_client(con)
def hdfs_connect(host='localhost', port=50070, protocol='webhdfs',
                 use_kerberos=False, verify=True, **kwds):
    """
    Connect to HDFS

    Parameters
    ----------
    host : string
    port : int, default 50070 (webhdfs default)
    protocol : {'webhdfs'}
    use_kerberos : boolean, default False
    verify : boolean, default True
        Set to False to turn off verifying SSL certificates

    Other keywords are forwarded to hdfs library classes

    Returns
    -------
    client : ibis HDFS client
    """
    if use_kerberos:
        try:
            import requests_kerberos
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`.")
        from hdfs.ext.kerberos import KerberosClient
        url = 'https://{0}:{1}'.format(host, port)  # note SSL
        hdfs_client = KerberosClient(url, mutual_auth='OPTIONAL', verify=verify, **kwds)
    else:
        from hdfs.client import InsecureClient
        url = 'http://{0}:{1}'.format(host, port)
        hdfs_client = InsecureClient(url, verify=verify, **kwds)
    return WebHDFS(hdfs_client)
def save_to_hdfs(self, key, url, title, content):
    current_date = datetime.datetime.now().strftime("%Y%m%d")
    hdfs_path = hdfs_dir + current_date

    data = "\n" + key + "\n" + url + "\n"
    if title is not None and title != '':
        data = data + title + "\n"
    if content is not None and content != '':
        data = data + content + "\n"

    client = InsecureClient(hdfs_web, user=hdfs_user)
    try:
        # Append to the day's file if it already exists.
        client.write(hdfs_path=hdfs_path, data=data, append=True, encoding='utf-8')
    except HdfsError:
        # First write of the day: create the file instead of appending.
        client.write(hdfs_path=hdfs_path, data=data, encoding='utf-8')
def __init__(self, url, root=None, user=None, proxy=None, timeout=None, session=None):
    """
    Connect to HDFS.

    url: hostname or IP address of the HDFS NameNode, including the port
    root: root path, prepended to all HDFS paths passed to the client
    user: if given, an InsecureClient (base Client) is created for that user;
          otherwise a plain Client is used with the default user dr.who
    proxy: user to proxy as
    timeout: connection timeout, forwarded to the request handler
    session: a requests.Session instance used to issue all requests
    """
    if user:
        self.client = InsecureClient(url, user=user)
    else:
        self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=session)
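A brief usage sketch, assuming the __init__ above belongs to a thin wrapper class (named HdfsWrapper here purely for illustration) and that a NameNode is reachable at the placeholder URL.

# Hypothetical class name, URL and user.
wrapper = HdfsWrapper("http://namenode.example.com:50070", user="etl")

print(wrapper.client.list("/", status=False))   # list the HDFS root directory
wrapper.client.makedirs("/tmp/etl_staging")     # create a working directory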
def run_conv_file_hdfs(configData: ConfigData):
    f_date_str = configData.get_f_date()  # "20181101"

    client = InsecureClient(configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"

    root_path = configData.get_data_path()  # 'D:/DATA/UNZIP/'
    dest_dir = configData.get_hdfs_path()
    f_name = configData.get_file_name(f_date_str)  # "t1_trxrecord_" + the_date + "_V2.csv"

    print("Start\n")

    branches = MyLocalFile.get_child(os.path.join(root_path, f_date_str))
    for aBranch in branches:
        if MyLocalFile.check_branch(aBranch):
            files = MyLocalFile.get_child(aBranch)
            for aFile in files:
                if MyLocalFile.check_file(aFile, f_name):
                    MyHdfsFile.conv_file_hdfs(
                        aFile,
                        os.path.join(dest_dir, f_date_str, os.path.basename(aBranch), f_name),
                        client)
class HDFSTensorboardManager(base.TensorboardManager):
    """
    Store tfevents files to HDFS.
    """

    @util.preserve_random_state
    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)

        self.hdfs_url = hdfs_url
        self.hdfs_path = hdfs_path
        self.user = user

        self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)
        self.client.makedirs(str(self.sync_path))

    @util.preserve_random_state
    def sync(
        self,
        selector: Callable[[pathlib.Path], bool] = lambda _: True,
        mangler: Callable[[pathlib.Path, int], pathlib.Path] = lambda p, __: p,
        rank: int = 0,
    ) -> None:
        for path in self.to_sync(selector):
            relative_path = path.relative_to(self.base_path)
            mangled_relative_path = mangler(relative_path, rank)
            mangled_path = self.sync_path.joinpath(mangled_relative_path)
            file_name = str(mangled_path)

            logging.debug(f"Uploading {path} to {self.hdfs_path}")

            self.client.upload(file_name, str(path))

    def delete(self) -> None:
        self.client.delete(self.sync_path, recursive=True)
class HDFSStorageManager(storage.CloudStorageManager):
    """
    Store and load checkpoints from HDFS.
    """

    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        temp_dir: Optional[str] = None,
    ) -> None:
        super().__init__(temp_dir if temp_dir is not None else tempfile.gettempdir())

        self.hdfs_url = hdfs_url
        self.hdfs_path = hdfs_path
        self.user = user

        self.client = InsecureClient(self.hdfs_url, root=self.hdfs_path, user=self.user)

    @util.preserve_random_state
    def upload(self, src: Union[str, os.PathLike], dst: str) -> None:
        src = os.fspath(src)
        logging.info(f"Uploading to HDFS: {dst}")
        self.client.upload(dst, src)

    @util.preserve_random_state
    def download(self, src: str, dst: Union[str, os.PathLike]) -> None:
        dst = os.fspath(dst)
        logging.info(f"Downloading {src} from HDFS")
        self.client.download(src, dst, overwrite=True)

    @util.preserve_random_state
    def delete(self, tgt: str) -> None:
        logging.info(f"Deleting {tgt} from HDFS")
        self.client.delete(tgt, recursive=True)
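A usage sketch for the storage manager above, exercising only the methods defined in the snippet; the NameNode URL, HDFS root and checkpoint names are placeholders.

# Hypothetical cluster and checkpoint layout; HDFS paths are relative to the root passed here.
manager = HDFSStorageManager("http://namenode.example.com:50070", "/determined/checkpoints")

manager.upload("./checkpoints/epoch-3", "epoch-3")   # push a local checkpoint directory to HDFS
manager.download("epoch-3", "./restore/epoch-3")     # pull it back down for restoring
manager.delete("epoch-3")                            # remove it from HDFS when no longer needed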
def hdfs_connect(host='localhost', port=50070, protocol='webhdfs',
                 use_https='default', auth_mechanism='NOSASL',
                 verify=True, session=None, **kwds):
    """Connect to HDFS.

    Parameters
    ----------
    host : str
        Host name of the HDFS NameNode
    port : int
        NameNode's WebHDFS port
    protocol : str
        The protocol used to communicate with HDFS. The only valid value is
        ``'webhdfs'``.
    use_https : bool
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
        authentication, the default for this is True, otherwise False.
    auth_mechanism : str
        Set to NOSASL or PLAIN for non-secure clusters. Set to GSSAPI or LDAP
        for Kerberos-secured clusters.
    verify : bool
        Set to :data:`False` to turn off verifying SSL certificates.
    session : Optional[requests.Session]
        A custom :class:`requests.Session` object.

    Notes
    -----
    Other keywords are forwarded to HDFS library classes.

    Returns
    -------
    WebHDFS
    """
    import requests

    if session is None:
        session = requests.Session()
    session.verify = verify
    if auth_mechanism in ('GSSAPI', 'LDAP'):
        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'
        try:
            import requests_kerberos  # noqa: F401
        except ImportError:
            raise IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`.")
        from hdfs.ext.kerberos import KerberosClient

        # note SSL
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, session=session, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient

        url = '{}://{}:{}'.format(prefix, host, port)
        hdfs_client = InsecureClient(url, session=session, **kwds)
    return WebHDFS(hdfs_client)
def hdfs_connect(
    host: str = 'localhost',
    port: int = 50070,
    protocol: Literal['webhdfs'] = 'webhdfs',
    use_https: str = 'default',
    auth_mechanism: str = 'NOSASL',
    verify: bool = True,
    session: Any = None,
    **kwds: Any,
) -> WebHDFS:
    """Connect to HDFS.

    Parameters
    ----------
    host
        Host name of the HDFS NameNode
    port
        NameNode's WebHDFS port
    protocol
        The protocol used to communicate with HDFS. The only valid value is
        ``'webhdfs'``.
    use_https
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
        authentication, the default for this is True, otherwise False.
    auth_mechanism
        Set to NOSASL or PLAIN for non-secure clusters. Set to GSSAPI or LDAP
        for Kerberos-secured clusters.
    verify
        Set to `False` to turn off verifying SSL certificates.
    session
        A custom `requests.Session` object.

    Returns
    -------
    WebHDFS
        WebHDFS client
    """
    import requests

    if session is None:
        session = requests.Session()
    session.verify = verify
    if auth_mechanism in ('GSSAPI', 'LDAP'):
        from hdfs.ext.kerberos import KerberosClient

        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'

        # note SSL
        url = f'{prefix}://{host}:{port}'
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, session=session, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient

        url = f'{prefix}://{host}:{port}'
        hdfs_client = InsecureClient(url, session=session, **kwds)
    return WebHDFS(hdfs_client)
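Two hypothetical calls to the hdfs_connect variant above, one against a non-secure cluster and one against a Kerberos-secured cluster; the host names, ports and the user keyword (forwarded to the underlying hdfs client) are placeholders.

# Non-secure cluster: extra keywords such as user are passed through to InsecureClient.
hdfs = hdfs_connect(host='namenode.example.com', port=50070, user='analyst')

# Kerberos-secured cluster: GSSAPI selects KerberosClient and an https URL
# (requires the requests-kerberos extra to be installed).
secure_hdfs = hdfs_connect(host='secure-nn.example.com', port=50470,
                           auth_mechanism='GSSAPI', use_https='default')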
from hdfs.client import Client, InsecureClient
import redis
import json
import time
import pymysql

client = InsecureClient("http://192.168.1.176:50070", user='******')
r = redis.StrictRedis(host='192.168.1.176', port=6379, decode_responses=True)


def connect_to_db(DB_NAME):
    # Database connection settings
    host = 'localhost'
    port = 3306
    user = '******'
    passwd = 'recipe'
    db = DB_NAME
    charset = 'utf8mb4'
    # Open the connection
    conn = pymysql.connect(host=host, port=port, user=user, passwd=passwd, db=db, charset=charset)
    print('Successfully connected to DB : {} !'.format(DB_NAME))
    return conn


def main():
    db = 'recipe'
    conn = connect_to_db(db)
    cursor = conn.cursor()

    # Fetch synonyms from the MySQL synonym table and store them in Redis
    # (hash format: synonym (key) - {term (sub key): defined ingredient (value)})
    sql = """
        # skip the first argument, which is FunctionContext*
        arg_types = tuple([
            llvm2impala[arg.pointee.name]
            for arg in function.type.pointee.args[1:]
        ])
        functions.append((symbol, arg_types))
    except (AttributeError, KeyError):
        # this process could fail for non-UDF helper functions...just ignore
        # them, because we're not going to be registering them anyway
        log("Had trouble with function %s; moving on..." % symbol)
        pass

# transfer the LLVM module to HDFS
url = 'http://{nn_host}:{webhdfs_port}'.format(nn_host=args.nn_host,
                                               webhdfs_port=args.webhdfs_port)
hdfs_client = InsecureClient(url, user=args.user)
hdfs_client.write(args.hdfs_path, bc, overwrite=args.force)
log("Transferred LLVM IR to HDFS at %s" % args.hdfs_path)

# register the functions with impala
conn = impala.dbapi.connect(host=args.impala_host, port=args.impala_port)
cursor = conn.cursor(user=args.user)
log("Connected to impalad: %s" % args.impala_host)
if args.db:
    cursor.execute('USE %s' % args.db)
cursor.execute("SHOW FUNCTIONS")
registered_functions = cursor.fetchall()
for (udf_name, return_type) in zip(args.name, args.return_type):
    log("Registering function %s" % udf_name)
    # find matching LLVM symbols to the current UDF name
    matches = [pair for pair in functions if udf_name in pair[0]]
from hdfs.client import Client, InsecureClient
import redis
import json

# r = redis.StrictRedis(host='192.168.1.176', port=6379, decode_responses=True)
client = InsecureClient("http://192.168.1.176:50070", user='******')  # paths do not need the hdfs:// prefix
# client.list("/")  ->  ['recipe', 'tmp', 'user']

with client.read("/recipe/recipe1018_V8.json") as reader:
    data = json.load(reader)

print(data[:10])
        symbol = function.name
        log("Loading types for function %s" % symbol)
        # skip the first argument, which is FunctionContext*
        arg_types = tuple([llvm2impala[arg.pointee.name]
                           for arg in function.type.pointee.args[1:]])
        functions.append((symbol, arg_types))
    except (AttributeError, KeyError):
        # this process could fail for non-UDF helper functions...just ignore
        # them, because we're not going to be registering them anyway
        log("Had trouble with function %s; moving on..." % symbol)
        pass

# transfer the LLVM module to HDFS
url = 'http://{nn_host}:{webhdfs_port}'.format(
    nn_host=args.nn_host, webhdfs_port=args.webhdfs_port)
hdfs_client = InsecureClient(url, user=args.user)
hdfs_client.write(args.hdfs_path, bc, overwrite=args.force)
log("Transferred LLVM IR to HDFS at %s" % args.hdfs_path)

# register the functions with impala
conn = impala.dbapi.connect(host=args.impala_host, port=args.impala_port)
cursor = conn.cursor(user=args.user)
log("Connected to impalad: %s" % args.impala_host)
if args.db:
    cursor.execute('USE %s' % args.db)
cursor.execute("SHOW FUNCTIONS")
registered_functions = cursor.fetchall()
for (udf_name, return_type) in zip(args.name, args.return_type):
    log("Registering function %s" % udf_name)
    # find matching LLVM symbols to the current UDF name
    matches = [pair for pair in functions if udf_name in pair[0]]