def test_workflow(hdfs_cluster): w = WebHDFS(hdfs_cluster, user="******", data_proxy={"worker.example.com": "localhost"}) fn = "/user/testuser/testrun/afile" w.mkdir("/user/testuser/testrun") with w.open(fn, "wb") as f: f.write(b"hello") assert w.exists(fn) info = w.info(fn) assert info["size"] == 5 assert w.isfile(fn) assert w.cat(fn) == b"hello" w.rm("/user/testuser/testrun", recursive=True) assert not w.exists(fn)
def test_workflow_transaction(hdfs_cluster): w = WebHDFS(hdfs_cluster, user="******", data_proxy={"worker.example.com": "localhost"}) fn = "/user/testuser/testrun/afile" w.mkdirs("/user/testuser/testrun") with w.transaction: with w.open(fn, "wb") as f: f.write(b"hello") assert not w.exists(fn) assert w.exists(fn) assert w.ukey(fn) files = w.ls("/user/testuser/testrun", True) summ = w.content_summary("/user/testuser/testrun") assert summ["length"] == files[0]["size"] assert summ["fileCount"] == 1 w.rm("/user/testuser/testrun", recursive=True) assert not w.exists(fn)

# Module-level imports and constants assumed by the class below; the original
# module defines these near the top of the file.
import datetime
import ntpath
import os
import pickle
import secrets
import shutil
import urllib.request
from functools import lru_cache

import cryptpandas as crpd
import pandas as pd

IS_WINDOWS = os.name == 'nt'   # assumed definition, set elsewhere in the original module
_default_permissions = '777'   # '777' is the documented default in the docstrings below


class DISC:
    from _PATHS import _IMPALA_HOST, _HIVE_HOST, _HTTPFS_HOST, _HDFS_PATH, USER_GUIDE_URL

    __TEMP_LOCAL_DIR = os.path.join(os.path.dirname(__file__), '._temp_connectors')
    __PEM_PATH = os.path.join(os.path.dirname(__file__), 'certificates/accprd-truststore.pem')

    def __init__(self):
        try:
            # This works on jupyter/ipython
            self._is_jupyter = bool(get_ipython().config)
            self._is_ipython = True
        except NameError:
            # On plain python get_ipython is not defined
            self._is_jupyter = self._is_ipython = False
        self.open()
        self.__log()
        self.spark = None
        self._spark_uri = None

    def open(self, hive=False):
        """Opens the DISC connection, selecting the backend automatically
        according to the platform (local Windows or CDSW)."""
        from fsspec.implementations.webhdfs import WebHDFS
        os.environ['REQUESTS_CA_BUNDLE'] = self.__PEM_PATH
        self._hdfs_cnxn = WebHDFS(self._HTTPFS_HOST, port=14000, kerberos=True,
                                  use_https=True, use_ssl=True,
                                  use_listings_cache=False)
        self._engine = 'hive' if hive else 'impala'
        if IS_WINDOWS:  # LOCAL - Windows
            from pyodbc import connect
            self._cnxn = connect('DSN=DISC DP Impala 64bit' if not hive else 'DSN=DISC DP Hive 64bit',
                                 autocommit=True)
        else:  # CDSW (os.name == 'posix')
            from impala.dbapi import connect
            self._cnxn = connect(host=self._HIVE_HOST if hive else self._IMPALA_HOST,
                                 use_ssl=True, timeout=30,
                                 kerberos_service_name=self._engine,
                                 port=10000 if hive else 21050,
                                 auth_mechanism='GSSAPI')  # one of 'NOSASL', 'PLAIN', 'GSSAPI', 'LDAP'
        self._cursor = self._cnxn.cursor()
        if not os.path.exists(self.__TEMP_LOCAL_DIR):
            os.mkdir(self.__TEMP_LOCAL_DIR)
        self._is_disc_connected = True
        self.db = None

    def connect_spark(self, app_name=None, master=None, config=dict(), return_SparkSession=False):
        """Connects to spark via pyspark.

        Stores the spark session in the attribute `disc.spark`. Access to the
        Spark UI is provided via the link `disc.spark_ui` (the address is also
        available as `disc._spark_uri`).

        Args:
            app_name (str or None): A name for the current session.
            master (str or None): Either `yarn` (default) or `local`.
            config (dict): Dictionary with spark configurations.
            return_SparkSession (bool): Whether to return the spark session (default is False).

        Returns:
            SparkSession if `return_SparkSession`, else the Spark UI link.
        """
        self._spark_uri = f"https://spark-{os.environ['CDSW_ENGINE_ID']}.{os.environ['CDSW_DOMAIN']}/jobs/"
        from pyspark.sql import SparkSession
        spark = SparkSession.builder.appName(app_name or 'SparkSession')
        if master is not None:
            spark = spark.master(master)
        for k, v in config.items():
            spark = spark.config(k, v)
        spark = spark.getOrCreate()
        self.spark = spark
        if return_SparkSession:
            return spark
        else:
            return self.spark_ui

    @property
    def spark_ui(self):
        if self._is_spark_connected:
            from IPython.core.display import HTML
            return HTML(f'<a href="{self._spark_uri}">Go to Spark UI</a>')
        else:
            return 'Spark is not connected. Connect with `disc.connect_spark()`.'

    def stop_spark(self):
        if self._is_spark_connected:
            self.spark.sparkContext.stop()
            self.spark.stop()

    @property
    def _is_spark_connected(self):
        try:
            urllib.request.urlopen(self._spark_uri).getcode()
            return True
        except Exception:
            return False

    @lru_cache()
    def __get_databases(self):
        self._cursor.execute("show databases")
        return [*zip(*self._cursor.fetchall())][0]

    def get_databases(self, ret=False):
        """Prints or, if `ret` is True, returns the available databases."""
        databases = self.__get_databases()
        if ret:
            return databases
        for db in databases:
            print(db)

    def select_database(self, database):
        """Navigates to the given database.
        Args:
            database (str): the database to be selected
        """
        self._cursor = self._cnxn.cursor()
        self._cursor.execute(f"use {database}")
        self.db = database

    def get_tables(self, database=None, ret=False):
        """Prints or, if `ret` is True, returns the tables within the current database."""
        if database is not None:
            self.select_database(database)
        self._cursor.execute("show tables")
        tables, = zip(*self._cursor.fetchall())
        if ret:
            return tables
        for table in tables:
            print(table)

    def describe_table(self, table):
        """Describes a table and returns a DataFrame with the description."""
        self._cursor.execute(f"describe {table}")
        columns, dtype, desc = [*zip(*self._cursor.fetchall())]
        df = pd.DataFrame({'columns': columns, 'dtype': dtype, 'desc': desc})
        return df

    def _delete_table(self, lab, table_name):
        self._cursor.execute(f"DROP TABLE IF EXISTS {lab}.{table_name}")

    def _create_table(self, lab, table_name, dtypes, path, cols):
        col_and_types = ', '.join(f'{col} {tp}' for col, tp in zip(cols, dtypes))
        self._cursor.execute(f"""
            CREATE EXTERNAL TABLE IF NOT EXISTS {lab}.{table_name}({col_and_types})
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS TEXTFILE
            LOCATION '{path}'
            """)
        # TODO: add APPEND TO TABLE option

    def _refresh_table(self, lab, table_name):
        self._cursor.execute(f'REFRESH {lab}.{table_name}')

    def create_table_csv(self, df, lab, table_name, path, dtypes, cols=None):
        """Wrapper method: deletes the previous table, transfers the new csv
        file, creates the new table, and refreshes it.

        Note: dtypes must be one of: ARRAY, BIGINT, BINARY, BOOLEAN, CHAR,
        DATE, DATETIME, DECIMAL, REAL, FLOAT, INTEGER, MAP, SMALLINT, STRING,
        STRUCT, TIMESTAMP, TINYINT, VARCHAR

        Args:
            df: A pandas.DataFrame.
            lab: Name of the DataLab where to create the table.
            table_name: Name of the table to be created.
            path: Where to store the underlying data.
            dtypes: SQL data types for each column.
            cols: List of strings with column names. Defaults to df.columns.values.
        """
        self._delete_table(lab=lab, table_name=table_name)
        self.to_csv(df, f'{path}/{table_name}.csv', index=False, header=False)
        self._create_table(lab=lab, table_name=table_name,
                           cols=cols or df.columns.values,
                           dtypes=dtypes, path=path)
        self._refresh_table(lab=lab, table_name=table_name)  # refresh so the engine sees the new data

    def create_table(self, df, lab, table_name, path, external=True,
                     permissions=_default_permissions):
        """Create a table stored as a parquet file.

        TODO: integrate with the method above, allowing the user to choose;
        allow appending rows to an existing table instead of deleting the old
        one; check whether another table is already present.

        Args:
            df: A pandas.DataFrame or pyspark.sql.dataframe.DataFrame.
            lab: Name of the DataLab where to create the table.
            table_name: Name of the table to be created.
            path: Where to store the underlying data.
            external: EXTERNAL table if `True`.
            permissions (str or None): posix representation of permissions,
                given as an oct string, e.g. '777' (default) or 0o777.
        """
        self._delete_table(lab=lab, table_name=table_name)
        if not isinstance(df, pd.DataFrame):  # Spark dataframe
            path += f'/{table_name}'
            df.write.parquet(path)
            self._hdfs_cnxn.chmod(path, permissions)
            file_path = [f for f in self.ls(path) if f[-8:] == '.parquet'][0]
        else:  # pandas dataframe
            df = df.reset_index(drop=True)
            file_path = f'{path}/{table_name}.parq'
            df.columns = df.columns.str.replace(':', '').str.replace(' ', '')
            self.to_parquet(df, file_path, permissions=permissions)
        query = (f"""CREATE {'EXTERNAL' if external else ''} TABLE {lab}.{table_name}
                     LIKE PARQUET '{file_path}'
                     STORED AS PARQUET
                     {f"LOCATION '{path}'" if external else ''};""")
        self._cursor.execute(query)

    def _fix_path(self, path):
        """Adds the hdfs root to a path."""
        if path[:len(self._HDFS_PATH)] != self._HDFS_PATH:
            path = self._HDFS_PATH + path
        return path

    def read_csv(self, path, **kwargs):
        """Wrapper around pandas.read_csv.

        Args:
            path (str): Path to DISC location
            kwargs: Keyword arguments to be passed to pandas.read_csv
        """
        with self._hdfs_cnxn.open(path) as f:
            df = pd.read_csv(f, **kwargs)
        return df

    def to_csv(self, df, path, name=None, permissions=_default_permissions, **kwargs):
        """Save a dataframe to DISC `path` in csv format.

        Args:
            df (pandas.DataFrame)
            path (str): Path to DISC location
            permissions (str or None): posix representation of permissions,
                given as an oct string, e.g. '777' (default) or 0o777.
            kwargs: Keyword arguments to be passed to pandas.to_csv
        """
        if name is None:
            name = ntpath.basename(path)
            path = ntpath.dirname(path)
        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}{name}'
        df.to_csv(local_file, **kwargs)
        self.upload_file(local_file=local_file,
                         destination_file_path=f'{path}/{name}',
                         rm_local=True, permissions=permissions)

    def read_excel(self, path, **kwargs):
        """Wrapper around pandas.read_excel."""
        with self._hdfs_cnxn.open(path) as f:
            df = pd.read_excel(f, **kwargs)
        return df

    def ls(self, path):
        """Wrapper around `self._hdfs_cnxn.ls`."""
        return self._hdfs_cnxn.ls(path)

    def listdir(self, path, full_path=False):
        """Wrapper around `self.ls`. The parameter `full_path` (False by
        default) hides the root of the paths, showing only the content of the
        selected path."""
        paths = self.ls(path)
        if not full_path:
            paths = [p.split('/')[-1] for p in paths]
        return paths

    def makedir(self, destination_path):
        """Wrapper around the filesystem's `mkdir`."""
        self._hdfs_cnxn.mkdir(destination_path)  # fsspec filesystems expose mkdir, not makedir

    def read_parquet(self, path, **kwargs):
        """Wrapper around pandas.read_parquet.

        Args:
            path (str): Path to DISC location
            kwargs: Keyword arguments to be passed to pandas.read_parquet
        """
        df = pd.read_parquet(path, filesystem=self._hdfs_cnxn, **kwargs)
        return df

    def to_parquet(self, df, path, permissions=_default_permissions, **kwargs):
        """Wrapper around pandas.DataFrame.to_parquet.

        Args:
            df (pandas.DataFrame)
            path (str): Path to DISC location
            permissions (str or None): posix representation of permissions,
                given as an oct string, e.g. '777' (default) or 0o777.
            kwargs: Keyword arguments to be passed to pandas.to_parquet
        """
        df.columns = df.columns.astype(str)
        df.to_parquet(path, filesystem=self._hdfs_cnxn, **kwargs)
        if permissions:
            self._hdfs_cnxn.chmod(path, permissions)

    def to_feather(self, df, path, permissions=_default_permissions, **kwargs):
        """Wrapper around pandas.DataFrame.to_feather.
        Args:
            df (pandas.DataFrame)
            path (str): Path to DISC location
            permissions (str or None): posix representation of permissions,
                given as an oct string, e.g. '777' (default) or 0o777.
            kwargs: Keyword arguments to be passed to pandas.to_feather
        """
        name, path = ntpath.basename(path), ntpath.dirname(path)
        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}{name}'
        df.to_feather(local_file, **kwargs)
        self.upload_file(local_file, f'{path}/{name}', rm_local=True, permissions=permissions)

    def read_feather(self, path, **kwargs):
        """Wrapper around pandas.read_feather.

        Args:
            path (str): Path to DISC location
            kwargs: Keyword arguments to be passed to pandas.read_feather
        """
        with self._hdfs_cnxn.open(path) as f:
            df = pd.read_feather(f, **kwargs)
        return df

    def to_stata(self, df, path, permissions=_default_permissions, **kwargs):
        """Wrapper around pandas.DataFrame.to_stata.

        Args:
            df (pandas.DataFrame)
            path (str): Path to DISC location
            permissions (str or None): posix representation of permissions,
                given as an oct string, e.g. '777' (default) or 0o777.
            kwargs: Keyword arguments to be passed to pandas.to_stata
        """
        name, path = ntpath.basename(path), ntpath.dirname(path)
        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}{name}'
        df.to_stata(local_file, **kwargs)
        self.upload_file(local_file, f'{path}/{name}', rm_local=True, permissions=permissions)

    def read_stata(self, path, **kwargs):
        """Wrapper around pandas.read_stata.

        Args:
            path (str): Path to DISC location
            kwargs: Keyword arguments to be passed to pandas.read_stata
        """
        with self._hdfs_cnxn.open(path) as f:
            df = pd.read_stata(f, **kwargs)
        return df

    def read_encrypted(self, path, password, **kwargs):
        """Wrapper around cryptpandas.read_encrypted.

        Args:
            path (str): Path to DISC location
            password (str): Password for decryption
            kwargs: Keyword arguments to be passed to cryptpandas.read_encrypted
        """
        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/encrypted_{token}'
        self._hdfs_cnxn.download(path, local_file)
        df = crpd.read_encrypted(local_file, password, **kwargs)
        return df

    def to_encrypted(self, df, path, password, permissions=_default_permissions, **kwargs):
        """Write a DataFrame as an encrypted binary at the specified DISC location.

        Args:
            df: A pandas DataFrame
            path: Path to DISC location
            password: Password for encryption
            permissions (str or None): posix representation of permissions,
                given as an oct string, e.g. '777' (default) or 0o777.
            kwargs: Keyword arguments to be passed to cryptpandas.to_encrypted
        """
        # Write the encrypted file locally, then upload it
        token = secrets.token_hex(nbytes=8)
        name, root = ntpath.basename(path), ntpath.dirname(path)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}_{name}'
        crpd.to_encrypted(df, password=password, path=local_file, **kwargs)
        destination_file_path = f'{root}/{name}'
        self.upload_file(local_file, destination_file_path, rm_local=False,
                         overwrite=True, permissions=permissions)

    def to_pickle(self, obj, path, protocol='HIGHEST_PROTOCOL',
                  permissions=_default_permissions, **kwargs):
        """Saves an object to remote HDFS as a pickle file.
        Args:
            obj: The object to be pickled.
            path (str): the path of the file to be saved
            protocol (str or int): Either a string ('HIGHEST_PROTOCOL' or
                'DEFAULT_PROTOCOL') or an integer
            permissions (str or None): posix representation of permissions,
                given as an oct string, e.g. '777' (default) or 0o777.
        """
        if isinstance(protocol, str):
            protocol = getattr(pickle, protocol)
        with self._hdfs_cnxn.open(path, "wb") as f:
            pickle.dump(obj, f, protocol=protocol, **kwargs)
        if permissions:
            self._hdfs_cnxn.chmod(path, permissions)

    def read_pickle(self, path, **kwargs):
        """Reads a pickle file from a DISC `path`."""
        with self._hdfs_cnxn.open(path, "rb") as f:
            obj = pickle.load(f, **kwargs)
        return obj

    def upload_file(self, local_file, destination_file_path, rm_local=False,
                    overwrite=True, permissions=_default_permissions):
        """Uploads a file to DISC.

        Args:
            local_file (str): Path to the local file to be uploaded.
            destination_file_path (str): Destination path.
            rm_local (bool): If True, deletes the local file after upload (default is False).
            overwrite (bool): If True, overwrites the file at the destination (default is True).
            permissions (str or None): posix representation of permissions,
                given as an oct string, e.g. '777' (default) or 0o777.
        """
        if overwrite and self._hdfs_cnxn.exists(destination_file_path):
            self._hdfs_cnxn.rm(destination_file_path)
        self._hdfs_cnxn.upload(local_file, destination_file_path)
        if permissions:
            self._hdfs_cnxn.chmod(destination_file_path, permissions)
        if rm_local:
            os.remove(local_file)

    def upload(self, local_path, destination_path):
        """Uploads files and/or folders from `local_path` onto the DISC `destination_path`."""
        if not self._hdfs_cnxn.exists(destination_path):
            self._hdfs_cnxn.mkdir(destination_path)
        if os.path.isdir(local_path):
            for root, dirs, files in os.walk(local_path):
                if root == local_path:
                    dest_path = destination_path
                else:
                    relpath = os.path.relpath(root, local_path)
                    dest_path = f'{destination_path}/{relpath}'
                for d in dirs:
                    if not self._hdfs_cnxn.exists(f'{dest_path}/{d}'):
                        self._hdfs_cnxn.mkdir(f'{dest_path}/{d}')
                for file in files:
                    self.upload_file(local_file=f'{root}/{file}',
                                     destination_file_path=f'{dest_path}/{file}',
                                     rm_local=False)
        else:
            self.upload_file(local_file=local_path,
                             destination_file_path=destination_path,
                             rm_local=False)

    def savefig(self, path, ax=None, **kwargs):
        """Saves a matplotlib figure to the DISC `path` destination."""
        import matplotlib.pyplot as plt
        # An Axes object only exposes savefig via its Figure
        saver = plt if ax is None else ax.get_figure()
        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}_fig.png'
        saver.savefig(local_file, **kwargs)
        self.upload_file(local_file=local_file, destination_file_path=path, rm_local=True)

    def make_vintage(self, origin, freq='month', overwrite=False, deep=False, exclude=()):
        """Stores a dated vintage of `origin` under 'origin/VINTAGES'.

        Args:
            origin (str): The folder of which you want to store vintages. A new
                'origin/VINTAGES' folder will be created.
            freq (str): Frequency with which to store vintages. Choose between
                'year', 'month', 'day', 'max'. Default is 'month'.
            overwrite (bool): Whether to overwrite already existing vintages.
                Default is `False`.
            deep (bool): If True, stores as year/month/etc. Else in a single
                folder year_month_day_etc.
            exclude (list): Items (files or directories) in origin of which no
                vintaging should occur.
""" from datetime import datetime dtmt = datetime.today() _DAY, _MONTH, _YEAR = dtmt.day, dtmt.strftime('%b'), dtmt.year sep = '/' if deep else '_' vintage = f'{_YEAR}{sep}{_MONTH}' if freq=='year': vintage = f'{_YEAR}' elif freq=='day': vintage = f'{_YEAR}{sep}{_MONTH}{sep}{_DAY}' elif freq=='max': vintage = f'{_YEAR}{sep}{_MONTH}{sep}{_DAY}{sep}{dtmt.hour}h{dtmt.minute}m{dtmt.second}s' if self._hdfs_cnxn.exists(f'{origin}/VINTAGES/{vintage}') and not overwrite: raise PermissionError(f"Vintage 'VINTAGES/{vintage}' already exists at {origin}.\nTo override set `overwrite=True`.") else: self._hdfs_cnxn.mkdir(f'{origin}/VINTAGES/{vintage}') current = set(self.listdir(origin, full_path=False)) - ({'.', '..', 'VINTAGES'}|set(exclude)) for item in current: self.hdfs_mv(f'{origin}/{item}', f'{origin}/VINTAGES/{vintage}/') def hdfs_mv(self, origin_path, destination_path): """Moves files/directories from one DISC location to another.""" # if IS_WINDOWS: self._hdfs_cnxn.mv(origin_path, destination_path) # else: # (subprocess.Popen(f'hdfs dfs -mv {origin_path} {destination_path}', # stdout=subprocess.PIPE, shell=True) # .communicate()) def read_sql(self, query, **kwargs): """Performs a sql query on disc. Args: query (str): a SQL query. kwargs: Keyword arguments to be passed to pandas.read_sql. Returns: pd.DataFrame """ return pd.read_sql(query, con=self._cnxn, **kwargs) def __repr__(self): states = ('Not active', 'Active') sb, eb = ("\033[1m","\033[0;0m") if self._is_ipython else ('','') # Bold sr, er = ("\x1b[31m", "\x1b[0m") if self._is_ipython else ('','') # Red sg, eg = ("\033[92m", "\033[0m") if self._is_ipython else ('','') # Green clr_str = lambda isc:(sg,eg) if isc else (sr,er) state_str = lambda isc: f'{clr_str(isc)[0]}{states[isc]}{clr_str(isc)[1]}' _repr = f"\n{sb}DISC connection{eb}: {state_str(self._is_disc_connected)}"\ f"\nEngine: {self._engine}"\ f"\nSelected database: {self.db}"\ f"\n{sb}Spark connection{eb}: {state_str(self._is_spark_connected)}" return _repr def _repr_html_(self): with open(f'{os.path.dirname(__file__)}/res/connector_mini.svg', 'r') as f: _svg_cnn = f.read() states = ('Not active', 'Active') colors = ('#C82806', '#138F0B') bcolors = ('#FCD9D9', '#DBFCD9') html_repr = _svg_cnn + f"""</br> <span style="white-space: nowrap;"> <b>DISC connection</b>: <span style="color:{colors[self._is_disc_connected]}; background-color:{bcolors[self._is_disc_connected]}"; white-space: nowrap;>{states[self._is_disc_connected]}</span> </span></br> <span style="white-space: nowrap;"> <span style="color: gray">Engine:</span> <span white-space: nowrap;>{self._engine}</span> </span></br> <span style="white-space: nowrap;"> <span style="color: gray">Selected database:</span> <span white-space: nowrap;>{self.db}</span> </span></br> </br> <span style="white-space: nowrap;"> <b>Spark Connection</b>: <span style="color:{colors[self._is_spark_connected]}; background-color:{bcolors[self._is_spark_connected]}"; white-space: nowrap;>{states[self._is_spark_connected]}</span> </span>""" if self._is_spark_connected: html_repr +=f"""</br> <b><i>SparkContext</i></b></br> <a href="{self._spark_uri}">Spark UI</a></br> <span style="white-space: nowrap;"> <span style="color: gray">Master:</span> <span white-space: nowrap;>{self.spark.sparkContext.master}</span> </span></br> <span style="white-space: nowrap;"> <span style="color: gray">AppName:</span> <span white-space: nowrap;>{self.spark.sparkContext.appName}</span> </span></br> """ html_repr += f"""</br></br> <a href="{self.USER_GUIDE_URL}"> Need help? 
            Check the documentation!</a>"""
        return html_repr

    def show_spark_conf(self):
        """Displays the spark configurations."""
        if not self._is_spark_connected:
            print('Spark is not connected. To connect, try `disc.connect_spark()`')
        else:
            from IPython.core.display import HTML, display
            html_repr = "<br/><b><i>Spark Configurations</i></b><br/>"
            confs = self.spark.sparkContext.getConf().getAll()
            for cnf_k, cnf_v in confs:
                # cnf_k[6:] strips the leading 'spark.' prefix
                html_repr += f"""
                <span style="white-space: nowrap;">
                    <span style="color: gray">{cnf_k[6:]}:</span>
                    <span style="white-space: nowrap;">{cnf_v}</span>
                </span><br/>"""
            display(HTML(html_repr))

    def __log(self):
        # Best-effort usage counter; failures must never break the connection.
        try:
            _path = '/data/lab/dlb_ecb_public/share/_CONNECTORS_LOG'
            date = str(datetime.datetime.today().date())
            if date not in self.listdir(_path):
                self.to_pickle(0, f'{_path}/{date}/logs.p')
            else:
                L = self.read_pickle(f'{_path}/{date}/logs.p')
                self.to_pickle(L + 1, f'{_path}/{date}/logs.p')
        except Exception:
            pass

    def close(self, rm_local_temp=False):
        """Closes the connection to DISC.

        Args:
            rm_local_temp (bool): Delete the local temp folder. Default is False.
        """
        if rm_local_temp:
            shutil.rmtree(self.__TEMP_LOCAL_DIR, ignore_errors=True)
        self._cursor.close()
        self._cnxn.close()
        self._is_disc_connected = False
        self.stop_spark()
        if IS_WINDOWS:
            del self._hdfs_cnxn
        else:
            self._hdfs_cnxn.close()
        print('Closed connection to DISC.')

    def __del__(self):
        try:
            self.close()
        except Exception:
            pass
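
# A minimal usage sketch, assuming a CDSW session with valid Kerberos
# credentials; the database, table, and path names below are hypothetical.
if __name__ == '__main__':
    disc = DISC()
    disc.get_databases()                  # print the available databases
    disc.select_database('some_datalab')  # hypothetical database name
    df = disc.read_sql('SELECT * FROM some_table LIMIT 10')     # hypothetical table
    disc.to_parquet(df, '/data/lab/some_datalab/example.parq')  # hypothetical path
    disc.close()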