def keytab(self, arg):
    """Renew the Kerberos ticket using a principal and keytab.

    Args:
        arg: Raw magic argument string; the ``principal`` and ``keytab``
            options are read from it.

    Raises:
        Exception: If ``utils.renew_kerberos_ticket`` returns a falsy value.
    """
    # NOTE(review): the argument spec is taken from ``self.url`` rather than
    # ``self.keytab``; kept exactly as in the original -- confirm intent.
    parsed = ParameterArgs(parse_argstring(self.url, arg))
    principal = parsed.get("principal")
    keytab_path = parsed.get("keytab")
    renewed = utils.renew_kerberos_ticket(principal, keytab_path)
    if renewed:
        return
    raise Exception("Unable to renew kerberos ticket")
def sts(self, arg, line='', cell='', local_ns=None):
    """Run a query against a Spark Thrift Server.

    Example:
        # To initialize spark thrift server connection.
        %%sts -h 127.0.0.1 -p 1000
        select * from dim_cust limit 10

    Args:
        arg: Option string for the magic. When neither ``line`` nor ``cell``
            is given and it does not start with ``-``, it is treated as the
            query text instead of as options.
        line: Query text, used when ``cell`` is empty.
        cell: Query text; takes precedence over ``line``.
        local_ns: Unused; kept so the call signature stays unchanged.

    Returns:
        The value produced by ``self._process_results_`` for the result set.
    """
    # A bare invocation with no leading option flag carries the query in `arg`.
    if not (line or cell) and not arg.startswith("-"):
        line, arg = arg, ''
    args = ParameterArgs(parse_argstring(self.sts, arg))
    query = cell or line

    connection = self._get_connection_(ConnectionType.STS,
                                       cluster=args.get("cluster_name"),
                                       host=args.get("host"),
                                       port=args.get("port"),
                                       auth=args.get("auth"))
    result_set = connection.execute(query, self.autolimit,
                                    self.displaylimit, self.progress_bar)
    return self._process_results_(result_set,
                                  args.get('tableau'),
                                  args.get('publish'),
                                  args.get('tde_name'),
                                  args.get('project_name'))
def csv(self, arg, line='', cell='', local_ns=None):
    """Run a select statement against a CSV/TSV file.

    Accepted query format:
        select * from filename.csv/tsv

    Example queries:
        1. select * from test.csv
        2. select col1 from test.csv where col1=1
        3. select * from test.tsv

    Note:
        Currently the csv magic supports only select statements.

    Args:
        arg: Option string for the magic. When neither ``line`` nor ``cell``
            is given and it does not start with ``-``, it is treated as the
            query text instead of as options.
        line: Query text, used when ``cell`` is empty.
        cell: Query text; takes precedence over ``line``.
        local_ns: Unused; kept so the call signature stays unchanged.

    Returns:
        The value produced by ``self._process_results_`` for the result set
        (a DataFrame according to the original documentation).
    """
    # A bare invocation with no leading option flag carries the query in `arg`.
    if not (line or cell) and not arg.startswith("-"):
        line, arg = arg, ''
    args = ParameterArgs(parse_argstring(self.csv, arg))
    query = cell or line

    connection = self._get_connection_(ConnectionType.CSV, '')
    result_set = connection.execute(query)
    return self._process_results_(result_set,
                                  args.get('tableau'),
                                  args.get('publish'),
                                  args.get('tde_name'),
                                  args.get('project_name'))
def presto(self, arg, line='', cell='', local_ns=None):
    """Run a query on the Presto execution engine.

    Examples:
        %presto select * from cluster.default.dim_cust limit 10

        # To download data
        %%presto -d True
        select * from cluster.default.dim_cust limit 10

    Args:
        arg: Option string for the magic. When neither ``line`` nor ``cell``
            is given and it does not start with ``-``, it is treated as the
            query text instead of as options.
        line: Query text, used when ``cell`` is empty.
        cell: Query text; takes precedence over ``line``.
        local_ns: Unused; kept so the call signature stays unchanged.

    Returns:
        The value produced by ``self._process_results_`` for the result set.
    """
    # A bare invocation with no leading option flag carries the query in `arg`.
    if not (line or cell) and not arg.startswith("-"):
        line, arg = arg, ''
    args = ParameterArgs(parse_argstring(self.presto, arg))
    query = cell or line

    connection = self._get_connection_(ConnectionType.PRESTO,
                                       args.get("cluster_name"),
                                       args.get("host"),
                                       args.get("port"),
                                       args.get("auth"))
    result_set = connection.execute(query, self.autolimit,
                                    self.displaylimit, self.progress_bar)
    return self._process_results_(result_set,
                                  args.get('tableau'),
                                  args.get('publish'),
                                  args.get('tde_name'),
                                  args.get('project_name'))
def publish(self, arg, line='', cell='', local_ns=None):
    """Publish a variable, or the result of a nested magic, to Tableau.

    The text to publish is ``cell`` (or ``line`` when ``cell`` is empty):

    * text starting with ``%%`` is run as a cell magic: the first line
      holds the magic name and its options, the rest is the magic body;
    * text starting with ``%`` is run as a line magic;
    * anything else is looked up as a variable name in the user namespace
      (updated with ``local_ns`` when given).

    Args:
        arg: Option string for the magic. When neither ``line`` nor ``cell``
            is given and it does not start with ``-``, it is treated as the
            text to publish instead of as options.
        line: Text to publish, used when ``cell`` is empty.
        cell: Text to publish; takes precedence over ``line``.
        local_ns: Optional mapping merged over a copy of the shell's user
            namespace before the variable lookup.

    Returns:
        The value returned by the module-level ``publish`` function.

    Raises:
        KeyError: If the text names a variable that is not in the namespace.
    """
    if not (line or cell) and not arg.startswith("-"):
        line, arg = arg, ''
    args = ParameterArgs(parse_argstring(self.publish, arg))
    user_ns = self.shell.user_ns.copy()
    if local_ns:
        user_ns.update(local_ns)
    target = cell or line

    if target.startswith('%%'):
        # Split on the first newline *before* looking for the space, so a
        # cell magic without options ("%%hive\nselect ...") is parsed
        # correctly instead of folding the query into the magic name.
        header, _, body = target.partition("\n")
        magic_token, _, magic_args = header.partition(" ")
        result = get_ipython().run_cell_magic(
            magic_token.lstrip('%'), magic_args, body)
    elif target.startswith('%'):
        # partition() tolerates a magic with no arguments, where indexing
        # the result of split() raised IndexError.
        magic_token, _, magic_line = target.rstrip("\n").partition(" ")
        result = get_ipython().run_line_magic(
            magic_token.lstrip('%'), magic_line)
    else:
        # Cell bodies can carry a trailing newline; strip it so the lookup
        # matches the variable name.
        result = user_ns[target.strip()]

    return publish(result, args.get('tde_name'), args.get('project_name'))
def teradata(self, arg, line='', cell='', local_ns=None):
    """Execute a query on Teradata, or load CSV/DataFrame data into a table.

    Examples:
        # To download data
        %%teradata --host <teradata-host>
        select * from database.table_name sample 10

        # To insert csv data to a table
        %teradata -f dim_cust.csv -t pp_scratch.dim_cust

    Args:
        arg: Option string for the magic. When neither ``line`` nor ``cell``
            is given and it does not start with ``-``, it is treated as the
            query text instead of as options.
        line: Query text, used when ``cell`` is empty.
        cell: Query text; takes precedence over ``line``.
        local_ns: Optional mapping merged over a copy of the shell's user
            namespace; the merged namespace is handed to ``utils.csv_to_df``.

    Returns:
        The result of ``insert_csv`` when a table plus a csv/dataframe option
        is supplied, otherwise the value of ``self._process_results_``.
    """
    bare_call = not (line or cell)
    if bare_call and not arg.startswith("-"):
        line, arg = arg, ''

    args = ParameterArgs(parse_argstring(self.teradata, arg))

    namespace = self.shell.user_ns.copy()
    if local_ns:
        namespace.update(local_ns)

    query = cell if cell else line

    def connect():
        # Both code paths talk to the same cluster/host pair.
        return self._get_connection_(ConnectionType.TERADATA,
                                     args.get("cluster_name"),
                                     args.get("host"))

    has_source = args.get("csv") or args.get("dataframe")
    if args.get("table") and has_source:
        frame = utils.csv_to_df(namespace, args)
        connection = connect()
        return connection.insert_csv(args.get("table"), frame,
                                     self.autolimit, self.displaylimit)

    connection = connect()
    result_set = connection.execute(query, self.autolimit,
                                    self.displaylimit, self.progress_bar)
    return self._process_results_(result_set,
                                  args.get('tableau'),
                                  args.get('publish'),
                                  args.get('tde_name'),
                                  args.get('project_name'))
def run_pipeline(self, arg, line='', cell='', local_ns=None):
    """Run notebooks sequentially in a pipeline.

    A dictionary called ``_pipeline_workspace`` is created by the magic and
    shared by all the notebooks in the pipeline. The state can contain
    DataFrames, lists, dictionaries and objects. Notebook parameterization
    can be used to load and read from the shared state.

    The pipeline supports execution of parameterized notebooks. If
    parameters are used, the first code cell is treated as containing only
    parameter assignments. Parameters can be a string, number, list or
    dictionary.

    To save a notebook's execution in the pipeline, the save name should be
    specified along with the execution notebook, separated with a colon.

    Run parameters only change their equivalent parameters from the first
    code cell. Unknown parameters are ignored. Adding parameters on an
    execution is optional.

    Examples:
        # simple pipeline
        %%run_pipeline
        first notebook in pipeline;
        second notebook in pipeline;
        third notebook in pipeline

        # pipeline with parameterized notebooks
        %%run_pipeline
        first notebook in pipeline key01=int key01=string key02={'key01': param01};
        second notebook in pipeline;
        third notebook in pipeline:your save name key01=int key02=string key03=[param01, param02]

    Args:
        arg: Option string for the magic. When neither ``line`` nor ``cell``
            is given and it does not start with ``-``, it is treated as the
            pipeline text instead of as options.
        line: ``;``-separated run commands, used when ``cell`` is empty.
        cell: ``;``-separated run commands; takes precedence over ``line``.
        local_ns: Unused; kept so the call signature stays unchanged.

    Raises:
        CellExecutionError: Propagated when a notebook cell fails.
        Exception: Any other error raised while executing a cell or a
            notebook is propagated after the shared kernel is shut down.
    """
    # Executed after every notebook: wipes everything from the kernel's
    # namespace except the shared workspace dictionary.
    clear_namespace_cell = nbformat.v4.new_code_cell(
        source="from IPython import get_ipython\n" +
        "_ip = get_ipython()\n" +
        "_user_vars = %who_ls\n" +
        "for _var in _user_vars:\n" +
        "    if _var != '_pipeline_workspace':\n" +
        "        del _ip.user_ns[_var]\n" +
        "import gc\n" +
        "gc.collect()")
    pipeline_state_cell = nbformat.v4.new_code_cell(
        source="_pipeline_workspace = {'frames': list()}")

    if not (line or cell) and not arg.startswith("-"):
        line, arg = arg, ''
    # NOTE(review): options are parsed against ``self.run``'s argument spec
    # rather than ``self.run_pipeline``'s; kept as in the original --
    # confirm this is intended.
    args = ParameterArgs(parse_argstring(self.run, arg))
    pipeline_text = cell or line
    notebook_run_cmds = [cmd.strip() for cmd in pipeline_text.split(';')]

    execute_preprocessor = ExecutePreprocessor(
        kernel_name='python3', timeout=args.get('cell_timeout'))
    # One kernel is shared by every notebook in the pipeline.
    kernel_manager, kernel_comm = start_new_kernel(kernel_name='python3')
    execute_preprocessor.km = kernel_manager
    execute_preprocessor.kc = kernel_comm

    def execute_cell(nb4_cell):
        # Failures propagate to the caller; the kernel is torn down once, in
        # the ``finally`` clause that wraps the whole pipeline.
        execute_preprocessor.run_cell(nb4_cell)

    def execute_notebook(notebook_filename, notebook_save_filename, params):
        with open(notebook_filename) as file_handler:
            notebook = nbformat.read(file_handler, as_version=4)

        if params:
            # Only the first code cell is treated as the parameter cell.
            for nb_cell in notebook.cells:
                if nb_cell.cell_type == 'code':
                    nb_cell.source = utils.substitute_params(
                        nb_cell.source, params)
                    break

        execute_preprocessor.nb = notebook
        progress_bar = widgets.IntProgress(
            value=0,
            min=0,
            max=len(notebook.cells),
            step=1,
            bar_style='info',  # 'success', 'info', 'warning', 'danger' or ''
            orientation='horizontal')
        display_label = notebook_filename
        if notebook_save_filename:
            display_label = display_label + ' : ' + notebook_save_filename

        try:
            display(
                widgets.HBox([widgets.Label(display_label), progress_bar]))
            for idx, nb_cell in enumerate(notebook.cells):
                execute_preprocessor.preprocess_cell(
                    nb_cell, resources={'metadata': {}}, cell_index=idx)
                progress_bar.value = idx + 1
        except BaseException:
            progress_bar.bar_style = 'danger'
            raise
        else:
            progress_bar.bar_style = 'success'
        finally:
            # Save whatever was executed, including on failure.
            if notebook_save_filename:
                with open(notebook_save_filename, mode='wt') as file_handler:
                    nbformat.write(notebook, file_handler)

    try:
        execute_cell(pipeline_state_cell)
        for notebook_run_cmd in notebook_run_cmds:
            run_notebook_name, notebook_save_name, nb_params = \
                utils.parse_run_str(notebook_run_cmd)
            execute_notebook(run_notebook_name, notebook_save_name, nb_params)
            execute_cell(clear_namespace_cell)
    finally:
        # Always release the shared kernel, whatever went wrong above.
        try:
            kernel_comm.stop_channels()
        finally:
            kernel_manager.shutdown_kernel()
def run(self, arg, line='', cell='', local_ns=None):
    """Run one or more notebooks from another notebook.

    Allows for running parameterized notebooks. If parameters are used, the
    first code cell is treated as containing only parameter assignments.
    Parameters can be strings, numbers, lists or dictionaries.

    The magic can run notebooks sequentially or in parallel.

    To save a notebook's execution, the save name should be specified along
    with the execution notebook, separated with a colon.

    Run parameters only change their equivalent parameters from the first
    code cell. Unknown parameters are ignored. Adding parameters on an
    execution is optional.

    Examples:
        # simple run
        %run your notebook

        # simple sequential run
        %%run your notebook 01;
        your notebook 02

        # simple run allow errors
        %%run -e True your notebook

        # simple run show progress bar
        %%run -pbar True your notebook

        # simple run show progress bar and save execution
        %%run -pbar True your notebook:your save notebook

        # simple run in parallel with progressbar
        %%run -pbar True -p True your notebook 01;
        your notebook 02

        # simple run in parallel with progressbar and disabling cell timeout
        %%run -pbar True -t -1 your notebook 01;
        your notebook 02

        # parameterized run in parallel with progressbar
        %%run -pbar True -p True
        your notebook 01 key01=int key01=string key02={'key01': param01};
        your notebook 02:your save name key01=int key02=string key03=[param01, param02]

    Args:
        arg: Option string for the magic. When neither ``line`` nor ``cell``
            is given and it does not start with ``-``, it is treated as the
            run commands instead of as options.
        line: ``;``-separated run commands, used when ``cell`` is empty.
        cell: ``;``-separated run commands; takes precedence over ``line``.
        local_ns: Unused; kept so the call signature stays unchanged.

    Raises:
        CellExecutionError: Propagated when a notebook cell fails.
        Exception: Any other error raised while running a notebook is
            propagated after the notebook is saved and its kernel released.
    """
    if not (line or cell) and not arg.startswith("-"):
        line, arg = arg, ''
    args = ParameterArgs(parse_argstring(self.run, arg))
    run_text = cell or line
    notebook_run_cmds = [cmd.strip() for cmd in run_text.split(';')]

    def execute_notebook(notebook_filename, notebook_save_filename, params):
        log = UserMessages()

        with open(notebook_filename) as file_handler:
            notebook = nbformat.read(file_handler, as_version=4)

        execute_preprocessor = ExecutePreprocessor(
            timeout=args.get('cell_timeout'),
            allow_errors=args.get('allow_errors'))
        show_progress = args.get('enable_progress_bar')
        progress_bar = None  # widget; only created in progress-bar mode
        kernel_manager = None
        kernel_comm = None
        failed = False

        if params:
            # Only the first code cell is treated as the parameter cell.
            for nb_cell in notebook.cells:
                if nb_cell.cell_type == 'code':
                    nb_cell.source = utils.substitute_params(
                        nb_cell.source, params)
                    break

        try:
            if show_progress:
                progress_bar = widgets.IntProgress(
                    value=0,
                    min=0,
                    max=len(notebook.cells),
                    step=1,
                    bar_style='info',  # 'success', 'info', 'warning', 'danger' or ''
                    orientation='horizontal')

                # Cells are run one by one so the bar can advance, which
                # requires managing the kernel here.
                kernel_manager, kernel_comm = start_new_kernel(
                    kernel_name=notebook['metadata']['kernelspec']['name'])
                execute_preprocessor.km = kernel_manager
                execute_preprocessor.kc = kernel_comm
                execute_preprocessor.nb = notebook

                display_label = notebook_filename
                if notebook_save_filename:
                    display_label = (display_label + ' : ' +
                                     notebook_save_filename)
                display(
                    widgets.HBox(
                        [widgets.Label(display_label), progress_bar]))

                for idx, nb_cell in enumerate(notebook.cells):
                    execute_preprocessor.preprocess_cell(
                        nb_cell,
                        resources={'metadata': {}},
                        cell_index=idx)
                    progress_bar.value = idx + 1
            else:
                log.info("Running Notebook: " + notebook_filename)
                execute_preprocessor.preprocess(notebook, {'metadata': {}})
        except BaseException:
            # Record every failure, not just cell errors, so the status
            # reported below never claims success while an error propagates.
            failed = True
            if progress_bar is not None:
                progress_bar.bar_style = 'danger'
            raise
        finally:
            try:
                # Save whatever was executed, including on failure.
                if notebook_save_filename:
                    with open(notebook_save_filename,
                              mode='wt') as file_handler:
                        nbformat.write(notebook, file_handler)
            finally:
                # Release the kernel even if saving the notebook failed.
                if kernel_comm is not None:
                    kernel_comm.stop_channels()
                if kernel_manager is not None:
                    kernel_manager.shutdown_kernel()

            if progress_bar is not None:
                if not failed:
                    progress_bar.bar_style = 'success'
            elif failed:
                log.error(notebook_filename + " execution failed.")
            else:
                log.info(notebook_filename + " was executed successfully.")

    if args.get('parallel'):
        futures = []
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=20) as executor:
            for notebook_run_cmd in notebook_run_cmds:
                run_notebook_name, notebook_save_name, nb_params = \
                    utils.parse_run_str(notebook_run_cmd)
                futures.append(
                    executor.submit(execute_notebook, run_notebook_name,
                                    notebook_save_name, nb_params))

            # result() re-raises the first failure seen; leaving the
            # ``with`` block still waits for the remaining notebooks.
            for future in concurrent.futures.as_completed(futures):
                future.result()
    else:
        for notebook_run_cmd in notebook_run_cmds:
            run_notebook_name, notebook_save_name, nb_params = \
                utils.parse_run_str(notebook_run_cmd)
            execute_notebook(run_notebook_name, notebook_save_name, nb_params)
def hive(self, arg, line='', cell='', local_ns=None):
    """Execute a query on Hive, or load CSV/DataFrame data into a table.

    Examples:
        %%hive --hive_server hive.server.com --port 10000 --auth gssapi
        select * from database.table_name limit 10

        # To query data from hive
        %%hive
        select * from database.table_name limit 10

        # To insert csv data to a table
        %hive -f file.csv -t database.table_name

    Args:
        arg: Option string for the magic. When neither ``line`` nor ``cell``
            is given and it does not start with ``-``, it is treated as the
            query text instead of as options.
        line: Query text, used when ``cell`` is empty.
        cell: Query text; takes precedence over ``line``.
        local_ns: Optional mapping merged over a copy of the shell's user
            namespace; the merged namespace is handed to ``utils.df_to_csv``.

    Returns:
        The result of ``insert_csv`` when a table plus a csv/dataframe option
        is supplied, otherwise the value of ``self._process_results_``.
    """
    bare_call = not (line or cell)
    if bare_call and not arg.startswith("-"):
        line, arg = arg, ''

    args = ParameterArgs(parse_argstring(self.hive, arg))

    namespace = self.shell.user_ns.copy()
    if local_ns:
        namespace.update(local_ns)

    query = cell if cell else line

    has_source = args.get("csv") or args.get("dataframe")
    if args.get("table") and has_source:
        csv_data = utils.df_to_csv(namespace, args)
        # Tells insert_csv whether the data came from a DataFrame option.
        from_dataframe = bool(args.get("dataframe"))
        connection = self._get_connection_(ConnectionType.HIVE,
                                           cluster=args.get("cluster_name"),
                                           host=args.get("hive_server"),
                                           port=args.get("port"),
                                           auth=args.get("auth"))
        return connection.insert_csv(args.get("table"),
                                     args.get("name_node_url"),
                                     args.get("name_node_options"),
                                     csv_data,
                                     from_dataframe,
                                     self.autolimit,
                                     self.displaylimit)

    connection = self._get_connection_(
        ConnectionType.HIVE,
        cluster=args.get("cluster_name"),
        host=args.get("hive_server"),
        port=args.get("port"),
        auth=args.get("auth"),
        resource_manager=args.get("resource_manager_url"))
    result_set = connection.execute(query, self.autolimit,
                                    self.displaylimit, self.progress_bar)
    return self._process_results_(result_set,
                                  args.get('tableau'),
                                  args.get('publish'),
                                  args.get('tde_name'),
                                  args.get('project_name'))