def decorator_hidden(func):
    """Decoration-time handler for @hidden: registers a hidden option on the
    pending command (the module-global current_cmd_info).

    Relies on closure variables from an enclosing factory: name, type,
    default, help — TODO confirm against the enclosing decorator factory.
    """
    @functools.wraps(func)
    def wrapper_hidden(*args, **kwargs):
        # runtime wrapper: pure pass-through to the decorated function
        return func(*args, **kwargs)

    if debug_decorators:
        console.print("hidden decorator called, name=", name, ", func=", func.__name__)

    global current_cmd_info
    # current_cmd_info is set by a preceding @command decorator; error if missing
    if not current_cmd_info:
        errors.internal_error(
            "@hidden decorators must be followed by a single @command decorator"
        )

    # a hidden is really just a hidden option
    # normalize the type to its string name for the info dict
    type_name = type if isinstance(type, str) else type.__name__

    option_info = {
        "name": name,
        "hidden": True,
        "type": type_name,
        "default": default,
        "help": help
    }

    #current_cmd_info["hiddens"].append(hidden_info)
    update_or_insert_argument(current_cmd_info, "options", option_info)
    return wrapper_hidden
def decorator_keyword_arg(func):
    """Decoration-time handler for @keyword_arg: registers a keyword argument
    on the pending command (the module-global current_cmd_info).

    Relies on closure variables from an enclosing factory: name, keywords,
    required, type, default, help — TODO confirm against the enclosing factory.
    """
    @functools.wraps(func)
    def wrapper_keyword_arg(*args, **kwargs):
        # runtime wrapper: pure pass-through to the decorated function
        return func(*args, **kwargs)

    if debug_decorators:
        console.print("keyword_arg decorator called, name=", name, ", func=", func.__name__)

    global current_cmd_info
    # current_cmd_info is set by a preceding @command decorator; error if missing
    if not current_cmd_info:
        errors.internal_error(
            "@keyword_arg decorators must be followed by a single @command decorator"
        )

    # normalize the type to its string name for the info dict
    type_name = type if isinstance(type, str) else type.__name__

    arg_info = {
        "name": name,
        "keywords": keywords,
        "required": required,
        "type": type_name,
        "help": help,
        "default": default
    }

    #current_cmd_info["keyword_args"].insert(0, arg_info)
    update_or_insert_argument(current_cmd_info, "arguments", arg_info)
    return wrapper_keyword_arg
def report_on_runs(self, runs_by_exper, stage, max_items=None):
    """Print a per-experiment report of runs that match *stage*.

    :param runs_by_exper: dict mapping experiment name -> list of run records
    :param stage: stage name used both for filtering (self.match_stage) and messages
    :param max_items: if set, cap the number of runs shown per experiment
    """
    runs_reported = 0
    console.print("target={} runs: {}".format(self.compute, stage))

    # FIX: ReportBuilder is loop-invariant; build it once instead of once
    # per experiment (was constructed inside the loop)
    lb = ReportBuilder(self.config, self.store, client=None)

    exper_names = list(runs_by_exper.keys())
    exper_names.sort()

    for exper_name in exper_names:
        runs = runs_by_exper[exper_name]

        # filter runs for this stage
        runs = [run for run in runs if self.match_stage(run, stage)]

        if runs:
            console.print("\nruns for experiment {}:".format(exper_name))
            columns = ["xt_run_name", "status", "id", "number", "PORTAL_URL"]

            # truncate to max_items, if specified
            if max_items and len(runs) > max_items:
                runs = runs[:max_items]

            text, rows = lb.build_formatted_table(runs, columns, max_col_width=100)
            console.print(text)
            runs_reported += len(runs)

    if runs_reported:
        console.print("total runs {}: {}".format(runs_reported, stage))
    else:
        console.print(" no {} runs found\n".format(stage))
def _restart_runs_for_node(self):
    ''' non-atomic update of all active runs for this node: set to constants.WAITING '''
    # match active runs on this node whose status is STARTED or RESTARTED
    elem_dict = {
        "node_id": self.node_id,
        "status": {
            "$in": [constants.STARTED, constants.RESTARTED]
        }
    }
    fd = {"_id": self.job_id, "active_runs": {"$elemMatch": elem_dict}}

    while True:
        # this will only update a single array entry at a time (using mongo 3.2)
        cmd = lambda: self.mongo.mongo_db["__jobs__"].find_and_modify(
            fd,
            update={"$set": {
                "active_runs.$.status": constants.WAITING
            }},
            new=True)

        dd = self.mongo.mongo_with_retries("_restart_runs_for_node", cmd)
        if not dd:
            # no more matching runs; done
            break

        console.print("_restart_runs_for_node: found a run on node=" + self.node_id)
def decorator_clone(func):
    """Decoration-time handler for @clone: copies arguments and/or options
    from a previously-registered source command into the pending command.

    Relies on closure variables from an enclosing factory: source, arguments,
    options — TODO confirm against the enclosing decorator factory.
    """
    @functools.wraps(func)
    def wrapper_clone(*args, **kwargs):
        # runtime wrapper: pure pass-through to the decorated function
        return func(*args, **kwargs)

    if debug_decorators:
        console.print("clone decorator called, source=", source, ", func=", func.__name__)

    global current_cmd_info, root_cmd_info
    if not current_cmd_info:
        errors.internal_error(
            "@clone decorators must be followed by a single @command or @root decorator"
        )

    # the source command name uses "_" separators between its command words
    source_cmd_info = get_command_by_words(source.split("_"))

    if arguments:
        current_cmd_info["arguments"] += source_cmd_info["arguments"]
    if options:
        current_cmd_info["options"] += source_cmd_info["options"]

    return wrapper_clone
def root(self, name, value):
    """Apply a root-level XT flag.

    :param name: the root flag name (e.g. "console", "echo", "prep")
    :param value: the flag's parsed value
    Raises a syntax error for unrecognized flag names.
    """
    #console.print("setting root name={} to value={}".format(name, value))

    # "help" is handled later; "quick-start" was already handled
    if name in ("help", "quick-start"):
        return

    if name == "console":
        console.set_level(value)
        return

    if name == "stack-trace":
        utils.show_stack_trace = value
        return

    if name == "new":
        if value and process_utils.can_create_console_window():
            full_cmd = qfe.current_dispatcher.dispatch_cmd
            # relaunch ourselves in a new console, replacing --new with --echo
            echo_cmd = "xt " + full_cmd.replace("--new", "--echo", 1)
            process_utils.run_cmd_in_new_console(echo_cmd)
            errors.early_exit_without_error()
        return

    if name == "echo":
        if value:
            console.print("xt " + qfe.current_dispatcher.dispatch_cmd, flush=True)
        return

    if name == "prep":
        self.prep_machine_for_controller()
        return

    errors.syntax_error("unrecognized root flag=" + name)
def decorator_flag(func):
    """Decoration-time handler for @flag: registers a boolean flag option on
    the pending command (the module-global current_cmd_info).

    Relies on closure variables from an enclosing factory: name, default,
    help — TODO confirm against the enclosing decorator factory.
    """
    @functools.wraps(func)
    def wrapper_flag(*args, **kwargs):
        # runtime wrapper: pure pass-through to the decorated function
        return func(*args, **kwargs)

    if debug_decorators:
        console.print("flag decorator called, name=", name, ", func=", func.__name__)

    global current_cmd_info, root_cmd_info

    # a flag is really just a type=flag option
    option_info = {
        "name": name,
        "hidden": False,
        "type": "flag",
        "multiple": False,
        "default": default,
        "help": help
    }

    if not current_cmd_info:
        errors.internal_error(
            "@flag decorators must be followed by a single @command or @root decorator"
        )

    update_or_insert_argument(current_cmd_info, "options", option_info)
    return wrapper_flag
def decorator_option(func):
    """Decoration-time handler for @option: registers a named option on the
    pending command (the module-global current_cmd_info).

    Relies on closure variables from an enclosing factory: name, required,
    type, multiple, default, values, help — TODO confirm against factory.
    """
    @functools.wraps(func)
    def wrapper_option(*args, **kwargs):
        # runtime wrapper: pure pass-through to the decorated function
        return func(*args, **kwargs)

    if debug_decorators:
        console.print("option decorator called, name=", name, ", func=", func.__name__)

    global current_cmd_info
    # current_cmd_info is set by a preceding @command decorator; error if missing
    if not current_cmd_info:
        errors.internal_error(
            "@option decorators must be followed by a single @command decorator"
        )

    # normalize the type to its string name for the info dict
    type_name = type if isinstance(type, str) else type.__name__

    option_info = {
        "name": name,
        "hidden": False,
        "required": required,
        "type": type_name,
        "multiple": multiple,
        "default": default,
        "values": values,
        "help": help
    }

    #current_cmd_info["options"].append(option_info)
    update_or_insert_argument(current_cmd_info, "options", option_info)
    return wrapper_option
def decorator_command(func):
    """Decoration-time handler for @command: registers *func* as an XT command.

    Builds a cmd_info dict, inserts it into the nested `commands` tree keyed
    by the space-separated words of the command name, and publishes it as the
    module-global current_cmd_info so that subsequent argument/option/flag
    decorators can attach to it.

    Relies on closure variables from an enclosing factory: name,
    options_before_args, keyword_optional, pass_by_args, group, kwgroup,
    kwhelp, help — TODO confirm against the enclosing decorator factory.
    """
    @functools.wraps(func)
    def wrapper_command(*args, **kwargs):
        # runtime wrapper: pure pass-through to the decorated function
        return func(*args, **kwargs)

    # begin actual decorator processing
    global first_command
    if first_command:
        first_command = False
        # console.diag("processing first cmd decorator")
        #console.print("first command...")

    # command name defaults to the function name with "_" replaced by " "
    if name:
        cmd_name = name
    else:
        cmd_name = func.__name__.replace("_", " ")

    if debug_decorators:
        console.print("command decorator called, func=", func.__name__)

    # walk (and extend as needed) the nested command-word tree
    dd = commands
    for name_part in cmd_name.split(" "):
        if name_part not in dd:
            dd[name_part] = {}
        dd = dd[name_part]

    cmd_info = {
        "name": cmd_name,
        "options_before_args": options_before_args,
        "keyword_optional": keyword_optional,
        "pass_by_args": pass_by_args,
        "group": group,
        "func": func,
        "arguments": [],
        "options": [],
        "examples": [],
        "faqs": [],
        "hidden": False,
        "see_alsos": [],
        "kwgroup": kwgroup,
        "kwhelp": kwhelp,
        "help": help
    }

    # the "" key holds this node's own command info
    dd[""] = cmd_info

    if keyword_optional:
        # only 1 command can use this
        if "" in commands:
            errors.internal_error(
                "processing command decoration for '{}'; only 1 command can use 'keyword_optional'"
                .format(func.__name__))

        commands[""] = cmd_info

    global current_cmd_info
    current_cmd_info = cmd_info
    # end actual decorator processing

    return wrapper_command
def syntax_error(self, msg):
    """Report a command syntax error and abort.

    Prints *msg* plus the current command's syntax, then either raises a
    syntax exception (when configured) or exits the process.
    """
    console.print(msg)
    self.show_current_command_syntax()

    if self.raise_syntax_exception:
        errors.syntax_error("syntax error")
    errors.syntax_error_exit()
def warning(self, *msg_args):
    """Emit an xt_config warning built from *msg_args*.

    When the internal "raise" config setting is on, the warning is escalated
    to a config error; otherwise it is printed to the console.
    """
    parts = ["WARNING: xt_config file -"] + [str(part) for part in msg_args]
    msg = " ".join(parts)

    if self.get("internal", "raise", suppress_warning=True):
        errors.config_error(msg)
    else:
        console.print(msg)
def debug_break(): import ptvsd # 5678 is the default attach port in the VS Code debug configurations console.print("Waiting for debugger attach") ptvsd.enable_attach(address=('localhost', 5678), redirect_output=True) ptvsd.wait_for_attach() breakpoint()
def rerun(self, run_name, workspace, response):
    """Re-submit a previous run, letting the user edit/accept its xt command.

    :param run_name: name of the run to re-run
    :param workspace: workspace containing the run
    :param response: optional automation string; "$cmd" is replaced by the
        original command; when empty the user is prompted interactively
    """
    # NOTE: validate_run_name() call must be AFTER we call process_named_options()
    run_name, workspace = run_helper.parse_run_name(workspace, run_name)

    # extract "prompt" and "args" from cmdline
    cmdline, xt_cmdline, box_name, parent_name, node_index = self.get_info_for_run(
        workspace, run_name)
    #console.print("cmdline=", cmdline)

    prompt = ""
    if xt_cmdline:
        args = " " + xt_cmdline
    else:
        # legacy run; just use subset of xt cmd
        args = " xt " + cmdline

    console.print("edit/accept xt cmd for {}/{}".format(
        workspace, run_name))

    if response:
        # allow user to supplement the cmd with automation
        if "$cmd" in response:
            response = response.replace("$cmd", args)
        console.print(response)
    else:
        response = pc_utils.input_with_default(prompt, args)

    # keep RERUN cmd simple by reusing parse_python_or_run_cmd()
    full_cmd = response.strip()
    #console.print(" new_cmd=" + full_cmd)

    if not full_cmd.startswith("xt "):
        errors.syntax_error(
            "command must start with 'xt ': {}".format(full_cmd))

    # this temp dir cannot be removed immediately after job is submitted (TBD why)
    tmp_dir = file_utils.make_tmp_dir("rerun_cmd")
    job_id = self.store.get_job_id_of_run(workspace, run_name)
    capture.download_before_files(self.store,
                                  job_id,
                                  workspace,
                                  run_name,
                                  tmp_dir,
                                  silent=True,
                                  log_events=False)

    # move to tmp_dir so files get captured correctly
    prev_cwd = os.getcwd()
    os.chdir(tmp_dir)

    try:
        # recursive invoke of QFE parser to parse command (original + user additions)
        args = full_cmd.split(" ")
        # drop the "xt" at beginning
        args = args[1:]
        inner_dispatch(args, is_rerun=True)
    finally:
        # change back to original dir
        os.chdir(prev_cwd)
def print_cancel_all_results(self, cancel_results_by_boxes):
    """Print per-target cancel results.

    :param cancel_results_by_boxes: dict mapping target name -> list of
        result dicts (keys: "cancelled", "service_status", "simple_status")
    """
    for target, target_results in cancel_results_by_boxes.items():
        console.print("Target: {}".format(target))

        for entry in target_results:
            was_cancelled = entry.get("cancelled")
            service_status = entry.get("service_status")
            simple_status = entry.get("simple_status")

            console.print(
                "canceled: {}, service_status: {}, simple_status: {}".format(
                    was_cancelled, service_status, simple_status))
def zip(self, files, zipfile):
    """Zip the files matched by *files* into the archive *zipfile*.

    .git and __pycache__ entries are excluded; archive paths are made
    relative to the directory part of *files*.
    """
    matched = file_helper.get_filenames_from_include_lists(
        [files], [".git", "__pycache__"], recursive=True)
    count = len(matched)

    # strip "<source_dir>/" so archive entries are relative paths
    base_dir = os.path.dirname(files)
    prefix_len = 1 + len(base_dir)

    file_helper.zip_up_filenames(zipfile, matched, True, prefix_len)
    console.print("{:,} files written to: {}".format(count, zipfile))
def get_merged_config(create_if_needed=True, local_overrides_path=None,
                      suppress_warning=False, mini=False):
    """Load the default xt_config, merge a local override file if present,
    and return the resulting config object.

    mini_mode is derived from the "general/advanced-mode" setting, except on
    compute nodes (detected via env vars), where it is forced off.

    NOTE(review): parameters create_if_needed and mini are unused in this
    body — confirm whether they are vestigial or consumed elsewhere.
    """
    fn_default = get_default_config_path()
    config = load_and_validate_config(fn_default, validate_as_default=True)

    # apply local override file, if present
    fn_overrides = local_overrides_path if local_overrides_path else constants.FN_CONFIG_FILE
    fn_overrides = os.path.realpath(fn_overrides)

    sc = os.getenv("XT_STORE_CREDS")
    mc = os.getenv("XT_MONGO_CONN_STR")

    if sc and mc:
        # we are running on compute node (launched by script)
        console.print(
            "XT: detected run on compute node; setting mini_mode=False")
        config.mini_mode = False
    else:
        # get mini_mode value from default config (modified further below)
        config.mini_mode = not config.get("general", "advanced-mode")

    if config.mini_mode:
        suppress_warning = True

    if os.path.exists(fn_overrides):
        overrides = load_and_validate_config(fn_overrides,
                                             validate_as_default=False)
        if not overrides.data:
            console.warning("local xt_config.yaml file contains no properties")
        else:
            # allow overrides to override the mini_mode flag
            if not (sc and mc):
                config.mini_mode = not overrides.get(
                    "general", "advanced-mode", suppress_warning=True)

            # hardcoded MINI options (can be overwritten by local confile file)
            if config.mini_mode:
                # single workspace
                config.data["general"]["workspace"] = "txt"
                # single target
                config.data["xt-services"]["target"] = "batch"

            # merge the overrides config with the default config
            merge_configs(config, overrides)
    else:
        if not suppress_warning:
            console.print("warning: no local config file found")

    console.detail("after loading/validation of merged config files")
    return config
def remove_cache(self, ws_name):
    """Delete the run-summary cache directory for workspace *ws_name*,
    if a run cache directory is configured."""
    if not self.run_cache_dir:
        return

    # remove appropriate node of run_cache_dir
    fn = os.path.expanduser(self.run_cache_dir) + "/" + constants.RUN_SUMMARY_CACHE_FN
    fn = fn.replace("$ws", ws_name)
    target_dir = os.path.dirname(fn)

    if os.path.exists(target_dir):
        console.print(" zapping cache_dir=", target_dir)
        file_utils.zap_dir(target_dir)
def load_runs(self, all_run_records, plot_x_metric_name, plot_y_metric_name,
              hist_x_metric_name):
    """Build self.runs from run records, skipping runs with no metric
    reports (parent runs) and capping the total at MAX_NUM_RUNS (if > 0)."""
    self.runs = []

    for record in all_run_records:
        candidate = Run(record, plot_x_metric_name, plot_y_metric_name,
                        hist_x_metric_name)
        if not candidate.metric_reports:
            # Exclude parent runs.
            continue

        self.runs.append(candidate)
        if MAX_NUM_RUNS > 0 and len(self.runs) == MAX_NUM_RUNS:
            break

    console.print("{} runs downloaded".format(len(self.runs)))
def monitor_with_jupyter(self, workspace, run_name):
    """Create a Jupyter monitoring notebook for an Azure ML run and print
    the command to launch it. Errors out for non-AML workspaces."""
    if not self.is_aml_ws(workspace):
        errors.combo_error(
            "the monitor command is only supported for Azure ML runs")

    run_name, actual_ws = run_helper.parse_run_name(workspace, run_name)
    notebook_fn = self.azure_ml.make_monitor_notebook(actual_ws, run_name)
    notebook_dir = os.path.dirname(notebook_fn)
    #console.print("jupyter notebook written to: " + notebook_fn)

    monitor_cmd = "jupyter notebook --notebook-dir=" + notebook_dir
    console.print("monitoring notebook created; to run:")
    console.print(" " + monitor_cmd)
def addr(self, box):
    """Print the address of *box*, plus controller/tensorboard ports when a
    controller port is known.

    :param box: box name to look up via box_information.get_box_addr
    """
    box_name = box
    info = box_information.get_box_addr(self.config, box_name, self.store)
    box_addr = info["box_addr"]
    controller_port = info["controller_port"]
    tb_port = info["tensorboard_port"]

    if controller_port:
        # FIX: format string had only 3 placeholders for 4 args, so tb_port
        # was silently dropped ("tensorboard port" printed with no value)
        console.print(
            "{} address: {}, controller port={}, tensorboard port={}".format(
                box_name, box_addr, controller_port, tb_port))
    else:
        console.print("{} address: {}".format(box_name, box_addr))
def collect_logs(self, workspace, run_names, log_path):
    """Collect log files for each run matched by *run_names*.

    NOTE(review): grok_server is hardcoded to None (the config lookup is
    commented out), so the final message always reports server None —
    confirm whether grok-server support is intentionally disabled.
    """
    run_names, actual_ws = run_helper.parse_run_list(
        self.store, workspace, run_names)
    if len(run_names) == 0:
        self.store_error("No matching runs found")

    grok_server = None  # self.config.get("logging", "grok-server")
    count = 0

    # accumulate the number of log files collected across all runs
    for run_name in run_names:
        count += self.core.collect_logs_for_run(actual_ws, run_name,
                                                log_path, grok_server)

    console.print("{} log file collected to grok server: {}".format(
        count, grok_server))
def cancel_runs_by_property(self, prop_name, prop_value, box_name):
    """Connect to *box_name*'s controller and cancel runs matching
    prop_name == prop_value.

    :returns: the controller's cancel results, or None when the connection
        failed or an exception was reported.
    """
    results = None
    try:
        # connect to specified box
        if not self.client.change_box(box_name):
            console.print(
                "couldn't connect to controller for {}".format(box_name))
        else:
            results = self.client.cancel_runs_by_property(
                prop_name, prop_value)
    except BaseException as ex:
        # report, but don't propagate — caller receives None
        errors.report_exception(ex)

    return results
def docker_login(self, target, docker):
    """Log in to the docker registry configured for *target*/*docker* and
    print the login output. Errors out when no registry creds are found."""
    reg_creds = self.get_registry_creds(target, docker)

    if not reg_creds:
        if docker:
            msg = "no dockers entry defined for docker '{}'".format(docker)
        else:
            msg = "no docker property defined for target '{}'".format(target)
        errors.env_error(msg)

    text = self.core.docker_login(reg_creds["login-server"],
                                  reg_creds["username"],
                                  reg_creds["password"])
    console.print(text)
def mongo_with_retries(self, name, mongo_cmd, ignore_error=False):
    """Execute *mongo_cmd* (a zero-arg callable) with retry and random backoff.

    :param name: label used in retry/ignore log messages
    :param mongo_cmd: callable issuing the mongo operation
    :param ignore_error: if True, swallow the first error and return None
    :returns: result of mongo_cmd(), or None if it never succeeded
    :raises: the last exception, after retry_count attempts
    """
    retry_count = 25
    result = None
    import pymongo.errors

    for i in range(retry_count):
        try:
            result = mongo_cmd()
            break
        # watch out for these exceptions: AutoReconnect, OperationFailure (and ???)
        except BaseException as ex:   # pymongo.errors.OperationFailure as ex:
            # since we cannot config logger to supress stderr, don't log this
            #logger.exception("Error in mongo_with_retries, ex={}".format(ex))

            # pymongo.errors.OperationFailure: Message: {"Errors":["Request rate is large"]}

            if ignore_error:
                console.print(
                    "ignoring mongo-db error: name={}, ex={}".format(
                        name, ex))
                break

            if i == retry_count - 1:
                # we couldn't recover - signal a hard error/failure
                raise ex

            # we get hit hard on the "Request rate is large" errors when running
            # large jobs (500 simultaneous runs), so beef up the backoff times to
            # [1,61] so we don't die with a hard failure here
            if i == 0:
                backoff = 1 + 10 * np.random.random()
                self.retry_errors += 1
            else:
                backoff = 1 + 60 * np.random.random()

            ex_code = ex.code if hasattr(ex, "code") else ""
            ex_msg = str(ex)[0:60] + "..."

            console.print(
                "retrying mongo-db: name={}, retry={}/{}, backoff={:.2f}, ex.code={}, ex.msg={}"
                .format(name, i + 1, retry_count, backoff, ex_code, ex_msg))

            time.sleep(backoff)

    return result
def create_demo(self, destination, response, overwrite): ''' This command will removed the specified destination directory if it exists (prompting the user for approval). Specifying the current directory as the destination will produce an error. ''' # set up from_dir from_dir = file_utils.get_xtlib_dir() + "/demo_files" # set up dest_dir dest_dir = destination if not dest_dir: errors.syntax_error("An output directory must be specified") create = True console.print("creating demo files at: {}".format( os.path.abspath(dest_dir))) if os.path.exists(dest_dir): answer = pc_utils.input_response( "'{}' already exists; OK to delete? (y/n): ".format(dest_dir), response) if answer != "y": create = False if create: file_utils.ensure_dir_deleted(dest_dir) shutil.copytree(from_dir, dest_dir) #file_utils.copy_tree(from_dir, dest_dir) if not self.store.does_workspace_exist("xt-demo"): # import xt-demo workspace from archive file console.print( "importing xt-demo workspace (usually takes about 30 seconds)" ) impl_storage_api = ImplStorageApi(self.config, self.store) fn_archive = os.path.join(file_utils.get_xtlib_dir(), "demo_files", "xt-demo-archive.zip") impl_storage_api.import_workspace(fn_archive, "xt-demo", "xtd", overwrite=overwrite, show_output=False)
def parse_string_list(self, tok, scanner, pipe_objects_enabled=True):
    """Parse a comma-separated string list from the scanner.

    Handles three forms: an empty string (returns []), a "$" placeholder
    (replaced by objects piped from a previous XT command), or a plain
    comma-separated token list (stopping at "--option" tokens).

    :param tok: the current token
    :param scanner: token scanner; scanner.scan() yields the next token or None
    :param pipe_objects_enabled: whether "$" substitution is allowed
    :returns: (value_list, next_token)
    """
    # FIX: removed a redundant duplicate `global pipe_object_list` that
    # appeared again inside the "$" branch; one declaration covers the scope
    global pipe_object_list
    #print("parse_string_list, tok=", tok)

    if not tok:
        # empty string specified
        value = []
        tok = scanner.scan()  # skip over the empty string
    elif tok == "$":
        if pipe_objects_enabled:
            pipe_object_list = get_xt_objects_from_cmd_piping()
            console.diag("pipe_object_list: {}".format(pipe_object_list))

        if pipe_objects_enabled and pipe_object_list:
            #print("found '*', pipe_object_list=", pipe_object_list)
            value = pipe_object_list
            console.print("replacing '$' with: ", value)
        else:
            errors.combo_error(
                "'$' can only be used for piping the output of a previous XT command into this run"
            )

        # mark pipe objects as having been consumed by this parsing
        pipe_object_list = None
        tok = scanner.scan()  # skip over the $
    else:
        # scan a comma separated list of tokens (some of which can be single quoted strings)
        value = []
        # FIX: `tok != None` -> `tok is not None` (identity check for None)
        while tok is not None:
            if tok.startswith("--"):
                # start of an option; leave it for the caller
                break

            ev = self.expand_system_values(tok)
            value.append(ev)

            tok = scanner.scan()
            if tok != ",":
                break
            tok = scanner.scan()  # skip over the comma

    return value, tok
def download_runs(self, store, ws_name, run_group_name, run_group_type,
                  hp_config_cloud_path, hp_config_local_dir):
    """Download the hp-config file and all-runs records for an experiment or job.

    :param run_group_name: "experiment" selects experiment-level files;
        anything else selects job-level files
    :param run_group_type: the experiment or job identifier
    :returns: (local_config_file_path, allrun_records)
    """
    # Download the all_runs file
    local_cache_path = "{}/{}/{}/".format(hp_config_local_dir, ws_name,
                                          run_group_type)
    local_config_file_path = "{}{}".format(local_cache_path, "hp-config.yaml")

    if run_group_name == "experiment":
        console.print(
            "downloading runs for EXPERIMENT={}...".format(run_group_type))
        # files are at EXPERIMENT LEVEL
        # read SWEEPS file
        if not store.does_experiment_file_exist(ws_name, run_group_type,
                                                hp_config_cloud_path):
            errors.store_error(
                "missing experiment hp_config file (ws={}, exper={}, fn={})"
                .format(ws_name, run_group_type, hp_config_cloud_path))
        store.download_file_from_experiment(ws_name, run_group_type,
                                            hp_config_cloud_path,
                                            local_config_file_path)

        # read ALLRUNS info aggregated in EXPERIMENT
        allrun_records = store.get_all_runs(run_group_name, ws_name,
                                            run_group_type)
    else:
        console.print(
            "downloading runs for JOB={}...".format(run_group_type))
        # files are at JOB LEVEL
        # read SWEEPS file
        if not store.does_job_file_exist(run_group_type, hp_config_cloud_path):
            errors.store_error(
                "missing job hp_config file (job={}, fn={})".format(
                    run_group_type, hp_config_cloud_path))
        store.download_file_from_job(run_group_type, hp_config_cloud_path,
                                     local_config_file_path)

        # read ALLRUNS info aggregated in JOB
        allrun_records = store.get_all_runs(run_group_name, ws_name,
                                            run_group_type)

    console.diag("after downloading all runs")
    return local_config_file_path, allrun_records
def restart_controller(self, job_id, node_index, delay):
    """Ask the XT controller on the given job node to restart itself.

    Prints a success message, or the controller's ip/port when the
    connection (or the restart request) fails.
    """
    result = None

    # get the connection string for the job/node
    cs_plus = job_helper.get_client_cs(self.core, job_id, node_index)
    cs = cs_plus["cs"]
    box_secret = cs_plus["box_secret"]

    with XTClient(self.config, cs, box_secret) as xtc:
        if xtc.connect():
            result = xtc.restart_controller(delay)

    if not result:
        console.print(
            "could not connect to controller: ip={}, port={}".format(
                cs["ip"], cs["port"]))
    else:
        console.print("controller restarted")
def import_workspace(self, input_file, new_workspace, job_prefix, overwrite,
                     show_output=True):
    """Import a workspace archive into *new_workspace*, staging it in a
    temporary directory. A blank job prefix is an error."""
    if not job_prefix:
        errors.combo_error("job prefix cannot be blank")

    # stage the archive contents in a throwaway directory
    with tempfile.TemporaryDirectory(prefix="import-") as work_dir:
        self.import_workspace_core(work_dir, input_file, new_workspace,
                                   job_prefix, overwrite,
                                   show_output=show_output)

    if show_output:
        console.print(" import completed")
def run(self, timeout=None):
    """Show the plot window; when *timeout* is given, auto-close all plot
    windows after that many seconds via a daemon timer thread."""
    if not self.runs:
        console.print("error - no valid runs found")
        return

    if timeout:
        # build a thread to close our plot window after specified time
        from threading import Thread

        def close_after(secs):
            console.print("set_timer called: timeout=", secs)
            time.sleep(secs)
            console.diag("timer triggered!")
            plt.close("all")

        timer_thread = Thread(target=close_after, args=[timeout])
        timer_thread.daemon = True  # background thread; won't block interpreter exit
        timer_thread.start()

    plt.show()