def get_client_cs(self, service_node_info):
    ''' Build connection info for the controller on the compute node that
    the service mapped this job to.

    Args:
        service_node_info: info that service maps to a compute node for a job
    Returns:
        {"ip": value, "port": value, "box_name": value}
    Raises:
        Exception: if the resolved box address is neither an IP address
            (no "." present) nor "localhost".
    '''
    box_name = service_node_info["box_name"]
    controller_port = constants.CONTROLLER_PORT

    # a box missing from the config file may still be this machine;
    # map it onto the well-known "local" entry
    if box_name not in self.config.get("boxes"):
        if pc_utils.is_localhost(box_name):
            box_name = "local"

    box_addr = self.config.get("boxes", box_name, dict_key="address",
        default_value=box_name,
        prop_error="box not defined in config file: " + box_name)

    if "@" in box_addr:
        # strip off the username
        _, box_addr = box_addr.split("@", 1)

    #console.print("box_addr=", box_addr)
    if "." not in box_addr and box_addr != "localhost":
        raise Exception("box option must specify a machine by its IP address: " + str(box_addr))

    cs = {"ip": box_addr, "port": controller_port, "box_name": box_name}
    return cs
def get_psm_client(self, service_node_info):
    ''' Return the PSM client for the node described by service_node_info:
    a LocalPsmClient when the node's address is this machine, otherwise a
    RemotePsmClient for the given address and OS. '''
    addr = service_node_info["box_addr"]
    windows_box = service_node_info["box_os"] == "windows"

    if pc_utils.is_localhost(box_addr=addr):
        return LocalPsmClient()
    return RemotePsmClient(addr, windows_box)
def run_job_on_box(self, job_id, run_data_list, box_index, box_info, app_info,
        pool_info, resume_name=None, repeat=None, using_hp=None,
        exper_name=None, snapshot_dir=None, args=None):
    ''' Enqueue the first run of a job on the specified box through its
    PSM (per-box service manager).

    Args:
        job_id: id of the job being submitted
        run_data_list: list of run-data dicts; only the first run is enqueued
        box_index: index of this box within the pool (used for node_id)
        box_info: box descriptor (box_name, address, box_os)
        app_info, pool_info: accepted for interface compatibility (unused here)
        resume_name, repeat, using_hp, exper_name, snapshot_dir, args: unused here
    Returns:
        service_node_info dict identifying the queued entry
        (fn_entry, box_addr, box_os, box_name, job_id, run_name)
    '''
    box_name = box_info.box_name
    box_addr = box_info.address
    box_os = box_info.box_os

    run_data = run_data_list[0]
    run_name = run_data["run_name"]

    # reuse get_psm_client() so the local-vs-remote selection lives in one place
    psm_client = self.get_psm_client({"box_addr": box_addr, "box_os": box_os})
    psm_client.restart_psm_if_needed()
    #print("psm created for box: " + box_addr)

    team = self.config.get("general", "xt-team-name")
    node_id = utils.node_id(box_index)
    cwd_dir = os.path.expanduser(constants.CWD_DIR)
    fn_src_zip = file_utils.path_join(cwd_dir, constants.CODE_ZIP_FN)

    fn_entry = psm_client.enqueue(team, job_id, run_name, node_id, fn_src_zip)

    service_node_info = {"fn_entry": fn_entry, "box_addr": box_addr,
        "box_os": box_os, "box_name": box_name, "job_id": job_id,
        "run_name": run_name}

    fb.feedback("submitted", is_final=True)
    return service_node_info
def cancel_thru_os(self, box_name, show_progress=True):
    ''' kill the controller process on the specified local/remote box

    Args:
        box_name: name/address of the box whose controller should be cancelled
        show_progress: when True, progress goes to console.print; otherwise
            to console.diag
    Returns:
        result of the local or remote cancel call
    '''
    # NOTE: the docstring above was previously a stray string expression
    # buried mid-function; moved to the canonical position (no-op change).
    progress = console.print if show_progress else console.diag
    progress(" checking running processes on: " + box_name)

    is_local = pc_utils.is_localhost(box_name)
    #console.print("box_name=", box_name, ", is_local=", is_local)

    if is_local:    # pc_utils.is_localhost(box_name, box_addr):
        result = self.cancel_local_controller(progress)
    else:
        result = self.cancel_remote_controller(box_name, progress)

    return result
def keysend(self, box):
    # syntax: xt keysend <box name>
    ''' Send our public key to the specified remote box; errors out for a
    missing name, the local machine, or the azure-batch pseudo-box. '''
    box_name = box
    if not box_name:
        errors.syntax_error("must specify a box name/address")

    info = box_information.get_box_addr(self.config, box_name, self.store)
    box_addr = info["box_addr"]

    not_remote = pc_utils.is_localhost(box_name, box_addr) or box_name == "azure-batch"
    if not_remote:
        errors.syntax_error("must specify a remote box name or address (e.g., xt keysend [email protected]")

    console.print("this will require 2 connections to the remote host, so you will be prompted for a password twice")

    if self.core.keysend(box_name):
        console.print("public key successfully sent.")
def adjust_run_commands(self, job_id, job_runs, using_hp, experiment,
        service_type, snapshot_dir, args):
    '''
    This method is called to allow the backend to inject needed shell commands
    before the user cmd.  This base implementation does so by generating a new
    script file and adding it to the snapshot_dir.
    '''
    # unpack storage/data/model action settings from the run args
    store_data_dir, data_action, data_writable, store_model_dir, model_action, model_writable, \
        storage_name, storage_key = self.get_action_args(args)

    # local or POOL of vm's
    fn_wrapped = None    # we use same generated script on each box/job
    data_local = args["data_local"]
    model_local = args["model_local"]

    for i, box_runs in enumerate(job_runs):
        # wrap the user commands in FIRST RUN of each box (apply data/model actions)
        br = box_runs[0]
        box_info = br["box_info"]
        box_name = box_info.box_name
        box_secret = br["box_secret"]
        actions = box_info.actions
        node_id = utils.node_id(i)
        is_windows = box_info.box_os == "windows"

        run_specs = br["run_specs"]
        cmd_parts = run_specs["cmd_parts"]
        run_name = br["run_name"]

        if not fn_wrapped:
            # we only do this once (for the first box/job)
            using_localhost = pc_utils.is_localhost(box_name, box_info.address)

            # data_local overrides store_data_dir for LOCAL machine
            if using_localhost and data_local:
                store_data_dir = os.path.join(os.path.expanduser(data_local), store_data_dir)
                data_action = "use_local"

                if not "data" in actions:
                    actions.append("data")

            # model_local overrides store_model_dir for LOCAL machine
            if using_localhost and model_local:
                store_model_dir = os.path.join(os.path.expanduser(model_local), store_model_dir)
                model_action = "use_local"

                if not "model" in actions:
                    actions.append("model")

            setup = self.config.get_setup_from_target_def(self.compute_def)
            env_vars = self.get_env_vars_for_box(box_name, box_info, i, box_secret)
            post_cmds = []

            # add env vars to script
            setter = "@set" if is_windows else "export"

            for name, value in env_vars.items():
                cmd = "{} {}={}".format(setter, name, value)
                post_cmds.append(cmd)

            # (commented-out experiment) #"xt download before/code --job={} --unzip "
            # wrap the user command once; the generated script is shared by
            # every box in this job
            fn_wrapped = super().wrap_user_command(cmd_parts, snapshot_dir,
                store_data_dir, data_action, data_writable, store_model_dir,
                model_action, model_writable, storage_name, storage_key,
                actions, is_windows=is_windows, sudo_available=False,
                pip_freeze=False, setup=setup, post_setup_cmds=post_cmds,
                args=args, nonempty=True)

        # we update each box's command
        script_part = "{} {} {}".format(os.path.basename(fn_wrapped), node_id,
            run_name)

        # NOTE(review): this tests self.is_windows while the per-box
        # is_windows (from box_info.box_os) is used above — confirm the
        # backend-level flag is intended here
        if self.is_windows:
            sh_parts = [script_part]
        else:
            sh_parts = ['/bin/bash', '--login', script_part]

        run_specs["cmd_parts"] = sh_parts