def get_box_run_status_inner(self, box_name, ws=None, run_name=None, stage_flags=""):
    '''
    build a text report describing the controller on `box_name` and its runs.

    box_name: name of the box to report on (looked up with box_information.get_box_addr).
    ws, run_name, stage_flags: passed through to self.client.jobs_report();
        stage_flags is also used in the report's header line.

    returns the report as a single string.  when the controller is not running,
    the report only says so; otherwise the client is switched to the box and the
    box status plus the jobs report are included.
    '''
    addr_info = box_information.get_box_addr(self.config, box_name, self.store)
    address = addr_info["box_addr"]
    port = addr_info["controller_port"]

    if not self.client.is_controller_running(box_name, address, port):
        # nothing more can be queried without a controller
        return "box: " + box_name + "\n" + " controller is NOT running\n"

    # make this box the client's current box before querying it
    self.client.change_box(box_name, port=port)

    status_section = "\n" + self.get_box_status(box_name=box_name) + "\n"
    runs_header = "\n" + stage_flags + " runs on " + box_name.upper() + ":\n"
    jobs_section = self.client.jobs_report(ws=ws, run_name=run_name, stage_flags=stage_flags)

    return status_section + runs_header + jobs_section
def addr(self, box):
    '''
    print the address of the specified box.

    box: name of the box to look up (passed to box_information.get_box_addr).

    when the box has a controller port, the controller port and the
    tensorboard port are printed along with the address; otherwise only
    the address is printed.  returns None.
    '''
    box_name = box
    info = box_information.get_box_addr(self.config, box_name, self.store)
    box_addr = info["box_addr"]
    controller_port = info["controller_port"]
    tb_port = info["tensorboard_port"]

    if controller_port:
        # the format string previously had no placeholder for tb_port, so the
        # tensorboard port was silently dropped from the message
        console.print(
            "{} address: {}, controller port={}, tensorboard port={}".format(
                box_name, box_addr, controller_port, tb_port))
    else:
        console.print("{} address: {}".format(box_name, box_addr))
def keysend(self, box):
    '''
    send the public key to a remote box (syntax: xt keysend <box name>).

    box: name or address of the remote box.

    a syntax error is reported through errors.syntax_error() when no box is
    given, or when the box resolves to the local machine or is "azure-batch".
    prints a confirmation when self.core.keysend() returns a truthy status.
    '''
    if not box:
        # presumably errors.syntax_error() raises and does not return - TODO confirm
        errors.syntax_error("must specify a box name/address")

    addr_info = box_information.get_box_addr(self.config, box, self.store)
    targets_local_machine = pc_utils.is_localhost(box, addr_info["box_addr"])

    if targets_local_machine or box == "azure-batch":
        errors.syntax_error(
            "must specify a remote box name or address (e.g., xt keysend [email protected]"
        )

    console.print(
        "this will require 2 connections to the remote host, so you will be prompted for a password twice"
    )

    if self.core.keysend(box):
        console.print("public key successfully sent.")
def scp(self, cmd):
    '''
    run an scp command after replacing "boxname:path" arguments with "boxaddr:path".

    cmd: the scp arguments as a single space-separated string.

    each argument containing exactly one ":" whose prefix is longer than one
    character is treated as a box reference; its prefix is replaced by the
    box address returned by box_information.get_box_addr() (when non-empty).
    prints the scp output, or a completion message when there is no output.
    '''
    tokens = cmd.split(" ")

    for index, token in enumerate(tokens):
        if ":" not in token:
            continue

        # drop one pair of matching surrounding quotes (double or single)
        for quote in ('"', "'"):
            if token.startswith(quote) and token.endswith(quote):
                token = token[1:-1]
                break

        pieces = token.split(":")
        if len(pieces) != 2 or len(pieces[0]) <= 1:
            # doesn't look like a box name
            # (the 1-char exclusion presumably skips drive letters like "c:" - TODO confirm)
            continue

        candidate_box, path_part = pieces
        addr_info = box_information.get_box_addr(self.config, candidate_box, self.store)
        resolved_addr = addr_info["box_addr"]

        if resolved_addr:
            tokens[index] = resolved_addr + ":" + path_part

    # consecutive spaces in cmd produce empty tokens; discard them
    tokens = list(filter(None, tokens))

    _, output = process_utils.run_scp_cmd(self, tokens, report_error=True)
    console.print(output if output else "SCP command completed")
def connect_to_controller(self, box_name=None, ip_addr=None, port=None):
    '''
    establish communication with the XT controller process on the specified box.

    box_name: name of the box; when ip_addr is not given, it is used to look up
        the box address, controller port and box secret (stored in self.token).
    ip_addr: when given, it is used as-is together with `port` (no lookup is done).
    port: controller port to connect to; replaced by the looked-up port (or
        constants.CONTROLLER_PORT) when ip_addr is not given.

    return True if connection established, False otherwise.
    '''
    console.diag("init_controler: box_name={}".format(box_name))

    # NOTE(review): this compares self.conn with a box name, yet self.conn is
    # handed to rpyc.BgServingThread below - confirm this check is as intended
    if self.conn == box_name:
        return True

    if not ip_addr:
        info = box_information.get_box_addr(self.config, box_name, self.store)
        box_addr = info["box_addr"]
        controller_port = info["controller_port"]
        self.token = info["box_secret"]

        ip_addr = self.core.get_ip_addr_from_box_addr(box_addr)
        port = controller_port if controller_port else constants.CONTROLLER_PORT

    # the controller should now be running - try to connect
    try:
        console.diag(" connecting to controller")
        self.connect(box_name, ip_addr, port=port)
        console.diag(" connection successful!")

        # magic step: allows our callback to work correctly!
        # this must always be executed (even if self.conn is already true)
        rpyc.BgServingThread(self.conn)
        console.diag(" now running BgServingThread")
    except Exception as ex:
        # best-effort connect: most common reasons for failure are that the
        # controller is not yet running (backend service) or has finished running.
        # only Exception is caught so Ctrl-C / SystemExit are no longer swallowed,
        # and the reason is surfaced at diag level instead of being discarded.
        console.diag(" connection failed: {}".format(ex))
        return False

    return True
def connect_to_box_for_run(self, ws_name, run_name):
    '''
    connect to the controller of the box on which the specified run lives.

    ws_name, run_name: identify the run; passed to self.get_run_info().

    returns the tuple (state, connected, box_name, job_id):
        state: always None (this method never computes a box state).
        connected: result of self.connect_to_controller().
        box_name: "<box_addr>:<port>" when a controller port is known
            (from the box info, else self.port); otherwise the run's box name.
        job_id: as returned by self.get_run_info().
    '''
    # kept in the return value for callers; it was never assigned anything but
    # None, so the former 'state == "deallocated"' branch could not execute
    state = None

    box_name, job_id, node_index = self.get_run_info(ws_name, run_name)

    info = box_information.get_box_addr(self.config, box_name, self.store)
    ip_addr = info["box_addr"]

    # fall back to this client's own port when the box has none configured
    controller_port = info["controller_port"] or self.port

    if controller_port:
        connected = self.connect_to_controller(ip_addr=ip_addr, port=controller_port)
        box_name = ip_addr + ":" + str(controller_port)
    else:
        connected = self.connect_to_controller(box_name=box_name)

    return state, connected, box_name, job_id
def get_tensorboard_status(self, ws_name, run_name, box_name):
    '''
    get the tensorboard status from the controller and add box info to it.

    ws_name, run_name: when both are given, the connection is made to the
        box of that run; otherwise the client switches to `box_name`.
    box_name: name of the box; when empty, it is taken from the run's info.

    returns the status object from the controller (assumed to be dict-like,
    since keys are assigned on it) with "box_name", "ip_addr" and
    "tensorboard_port" added.
    '''
    if ws_name and run_name:
        # connect_to_box_for_run() takes (ws_name, run_name); the workspace
        # argument was previously missing here
        self.connect_to_box_for_run(ws_name, run_name)
    else:
        self.change_box(box_name)

    # get running status from controller
    status = self.conn.root.get_tensorboard_status(self.token)

    # add other info to status
    if not box_name:
        box_name, job_id, node_index = self.get_run_info(ws_name, run_name)

    info = box_information.get_box_addr(self.config, box_name, self.store)
    # ip_addr was previously referenced without being defined (NameError);
    # use the box address, as connect_to_box_for_run() does
    ip_addr = info["box_addr"]
    tensorboard_port = info["tensorboard_port"]

    status["box_name"] = box_name
    status["ip_addr"] = ip_addr
    status["tensorboard_port"] = tensorboard_port if tensorboard_port else constants.TENSORBOARD_PORT

    return status
def cancel_controller(self, box_name, os_call_only=False):
    '''
    stop the controller on the specified box.

    box_name: name of the box whose controller should be stopped.
    os_call_only: when True, skip the shutdown request and go straight to
        self.cancel_thru_os().

    unless os_call_only is set, a shutdown request is sent to the controller if
    it is found running.  when no shutdown request was sent, self.cancel_thru_os()
    is used instead.  any exception raised while requesting the shutdown is
    printed and then re-raised (so the OS fallback is not reached in that case).
    '''
    shutdown_requested = False

    if not os_call_only:
        # first choice: ask the controller to shut itself down
        try:
            self.ensure_token_is_set()

            addr_info = box_information.get_box_addr(self.config, box_name, self.store)
            # NOTE(review): no controller port is passed here, unlike other callers
            # of is_controller_running() in this file - confirm the default is right
            if self.is_controller_running(box_name, addr_info["box_addr"]):
                self.conn.root.shutdown(self.token)
                shutdown_requested = True
        except BaseException as ex:
            console.print("shutdown request result: ex={}".format(ex))
            raise ex

    if shutdown_requested:
        return

    # no shutdown request was sent: kill the process if local or PEER
    self.cancel_thru_os(box_name)
def ssh(self, name, cmd, workspace, output):
    '''
    run an ssh command on the box called `name`.

    name: a box name; names starting with "run" are treated as run names,
        for which nothing is currently done.
    cmd: the command to run; output is captured only when cmd is non-empty.
    workspace: not used by the active code.
    output: optional file path; when given, the ssh output is captured as
        bytes and written to that file, otherwise captured output is printed.

    returns None.
    '''
    want_capture = bool(cmd)

    if name.startswith("run"):
        # assume it's a RUN name: the former implementation (built on the
        # Philly backend) is disabled, so this path is a no-op
        return

    # assume it's a BOX name
    addr_info = box_information.get_box_addr(self.config, name, self.store)
    target_addr = addr_info["box_addr"]

    _, ssh_output = process_utils.sync_run_ssh(
        self, target_addr, cmd,
        capture_output=want_capture,
        capture_as_bytes=bool(output))

    if output:
        # write as bytes
        with open(output, "wb") as out_file:
            out_file.write(ssh_output)
    elif want_capture:
        console.print(ssh_output)