def get_status(self, fn_entry):
    '''Return the PSM status of the given queue entry: "queued", "running", or "completed".'''
    fn_queue_entry = file_utils.path_join(self.psm_queue_path, fn_entry, for_windows=False)
    ssh_cmd = "ls -lt " + fn_queue_entry
    result = self.run_cmd(ssh_cmd)

    # still sitting in the queue directory?
    if result and fn_entry in result:
        return "queued"

    text = self.get_running_entry_name()
    if text != fn_entry:
        console.print("PSM current job:", text)
        return "completed"

    # entry might be running; is the runner script OR controller active?
    if self._get_runner_script_process_id() or self._get_controller_process_id():
        return "running"

    console.print("--> runner script and controller processes not running")
    return "completed"
def poll_for_tensorboard_files(self, last_changed, blob_path, start_index, tb_path, run_name):
    '''Download any new or changed tensorboard event files found under the run's
    blob directory; returns the number of files downloaded. Tracks per-blob
    last-modified timestamps in the caller-owned dict *last_changed*.'''
    # get all blobs in the run's output dir
    blobs = self.store.list_blobs(self.ws_name, blob_path, return_names=False)
    download_count = 0

    for blob in blobs:
        # only tensorboard event files are of interest
        basename = os.path.basename(blob.name)
        if not basename.startswith("events.out.tfevents"):
            continue

        # get interesting part of blob's path (after run_name/)
        bn = blob.name[start_index:]
        modified = blob.properties.last_modified

        # skip blobs we have already seen with this timestamp
        if bn in last_changed and last_changed[bn] == modified:
            continue
        last_changed[bn] = modified

        if "{logdir}" in tb_path:
            # extract parent dir of blob
            test_train_node = os.path.basename(os.path.dirname(blob.name))
            console.print("tb_path=", tb_path, ", test_train_node=", test_train_node, ", basename=", basename)

            # apply to remaining template
            tb_path_full = tb_path.format(**{"logdir": test_train_node})
            local_fn = file_utils.path_join(tb_path_full, basename)
        else:
            local_fn = tb_path

        local_fn = os.path.join("logs", local_fn)
        console.print("our local_fn=", local_fn)

        # download the new/changed blob; a failure on one blob should not
        # abort the polling loop, so log and continue
        try:
            console.print("downloading bn={}, local_fn={}".format(bn, local_fn))
            file_utils.ensure_dir_exists(file=local_fn)
            self.store.download_file_from_run(self.ws_name, run_name, bn, local_fn)
            download_count += 1

            if self.print_progress:
                console.print("d", end="", flush=True)
        except BaseException as ex:
            logger.exception("Error in download_file_from_run, from tensorboard_reader, ex={}".format(ex))

    return download_count
def get_running_entry_name(self):
    '''Read the PSM's CURRENT_RUNNING_ENTRY file and return its stripped text contents.'''
    fn_current = file_utils.path_join(self.controller_cwd, constants.CURRENT_RUNNING_ENTRY,
        for_windows=False)

    raw_bytes = self.read_file(fn_current, 0, None)
    # decode and remove trailing newline/spaces
    return raw_bytes.decode().strip()
def enqueue(self, team, job, run, node, fn_zip):
    '''Copy *fn_zip* to the box's PSM queue directory under a unique, timestamped
    entry name and return that entry name (with a ".zip" extension).

    The file is first uploaded with a ".tmp" extension and then renamed, so the
    PSM never sees a partially-written file.
    '''
    # NOTE: a previously-computed uuid4 guid was never used; removed as dead code
    ticks = time.time()

    # copy with ".tmp" extension so PSM doesn't see partial file
    fn_tmp_entry = "{}.{}.{}.{}.{}.tmp".format(team, job, run, node, int(10*ticks))
    fn_tmp_dest = file_utils.path_join(self.psm_queue_path, fn_tmp_entry, for_windows=False)

    fn_entry = os.path.splitext(fn_tmp_entry)[0] + ".zip"
    fn_dest = file_utils.path_join(self.psm_queue_path, fn_entry, for_windows=False)

    # don't do a confirm since PSM can remove it from queue instantly
    self.ftp_client.put(fn_zip, fn_tmp_dest)

    # now, rename the file so that PSM can see it
    self.ftp_client.rename(fn_tmp_dest, fn_dest)

    return fn_entry
def run_job_on_box(self, job_id, run_data_list, box_index, box_info, app_info, pool_info, resume_name=None,
        repeat=None, using_hp=None, exper_name=None, snapshot_dir=None, args=None):
    '''Submit the job's code zip to the target box through its PSM (local or remote)
    and return a service_node_info dict describing the submission.'''
    box_addr = box_info.address
    box_os = box_info.box_os
    is_box_windows = (box_os == "windows")

    run_name = run_data_list[0]["run_name"]

    # pick a local or remote PSM client depending on where the box lives
    if pc_utils.is_localhost(box_addr=box_addr):
        psm_client = LocalPsmClient()
    else:
        psm_client = RemotePsmClient(box_addr, is_box_windows)

    psm_client.restart_psm_if_needed()
    #print("psm created for box: " + box_addr)

    team = self.config.get("general", "xt-team-name")
    node_id = utils.node_id(box_index)

    cwd_dir = os.path.expanduser(constants.CWD_DIR)
    fn_src_zip = file_utils.path_join(cwd_dir, constants.CODE_ZIP_FN)

    fn_entry = psm_client.enqueue(team, job_id, run_name, node_id, fn_src_zip)

    service_node_info = {
        "fn_entry": fn_entry,
        "box_addr": box_addr,
        "box_os": box_os,
        "box_name": box_info.box_name,
        "job_id": job_id,
        "run_name": run_name,
    }

    fb.feedback("submitted", is_final=True)
    return service_node_info
def get_controller_cwd(is_windows, is_local=False):
    '''Return the controller's working-directory path for the target OS.

    When *is_local* is True the "~" prefix is expanded (only safe on the
    local machine).
    '''
    if is_windows:
        # we only support windows as a local machine
        #cwd = os.path.expanduser("~/.xt/cwd")
        # docker has problems mapping paths to user home directories (~/)
        # controller app has problems copying/deleting files in 'programdata' folder
        # so, for windows, we use this:
        sys_drive = os.getenv("SystemDrive")
        cwd = file_utils.path_join(sys_drive + "/xt", "cwd")
    else:
        cwd = "~/.xt/cwd"

    # only safe to expand if local
    return os.path.expanduser(cwd) if is_local else cwd
def upload_sweep_data(self, sweeps_text, exper_name, job_id, args):
    ''' we have extracted/parsed HP sweeps data; write it to the experiment/job
    store where we can find it during dynamic HP searches (running in controller). '''
    # upload SWEEP file to job or experiment directory
    fn_sweeps = args["hp_config"]
    agg_dest = args["aggregate_dest"]

    if not fn_sweeps:
        # must have extracted sweeps data from cmd line options
        fn_sweeps = constants.HP_CONFIG_FN
        args["hp_config"] = fn_sweeps

    # upload to a known folder name (since value of fn_sweeps can vary) and we need to find it later (HX usage)
    target_name = file_utils.path_join(constants.HP_CONFIG_DIR, os.path.basename(fn_sweeps))

    if agg_dest == "experiment":
        # FIX: 'workspace' was an undefined name here (raised NameError on this
        # branch); take it from args, consistent with get_client_context
        workspace = args["workspace"]
        self.store.create_experiment_file(workspace, exper_name, target_name, sweeps_text)
    else:
        self.store.create_job_file(job_id, target_name, sweeps_text)
def restart_psm_if_needed(self):
    ''' processing:
        - if PSM is running on an old/stale psm.py, kill the process and restart it.
        - if PSM is not running, start it.
    '''
    kill_needed = False
    start_needed = False

    # local copy of psm.py (next to this file) and its destination path on the box
    fn_src = os.path.join(file_utils.get_my_file_dir(__file__), constants.PSM)
    fn_dest = file_utils.path_join(self.xt_path, constants.PSM, for_windows=self.box_is_windows)

    running = bool(self._get_psm_process_id())
    #print("PSM running=", running)

    if running:
        # do file contents match?
        text_src = file_utils.read_text_file(fn_src)
        text_dest = ""

        if self.remote_file_exists(fn_dest):
            # read text of fn_dest on remote box
            with self.ftp_client.open(fn_dest, "rb") as infile:
                bytes_dest = infile.read()
                text_dest = bytes_dest.decode()

        # normalize NEWLINE chars before comparison
        # (ftp_client seems to add CR when called frm windows)
        text_src = text_src.replace("\r\n", "\n")
        text_dest = text_dest.replace("\r\n", "\n")

        # remote copy is stale -> restart PSM with the new script
        if text_src != text_dest:
            kill_needed = True
    else:
        start_needed = True

    if kill_needed:
        p = self._get_psm_process_id()
        ssh_cmd = "kill -kill {}".format(p)
        self.run_cmd(ssh_cmd)
        start_needed = True

    if start_needed:
        # create required dirs
        self._make_dir(self.psm_queue_path)
        self._make_dir(self.cwd_path)

        # copy psm.py
        # caution: node slashes in fn_dest must all match box's OS style
        fn_dest = file_utils.fix_slashes(fn_dest, is_linux=True)
        status = self.ftp_client.put(fn_src, fn_dest)

        # run psm (redirecting its output to a log file on the box)
        fn_log = os.path.join(self.xt_path, constants.PSMLOG)

        if self.box_is_windows:
            cmd_parts = ["cmd", "/c", "python -u {} > {}".format(fn_dest, fn_log)]
            cmd = " ".join(cmd_parts)
        else:
            fn_log = file_utils.fix_slashes(fn_log, is_linux=True)
            # nohup + background so PSM survives the ssh session
            cmd = 'nohup bash --login -c "python -u {}" </dev/null > {} 2>&1 &'.format(fn_dest, fn_log)

        #print("cmd=", cmd)
        #process_utils.sync_run_ssh(self, self.box_addr, cmd)
        self.run_cmd(cmd)

        for i in range(20):
            # don't return until PSM is running (poll up to ~10 seconds)
            running = bool(self._get_psm_process_id())
            if running:
                break
            time.sleep(.5)

        if not running:
            errors.general_error("Could not start remote PSM on box={}".format(self.box_addr))
def get_client_context(self, exper_name, run_name, app_info, box_info, job_id, node_index, run_specs,
        resume_name=None, using_hp=False, repeat=None, args=None):
    ''' this function gathers up all of the job-level context needed to run
    the job on the specified node (node_index); returns a Bag of settings
    consumed by the controller. '''
    config = self.config
    cmd_parts = run_specs["cmd_parts"]

    workspace = args['workspace']
    working_dir = args['working_dir']

    context = Bag()
    context.ws = workspace
    context.working_dir = working_dir
    context.exper_name = exper_name
    context.run_name = run_name
    context.job_id = job_id
    context.sku = args["sku"]
    context.app_name = app_info.app_name if app_info else None
    context.box = args["box"]
    context.from_ip = pc_utils.get_ip_address()
    context.from_host = pc_utils.get_hostname()
    context.box_name = box_info.box_name
    context.target_file, _, _ = self.get_target(cmd_parts)
    context.resume_name = resume_name
    context.generated_sweep_text = None    # will be conditionally set in controller
    context.pool = args["pool"]
    context.node_index = node_index
    context.compute = args["target"]
    context.service_type = args["service_type"]

    # provide all provider info to controller
    context.providers = config.get("providers")

    #context.run_specs = run_specs
    context.cmd_parts = cmd_parts
    context.xt_cmd = args["xt_cmd"]    # log our full cmd to support correct rerun's
    context.run_script = run_specs["run_script"]
    context.parent_script = run_specs["parent_script"]

    # for helping docker login to user's Azure Container Registry
    # (fixed: compare to None with 'is not', not '!=')
    is_docker = (args["docker"] is not None)
    # if cmd_parts:
    #     is_docker = (cmd_parts[0] == "docker") or (cmd_parts[0] == "sudo" and cmd_parts[1] == "docker")

    #registry = config.get("environment", "registry", suppress_warning=True)
    registry = None
    compute_def = args["compute_def"]
    if compute_def and "docker" in compute_def:
        docker_name = compute_def["docker"]
        docker_def = self.config.get_docker_def(docker_name)
        if docker_def and "registry" in docker_def:
            registry = docker_def["registry"]

    if registry:
        registry_creds = config.get("external-services", registry)
        needs_login = is_docker and utils.safe_value(registry_creds, "login")
        login_server = utils.safe_value(registry_creds, "login-server")
        username = utils.safe_value(registry_creds, "username")
        password = utils.safe_value(registry_creds, "password")
    else:
        needs_login = False
        login_server = None
        username = None
        password = None

    context.docker_login = needs_login
    context.docker_server = login_server
    context.docker_username = username
    context.docker_password = password

    context.username = self.config.get("general", "username")

    setup = self.config.get_setup_from_target_def(compute_def)
    activate_cmd = utils.safe_value(setup, "activate")
    context.activate_cmd = activate_cmd

    # config info
    #box_os = self.get_box_os(box_name)
    # (removed unused local: box_os = box_info.box_os)

    after_files_list = args["after_dirs"]
    after_files_list = utils.parse_list_option_value(after_files_list)
    context.after_files_list = after_files_list

    after_omit_list = args["after_omit"]
    after_omit_list = utils.parse_list_option_value(after_omit_list)
    context.after_omit_list = after_omit_list

    context.primary_metric = args["primary_metric"]
    context.maximize_metric = args["maximize_metric"]
    context.report_rollup = args["report_rollup"]

    context.after_upload = args["after_upload"]
    #context.scrape = config.get("general", "scrape")
    context.log = args["log"]

    # PARENT/CHILD info
    context.repeat = repeat
    context.repeats_remaining = None    # will be set in controller
    context.total_run_count = args["total_run_count"]
    context.search_style = args["search_style"]
    context.is_parent = context.search_style != "single"

    # HPARAM search
    hp_config = args["hp_config"]
    if hp_config:
        # normalize to the known upload location (see upload_sweep_data)
        hp_config = file_utils.path_join(constants.HP_CONFIG_DIR, os.path.basename(hp_config))
    context.hp_config = hp_config

    context.fn_generated_config = args["fn_generated_config"]
    context.using_hp = using_hp
    context.search_type = args["search_type"]
    context.option_prefix = args["option_prefix"]
    context.restart = False
    context.concurrent = args["concurrent"]
    context.xtlib_capture = args["xtlib_upload"]

    # for mirroring files to grok server or storage
    context.mirror_dest = args["mirror_dest"]
    context.mirror_files = args["mirror_files"]
    context.grok_server = None    # args["grok_server"]

    context.aggregate_dest = args["aggregate_dest"]
    context.dest_name = exper_name if context.aggregate_dest == "experiment" else job_id

    store_creds = self.config.get_storage_creds()
    context.store_creds = store_creds
    context.store_code_path = config.get_storage_provider_code_path(store_creds)

    mongo_creds, mongo_name = self.config.get_mongo_creds()
    context.mongo_conn_str = mongo_creds["mongo-connection-string"]

    context.shell_launch_prefix = box_info.shell_launch_prefix

    #console.print("context=", context)
    return context