def test_impl(self, search_type):
    '''
    run the search test against each YAML config file that lives next to this test module.

    search_type: passed through unchanged to self.test_from_config().
    '''
    test_dir = os.path.realpath(os.path.dirname(__file__))

    # the configs are processed in this fixed order
    for config_name in ("miniSweeps.yaml", "hp_search.yaml"):
        fn_config = os.path.join(test_dir, config_name)
        config_text = file_utils.read_text_file(fn_config)
        self.test_from_config(fn_config, config_text, search_type)
def restart_psm_if_needed(self):
    '''
    processing:
        - if PSM is running on an old psm.py, kill the process and restart it.
        - if PSM is running on the current psm.py, do nothing.
        - if PSM is not running, start it.
    '''
    psm_src = os.path.join(file_utils.get_my_file_dir(__file__), constants.PSM)
    psm_dest = os.path.join(self.cwd_path, constants.PSM)

    if self._is_psm_running():
        # compare our psm.py with the copy in the cwd dir (a missing copy counts as a mismatch)
        src_text = file_utils.read_text_file(psm_src)
        dest_text = None
        if os.path.exists(psm_dest):
            dest_text = file_utils.read_text_file(psm_dest)

        if src_text == dest_text:
            # running PSM is already current; nothing to do
            return

        # stale PSM: stop it, then fall thru to the start code below
        stale_process = self._get_psm_process()
        stale_process.kill()

    # always copy psm.py (for xt dev/debug purposes)
    shutil.copyfile(psm_src, psm_dest)

    # build the shell command that runs PSM with its output redirected to the PSM log
    psm_log = os.path.join(self.cwd_path, constants.PSMLOG)
    shell_prefix = ["cmd", "/c"] if self.box_is_windows else ["bash", "-c"]
    launch_parts = shell_prefix + ["python -u {} > {}".format(psm_dest, psm_log)]

    fn_psm_log = os.path.expanduser("~/.xt/cwd/runpsm.log")
    process_utils.start_async_run_detached(launch_parts, self.cwd_path, fn_psm_log)
def read_user_multi_commands(self, using_hp, run_script, cmd_parts, args):
    '''
    build the list of commands for a multi-command run.

    commands come from the "commands" entry of the config file when present;
    otherwise, if args["multi_commands"] is set, they are read from the file
    named by args["script"].

    args:
        using_hp: True when a hyperparameter search was requested (cannot be
            combined with multi-commands; reported thru errors.combo_error).
        run_script, cmd_parts: not used by this method.
        args: the command's args dict; args["multi_commands"] is set to True
            when the commands come from the config file.

    returns:
        a list with self.fixup_script_in_cmd() applied to each command line,
        or None when multi-commands are not in effect.
    '''
    cmds = None

    lines = self.config.get("commands")
    if lines:
        # commands specified in the config file
        args["multi_commands"] = True
        multi_commands = True
    else:
        # did user specify --multi-commands
        multi_commands = args["multi_commands"]

    if multi_commands:
        if using_hp:
            errors.combo_error("Cannot specify both -multi-commands and hyperparameter search")

        # read MULTI CMDS
        if not lines:
            fn_cmds = args["script"]
            lines = file_utils.read_text_file(fn_cmds, as_lines=True)

        # strip each line first, then drop blank and comment lines; testing the
        # UNSTRIPPED line let whitespace-only lines thru as empty commands
        stripped_lines = (line.strip() for line in lines if line)
        lines = [line for line in stripped_lines if line and not line.startswith("#")]

        cmds = [self.fixup_script_in_cmd(line) for line in lines]

    return cmds
def help_topics(self, topic, browse, prefix="topics", title="help topics"):
    '''
    list the available help topics, or print the text of a single topic.

    topic: name of the topic to print (matched in lower case); when empty, the
        sorted list of available topic names is printed instead.
    browse: not used by this method.
    prefix: subdirectory of xtlib's "help_topics" dir that holds the topic files.
    title: label used in the "available ...:" heading.
    '''
    # topic files live in xtlib/help_topics/<prefix>
    topics_dir = os.path.join(file_utils.get_xtlib_dir(), "help_topics", prefix)
    if not os.path.isdir(topics_dir):
        errors.env_error("Missing help topics dir: {}".format(topics_dir))

    filenames, _ = file_utils.get_local_filenames(topics_dir)

    # map each topic name (root name of its file) to the file
    files_by_topic = {}
    for fn in filenames:
        files_by_topic[file_utils.root_name(fn)] = fn

    if topic:
        # print a specific topic
        wanted = topic.lower()
        if wanted not in files_by_topic:
            errors.general_error(
                "help topic not found: {}".format(wanted))

        print(file_utils.read_text_file(files_by_topic[wanted]))
        return

    # no topic given: show what is available
    console.print("available {}:".format(title))
    for topic_name in sorted(files_by_topic):
        console.print(" {}".format(topic_name))

    console.print()
    console.print(
        "To display a help topic, use 'xt help topic <topic name>'")
def compare_log_files(self, fn_approved, fnx):
    '''
    compare an approved file against a newly produced file.

    we mask and compare at file level.  if errors, we drill down to
    property level for easier debugging.  only file compare counts should
    be counted as tests.

    fn_approved: path of the approved (expected) file.
    fnx: path of the file produced by the run under test.

    side effects: adjusts self.file_compare_errors; the detail compares
    (self.compare_json_values / self.compare_text) are expected to record
    their own asserts in self.assert_count.
    '''
    name = os.path.basename(fn_approved)

    approved_text = file_utils.read_text_file(fn_approved)
    texta = self.mask_out_regular_changes(approved_text)

    tested_text = file_utils.read_text_file(fnx)
    textx = self.mask_out_regular_changes(tested_text)

    if texta != textx:
        # files did not match after masking, but property level matching may work
        self.file_compare_errors += 1
        before_asserts = self.assert_count

        # compare detail; we should see an assert on a smaller property
        file_ext = os.path.splitext(fnx)[1]
        if texta.startswith("{") and file_ext in [".json", ".log"]:
            # debug
            # NOTE(review): these debug copies are written to the current directory
            print("loading JSON: fn_approved={}, fnx={}".format(fn_approved, fnx))
            file_utils.write_text_file("texta.json", texta)
            file_utils.write_text_file("textx.json", textx)

            dda = json.loads(texta)
            ddx = json.loads(textx)
            self.compare_json_values(name, dda, ddx)
        else:
            # re-mask BOTH sides with the wrapper rules before the text compare
            # (the tested side used to be assigned to a misspelled, unused name,
            # so the two sides were compared under different masks)
            texta = self.mask_out_regular_changes_from_wrapper(approved_text)
            textx = self.mask_out_regular_changes_from_wrapper(tested_text)
            self.compare_text(name, texta, textx)

        if self.assert_count == before_asserts:
            # no new asserts, so undo our file error count
            # this happens because some failed file compares
            # succeed when compared at property level
            self.file_compare_errors -= 1
        elif STOP_ON_FIRST_ERROR:
            self._assert(False)
def get_secret(name):
    '''
    look up a secret by name in the JSON secrets file (FN_SECRETS).

    returns the stored value, or None when the file or the name is missing.
    '''
    name = correct_name(name)

    if os.path.exists(FN_SECRETS):
        secrets = json.loads(file_utils.read_text_file(FN_SECRETS))
    else:
        secrets = {}

    value = None
    if name in secrets:
        value = secrets[name]

    # NOTE(review): the secret's value is included in the diag output
    console.diag("get_secret: name={}, value={}".format(name, value))
    return value
def get_running_entry_name(self):
    '''
    return the stripped text of the "current running entry" file found in the
    local controller's cwd dir, or None when that file does not exist.
    '''
    controller_cwd = utils.get_controller_cwd(self.box_is_windows, is_local=True)
    fn_current = os.path.join(controller_cwd, constants.CURRENT_RUNNING_ENTRY)

    if not os.path.exists(fn_current):
        return None

    return file_utils.read_text_file(fn_current).strip()
def _load_grok_creds(self): fn_keys = "keys.bin" loaded = False if not os.path.exists(fn_keys): fn_keys = os.path.expanduser("~/.xt/teams/{}/keys.bin".format( self.team_name)) if os.path.exists(fn_keys): # GROK server creds creds = file_utils.read_text_file(fn_keys) self.apply_creds(creds) fn_cert = os.path.join(os.path.dirname(fn_keys), "xt_cert.pem") if os.path.exists(fn_cert): cert = file_utils.read_text_file(fn_keys) self.keys["xt_server_cert"] = cert console.diag("init_creds: using grok server 'keys.bin' file") loaded = True return loaded
def import_job_mongo_document(self, fn_mongo_job, new_workspace, prev_job_id, new_job_id):
    '''
    load an exported mongo job document from a JSON file, re-key it to the
    new job id and workspace, and write it to mongo.

    fn_mongo_job: path of the exported job JSON file.
    new_workspace: workspace name stored in the document's "ws_name".
    prev_job_id: original job id; the document's "job_num" is derived from it.
    new_job_id: stored as both "_id" and "job_id".
    '''
    job = json.loads(file_utils.read_text_file(fn_mongo_job))

    # re-key the document for its new home
    job.update({
        "_id": new_job_id,
        "job_id": new_job_id,
        "job_num": job_helper.get_job_number(prev_job_id),
        "ws_name": new_workspace,
    })

    # add to mongo
    self.store.mongo.update_job_info(new_job_id, job)
def import_run_mongo_document(self, mongo_run_fn, workspace, new_workspace, prev_job_id, new_job_id, run_name):
    '''
    load an exported mongo run document from a JSON file, point it at the new
    job id and workspace, and write it to mongo.

    workspace and prev_job_id are not used by this method.

    returns the document's "end_id" as fetched by utils.safe_value().
    '''
    run = json.loads(file_utils.read_text_file(mongo_run_fn))

    # re-home the run document
    run.update({"job_id": new_job_id, "ws": new_workspace})

    # add to mongo
    self.store.mongo.update_run_info(new_workspace, run_name, run)

    return utils.safe_value(run, "end_id")
def set_secret(name, value):
    '''
    store a secret under the specified name in the JSON secrets file
    (FN_SECRETS), keeping any secrets already in the file.
    '''
    name = correct_name(name)

    # NOTE(review): the secret's value is included in the diag output
    console.diag("set_secret: name={}, value={}".format(name, value))

    file_utils.ensure_dir_exists(file=FN_SECRETS)

    # start from the existing secrets, if any
    if os.path.exists(FN_SECRETS):
        secrets = json.loads(file_utils.read_text_file(FN_SECRETS))
    else:
        secrets = {}

    secrets[name] = value

    # write the updated secrets back
    file_utils.write_text_file(FN_SECRETS, json.dumps(secrets))
def __init__(self, config=None, store=None, xt_logging=True, aml_logging=True, checkpoints_enabled=True,
             tensorboard_path=None, supress_normal_output=False):
    '''
    this initializes an XT Run object so that ML apps can use XT services
    from within their app, including:
        - hyperparameter logging
        - metrics logging
        - uploading files to an XT share
        - downloading files from an XT share
        - checkpoint support
        - explicit HP search calls

    args:
        config: XT config object; when omitted and no store creds are found in
            the environment, the merged config is loaded.
        store: not used by this method (self.store is always built here).
        xt_logging: enables XT logging; only takes effect when XT_RUN_NAME is set.
        aml_logging: stored as self.aml_logging.
        checkpoints_enabled: stored as self.checkpoints_enabled.
        tensorboard_path: when set, the tensorboard workaround is applied and
            self.init_tensorboard() is called.
        supress_normal_output: when True, the run-context status messages are
            not printed.

    run identity and credentials are read from the XT_* environment variables.

    note: Azure ML child runs seem to get their env variables inherited
    from their parent run correctly, so we do not need to use the parent run
    for info.
    '''
    self.store = None
    self.xt_logging = False
    self.metric_report_count = 0
    self.metric_names = OrderedDict()
    self.supress_normal_output = supress_normal_output

    # tensorboard writers
    self.train_writer = None
    self.test_writer = None

    # 2nd set of writers for Philly
    self.train_writer2 = None
    self.test_writer2 = None

    self.tensorboard_path = tensorboard_path
    if self.tensorboard_path:
        # TENSORBOARD WORKAROUND: this code causes tensorboard files to be closed when they are appended to
        # this allows us to just write the files to MOUNTED output dir (and not have to mirror them)
        try:
            from tensorboard.compat import tf
            delattr(tf.io.gfile.LocalFileSystem, 'append')
        except Exception:
            # fall back to patching the gfile stub onto real tensorflow
            # (catch Exception, not everything, so ctrl-c / SystemExit still propagate)
            import tensorflow as tf
            import tensorboard as tb
            tf.io.gfile = tb.compat.tensorflow_stub.io.gfile
            delattr(tf.io.gfile.LocalFileSystem, 'append')

        self.init_tensorboard()

    self.ws_name = os.getenv("XT_WORKSPACE_NAME", None)
    self.exper_name = os.getenv("XT_EXPERIMENT_NAME", None)
    self.run_name = os.getenv("XT_RUN_NAME", None)
    self.resume_name = os.getenv("XT_RESUME_NAME")

    # load context, if present
    self.context = None

    if self.run_name:
        fn_context = os.path.abspath(constants.FN_RUN_CONTEXT)
        if os.path.exists(fn_context):
            json_context = file_utils.read_text_file(fn_context)
            context_dict = json.loads(json_context)
            self.context = utils.dict_to_object(context_dict)

            if not supress_normal_output:
                console.print("run context loaded: {}".format(
                    self.context.run_name))
        else:
            if not supress_normal_output:
                console.print(
                    "run context file not found: {}".format(fn_context))

    mc = os.getenv("XT_MONGO_CONN_STR")
    self.mongo_conn_str = utils.base64_to_text(mc)

    # convert store_creds from string to dict
    sc = os.getenv("XT_STORE_CREDS")
    self.store_creds = utils.base64_to_text(sc)
    if self.store_creds:
        self.store_creds = json.loads(self.store_creds)

    provider_code_path = os.getenv("XT_STORE_CODE_PATH")

    run_cache_dir = None
    self.config = config

    if config:
        run_cache_dir = config.get("general", "run-cache-dir")

    is_aml_run = bool(os.getenv("AML_WORKSPACE_NAME"))

    if is_aml_run:
        # load azure libraries on demand
        from .backends.backend_aml import AzureML
        from azureml.core import Run as AmlRun

        self.aml_run = AmlRun.get_context()  # assumes app is running under AML
    else:
        self.aml_run = None

    self.aml_logging = aml_logging
    self.is_aml_child = False
    self.is_aml = False  # TODO: remove need for this since we now treat AML runs normally

    if not self.store_creds and not config:
        # if store_creds not set, this app is running outside of XT control
        # provide access to XT store for dev/test purposes
        config = get_merged_config(suppress_warning=True)
        self.config = config

    self.store = Store(self.store_creds,
                       provider_code_path=provider_code_path,
                       run_cache_dir=run_cache_dir,
                       mongo_conn_str=self.mongo_conn_str,
                       config=config)

    # distributed training support
    self.rank = None
    self.world_size = None
    self.master_ip = None
    self.master_port = None

    self.xt_logging = xt_logging and self.run_name is not None
    self.checkpoints_enabled = checkpoints_enabled

    # a "direct run" is one where XT_CONTROLLER is not set in the environment
    self.direct_run = not os.getenv("XT_CONTROLLER")

    if self.xt_logging and self.direct_run and self.store:
        # log stuff normally done by controller at start of run
        self.store.log_run_event(self.ws_name, self.run_name, "started", {})
        self.store.mongo.run_start(self.ws_name, self.run_name)

        if self.context:
            self.store.mongo.job_run_start(self.context.job_id)
def process_args(self, args):
    '''
    turn the run command's args into the command to execute and the compute
    target it should run on.

    reads args["script"], args["script_args"], args["code_upload"],
    args["parent_script"] and args["target"]; the presence of an "is_rerun"
    key marks a rerun.

    writes back args["target"], args["compute_def"], args["service_type"],
    and (for legacy code) args["box"] and args["pool"].  may also set
    self.script_dir (on a rerun) and self.is_docker (when the target is "docker").

    returns the tuple:
        (service_type, cmd_parts, ps_path, parent_script, target_file,
         run_script, run_cmd_from_script, compute, compute_def)

    problems are reported thru errors.syntax_error / env_error / config_error.
    '''
    run_script = None
    parent_script = None
    run_cmd_from_script = None

    target_file = args["script"]
    target_args = args["script_args"]
    code_upload = args["code_upload"]

    # user may have wrong slashes for this OS
    target_file = file_utils.fix_slashes(target_file)

    if os.path.isabs(target_file):
        errors.syntax_error("path to app file must be specified with a relative path: {}".format(target_file))

    is_rerun = "is_rerun" in args
    if is_rerun:
        # will be running from script dir, so remove any path to script file
        self.script_dir = os.path.dirname(target_file)
        target_file = os.path.basename(target_file)

    if target_file.endswith(".py"):
        # PYTHON target
        cmd_parts = ["python", "-u", target_file]
    else:
        cmd_parts = [target_file]

    if target_args:
        # split on unquoted spaces
        cmd_parts += utils.cmd_split(target_args)

    if target_file == "docker":
        self.is_docker = True

    if not self.is_docker and code_upload and not os.path.exists(target_file):
        errors.env_error("script file not found: {}".format(target_file))

    ps_path = args["parent_script"]
    if ps_path:
        parent_script = file_utils.read_text_file(ps_path, as_lines=True)

    if target_file.endswith((".bat", ".sh")):
        # a RUN SCRIPT was specified as the target
        run_script = file_utils.read_text_file(target_file, as_lines=True)
        run_cmd_from_script = scriptor.get_run_cmd_from_script(run_script)

    compute = args["target"]
    box_def = self.config.get("boxes", compute, suppress_warning=True)
    setup = utils.safe_value(box_def, "setup")

    # the target is either an entry in [compute-targets] or the name of a single box
    compute_def = self.config.get_compute_def(compute)

    if compute_def:
        # must be defined in [compute-targets]
        if "service" not in compute_def:
            errors.config_error("compute target '{}' must define a 'service' property".format(compute))

        service = compute_def["service"]
        if service in ["local", "pool"]:
            # it's a list of box names
            boxes = compute_def["boxes"]
            service_type = "pool"

            if len(boxes) == 1 and boxes[0] == "localhost":
                pool = None
                box = "local"
            else:
                pool = compute
                box = None
        else:
            # it's a set of compute service properties
            pool = compute
            box = None
            service_type = self.config.get_service_type(service)

    elif box_def:
        # translate single box name to a compute_def
        box = compute
        pool = None
        service_type = "pool"

        # the key must be the literal "setup" (the setup VALUE used to be used as the key)
        compute_def = {"service": service_type, "boxes": [box], "setup": setup}
    else:
        errors.config_error("unknown target or box: {}".format(compute))

    args["target"] = compute
    args["compute_def"] = compute_def
    args["service_type"] = service_type

    # for legacy code
    args["box"] = box
    args["pool"] = pool

    return service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, \
        compute, compute_def
def get_config_template(self, template):
    '''
    build the text of a new local xt_config.yaml file for the requested template.

    template: one of "empty" (also used when no template is given), "philly",
        "batch", "aml", "pool", or "all".  any other value is reported thru
        errors.syntax_error().

    returns a header comment followed by the template's text.
    '''
    # load the default config file and parse it into a sections dict
    fn = get_default_config_template_path()
    default_text = file_utils.read_text_file(fn)
    sections = yaml.safe_load(default_text)

    # section names shared (in this order) by every service-specific template
    phoenix_common = [
        "external-services.phoenixkeyvault",
        "external-services.phoenixmongodb",
        "external-services.phoenixregistry",
        "external-services.phoenixstorage",
        "xt-services",
    ]

    # template name -> (header, section names to copy, extra keyword args for the merge)
    merged_templates = {
        "philly": (
            "# local xt_config.yaml for Philly compute service\n\n",
            ["external-services.philly", "external-services.philly-registry"]
            + phoenix_common
            + ["compute-targets.philly", "setups.philly", "dockers.philly-pytorch", "general"],
            {"update_keys": {"xt-services.target": "philly"}},
        ),
        "batch": (
            "# local xt_config.yaml (for Azure Batch compute services)\n\n",
            ["external-services.phoenixbatch"]
            + phoenix_common
            + ["compute-targets.batch", "azure-batch-images", "general"],
            {"update_keys": {"xt-services.target": "batch"}},
        ),
        "aml": (
            "# local xt_config.yaml (for Azure ML compute service)\n\n",
            ["external-services.phoenixaml"]
            + phoenix_common
            + ["compute-targets.aml", "aml-options", "general"],
            {"update_keys": {"xt-services.target": "aml"}},
        ),
        "pool": (
            "# local xt_config.yaml (for local machine and Pool compute service)\n\n",
            phoenix_common
            + ["compute-targets.local", "compute-targets.local-docker", "boxes", "setups.local",
               "dockers.pytorch-xtlib", "dockers.pytorch-xtlib-local", "general"],
            {},
        ),
    }

    if not template or template == "empty":
        # EMPTY
        hdr = (
            "# local xt_config.yaml\n"
            "# uncomment the below lines to start populating your config file\n\n"
        )
        text = (
            "#general:\n"
            " #workspace: 'ws1'\n"
            " #experiment: 'exper1'\n"
        )

    elif template in merged_templates:
        # PHILLY / BATCH / AML / POOL
        hdr, section_names, merge_options = merged_templates[template]
        text = self.copy_and_merge_sections(sections, section_names, **merge_options)

    elif template == "all":
        # ALL: the default config file, as-is
        hdr = "# local xt_config.yaml (for all compute services)\n\n"
        text = default_text

    else:
        errors.syntax_error(
            "unrecognized --create value: {}".format(template))

    return hdr + text
def import_workspace_core(self, temp_dir, input_file, new_workspace, job_prefix, overwrite, show_output):
    '''
    import the single workspace held in an exported archive file.

    the archive is unzipped into temp_dir and its contents.json drives the
    import: the new workspace is created, then the mongo document and storage
    blobs of each job, and of each of the job's runs, are imported.  finally,
    the mongo counters of the new workspace are initialized to 1 past the
    highest run number and end id seen.

    args:
        temp_dir: directory to unzip the archive into.
        input_file: path of the archive file.
        new_workspace: name for the imported workspace; defaults to the
            archived workspace's name when empty.
        job_prefix: each job is imported as "<job_prefix>_<last part of old job id>".
        overwrite: when False, the import is refused (before any changes are
            made) if any of the new job ids already exists.
        show_output: when True, progress is printed to the console.

    problems are reported thru errors.combo_error / errors.general_error.
    '''
    def to_new_job_id(prev_job_id):
        # keep the text after the last "_" of the old id and put it under job_prefix
        return "{}_{}".format(job_prefix, prev_job_id.split("_")[-1])

    # unzip files and use contents.json
    file_helper.unzip_files(input_file, temp_dir)

    fn_contents = os.path.join(temp_dir, "contents.json")
    text = file_utils.read_text_file(fn_contents)
    contents = json.loads(text)

    workspaces = contents["workspaces"]
    if len(workspaces) > 1:
        errors.combo_error(
            "import of archive files with multiple workspaces not yet supported"
        )

    workspace = workspaces[0]
    jobs = contents["jobs"]

    if not new_workspace:
        new_workspace = workspace

    if self.store.does_workspace_exist(new_workspace):
        errors.combo_error(
            "cannot import to an existing workspace name: {}".format(
                new_workspace))

    if show_output:
        console.print(
            "\nimporting workspace {} ({} jobs) as {} from: {}".format(
                workspace, len(jobs), new_workspace, input_file))

    if not overwrite:
        # before making any changes, verify all job names are available
        job_ids = [to_new_job_id(jc["job_id"]) for jc in jobs]

        filter_dict = {"job_id": {"$in": job_ids}}
        records = self.store.mongo.get_info_for_jobs(
            filter_dict, {"_id": 1})

        if records:
            existing_id = records[0]["_id"]
            errors.general_error(
                "at least 1 job ID with prefix already exists: {}".format(
                    existing_id))

    # create the new workspace
    self.store.create_workspace(new_workspace)

    # now, import each JOB
    max_run_seen = 0
    max_end_seen = 0

    for jc in jobs:
        prev_job_id = jc["job_id"]
        new_job_id = to_new_job_id(prev_job_id)
        runs = jc["runs"]

        if show_output:
            console.print(" importing: {} => {} ({} runs)".format(
                prev_job_id, new_job_id, len(runs)))

        # create MONGO JOB document
        mongo_job_fn = os.path.join(
            temp_dir, "mongo/jobs/{}/mongo_job.json".format(prev_job_id))
        self.import_job_mongo_document(mongo_job_fn, new_workspace,
                                       prev_job_id, new_job_id)

        # create STORAGE JOB blobs
        storage_job_path = os.path.join(
            temp_dir, "storage/jobs/{}".format(prev_job_id))
        self.import_job_storage_blobs(storage_job_path, new_workspace,
                                      prev_job_id, new_job_id)

        # for each run in job
        for run_name in runs:
            run_number = run_helper.get_parent_run_number(run_name)
            max_run_seen = max(max_run_seen, run_number)

            # copy MONGO RUN document
            mongo_run_fn = os.path.join(
                temp_dir,
                "mongo/workspaces/{}/runs/{}/mongo_run.json".format(
                    workspace, run_name))
            end_id = self.import_run_mongo_document(
                mongo_run_fn, workspace, new_workspace, prev_job_id,
                new_job_id, run_name)

            # a run document may have no end id (presumably returned as None);
            # max() of an int and None would raise a TypeError, so skip those
            if end_id is not None:
                max_end_seen = max(max_end_seen, end_id)

            # copy STORAGE RUN blobs
            storage_run_path = os.path.join(
                temp_dir, "storage/workspaces/{}/runs/{}".format(
                    workspace, run_name))
            self.import_run_storage_blobs(storage_run_path, workspace,
                                          new_workspace, prev_job_id,
                                          new_job_id, run_name)

    # update MONGO counters for new workspace
    self.store.mongo.init_workspace_counters(new_workspace,
                                             1 + max_run_seen,
                                             1 + max_end_seen)
def restart_psm_if_needed(self):
    '''
    processing:
        - if PSM is running on the box with an old psm.py, kill the process and restart it.
        - if PSM is running with the current psm.py, do nothing.
        - if PSM is not running, start it.

    after starting PSM, we poll for its process id (up to 20 tries, half a
    second apart) and report an error if it never shows up.
    '''
    psm_src = os.path.join(file_utils.get_my_file_dir(__file__), constants.PSM)
    psm_dest = file_utils.path_join(self.xt_path, constants.PSM, for_windows=self.box_is_windows)

    if self._get_psm_process_id():
        # PSM is running; is it running our version of psm.py?
        local_text = file_utils.read_text_file(psm_src)
        remote_text = ""

        if self.remote_file_exists(psm_dest):
            # read text of psm_dest on remote box
            with self.ftp_client.open(psm_dest, "rb") as remote_file:
                remote_text = remote_file.read().decode()

            # normalize NEWLINE chars before comparison
            # (ftp_client seems to add CR when called from windows)
            local_text = local_text.replace("\r\n", "\n")
            remote_text = remote_text.replace("\r\n", "\n")

        if local_text == remote_text:
            # running PSM is already current; nothing to do
            return

        # stale PSM: kill it, then fall thru to the start code below
        psm_pid = self._get_psm_process_id()
        self.run_cmd("kill -kill {}".format(psm_pid))

    # create required dirs
    self._make_dir(self.psm_queue_path)
    self._make_dir(self.cwd_path)

    # copy psm.py
    # caution: node slashes in psm_dest must all match box's OS style
    psm_dest = file_utils.fix_slashes(psm_dest, is_linux=True)
    self.ftp_client.put(psm_src, psm_dest)

    # run psm
    psm_log = os.path.join(self.xt_path, constants.PSMLOG)

    if self.box_is_windows:
        launch_cmd = " ".join(["cmd", "/c", "python -u {} > {}".format(psm_dest, psm_log)])
    else:
        psm_log = file_utils.fix_slashes(psm_log, is_linux=True)
        launch_cmd = 'nohup bash --login -c "python -u {}" </dev/null > {} 2>&1 &'.format(psm_dest, psm_log)

    self.run_cmd(launch_cmd)

    # don't return until PSM is running
    for _ in range(20):
        if self._get_psm_process_id():
            break
        time.sleep(.5)
    else:
        # loop ran out without ever seeing a PSM process
        errors.general_error("Could not start remote PSM on box={}".format(self.box_addr))