def ensure_script_ext_matches_box(self, script_name, fn_script, box_info):
    _, file_ext = os.path.splitext(fn_script)

    if file_ext in [".bat", ".sh"]:
        expected_ext = ".bat" if box_info.box_os == "windows" else ".sh"
        if file_ext != expected_ext:
            errors.combo_error("{} file ext='{}' doesn't match box.os='{}'".format(script_name, file_ext, box_info.box_os))
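# Illustrative sketch (hypothetical box/script values): a ".sh" script
# submitted to a Windows box fails fast, e.g.
#   self.ensure_script_ext_matches_box("run script", "train.sh", box_info)
# raises combo_error("run script file ext='.sh' doesn't match box.os='windows'")
# when box_info.box_os == "windows"; other extensions pass through unchecked.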
def read_user_multi_commands(self, using_hp, run_script, cmd_parts, args):
    cmds = None
    lines = self.config.get("commands")

    if lines:
        # commands specified in the config file
        args["multi_commands"] = True
        multi_commands = True
    else:
        # did user specify --multi-commands?
        multi_commands = args["multi_commands"]

    if multi_commands:
        if using_hp:
            errors.combo_error("Cannot specify both --multi-commands and hyperparameter search")

        # read MULTI CMDS
        if not lines:
            fn_cmds = args["script"]    # run_script if run_script else cmd_parts[0]
            lines = file_utils.read_text_file(fn_cmds, as_lines=True)
            lines = [line.strip() for line in lines if line and not line.strip().startswith("#")]

        cmds = [self.fixup_script_in_cmd(line) for line in lines]

    return cmds
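# A minimal sketch of the multi-commands file format consumed above (file
# contents invented): blank lines and "#" comment lines are dropped, and each
# surviving line becomes one command.
sample = """# commands to run
python train.py --lr=0.01

python train.py --lr=0.001
"""
lines = [ln.strip() for ln in sample.splitlines() if ln.strip() and not ln.strip().startswith("#")]
assert lines == ["python train.py --lr=0.01", "python train.py --lr=0.001"]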
def __init__(self, run_record, plot_x_metric_name, plot_y_metric_name):
    metric_dict = run_record["data"]

    if plot_x_metric_name not in metric_dict:
        errors.combo_error("step name hyperparameter '{}' (named in XT config file) not found in hp search file".format(plot_x_metric_name))

    if plot_y_metric_name not in metric_dict:
        errors.combo_error("primary_metric hyperparameter '{}' (named in XT config file) not found in hp search file".format(plot_y_metric_name))

    self.x = int(metric_dict[plot_x_metric_name])
    self.y = float(metric_dict[plot_y_metric_name])
def monitor_with_jupyter(self, workspace, run_name):
    if not self.is_aml_ws(workspace):
        errors.combo_error("the monitor command is only supported for Azure ML runs")

    run_name, actual_ws = run_helper.parse_run_name(workspace, run_name)

    fn = self.azure_ml.make_monitor_notebook(actual_ws, run_name)
    notebook_dir = os.path.dirname(fn)    # renamed to avoid shadowing the builtin "dir"
    #console.print("jupyter notebook written to: " + fn)

    monitor_cmd = "jupyter notebook --notebook-dir=" + notebook_dir
    console.print("monitoring notebook created; to run:")
    console.print("  " + monitor_cmd)
def parse_string_list(self, tok, scanner, pipe_objects_enabled=True):
    global pipe_object_list
    #print("parse_string_list, tok=", tok)

    if not tok:
        # empty string specified
        value = []
        tok = scanner.scan()    # skip over the empty string
    elif tok == "$":
        if pipe_objects_enabled:
            pipe_object_list = get_xt_objects_from_cmd_piping()
            console.diag("pipe_object_list: {}".format(pipe_object_list))

        if pipe_objects_enabled and pipe_object_list:
            #print("found '$', pipe_object_list=", pipe_object_list)
            value = pipe_object_list
            console.print("replacing '$' with: ", value)
        else:
            errors.combo_error("'$' can only be used for piping the output of a previous XT command into this run")

        # mark pipe objects as having been consumed by this parsing
        pipe_object_list = None
        tok = scanner.scan()    # skip over the $
    else:
        # scan a comma-separated list of tokens (some of which can be single-quoted strings)
        value = []

        while tok is not None:
            if tok.startswith("--"):
                break

            ev = self.expand_system_values(tok)
            value.append(ev)
            tok = scanner.scan()

            if tok != ",":
                break

            tok = scanner.scan()    # skip over the comma

    return value, tok
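# A standalone sketch of the comma-list scanning loop above, using a toy token
# stream in place of the real scanner (tokens invented; expand_system_values
# omitted):
tokens = iter(["red", ",", "green", ",", "blue", "--verbose"])
scan = lambda: next(tokens, None)

value = []
tok = scan()
while tok is not None:
    if tok.startswith("--"):
        break
    value.append(tok)
    tok = scan()
    if tok != ",":
        break
    tok = scan()    # skip over the comma

assert value == ["red", "green", "blue"]
assert tok == "--verbose"    # the option token is left for the caller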
def validate_storage_and_mongo(self, mongo):
    '''
    1. ensure storage has been initialized for XT
    2. ensure mongo and storage point to each other
    3. update storage format if needed
    4. update mongo format if needed
    '''
    # ensure storage has been initialized for XT
    self._create_info_container_if_needed()

    # ensure mongo points to our storage
    storage_name = self.provider.get_service_name()
    connected_mongo = mongo.get_service_name()

    mongo_info = mongo.get_mongo_info()
    paired_storage = utils.safe_value(mongo_info, "paired_storage")
    if paired_storage and storage_name != paired_storage:
        errors.combo_error("mongo paired with storage service='{}', but passed XT storage service='{}'".format( \
            paired_storage, storage_name))

    storage_info = self._get_storage_info()
    paired_mongo = utils.safe_value(storage_info, "paired_mongo")
    if paired_mongo and connected_mongo != paired_mongo:
        errors.combo_error("this storage paired with mongo service='{}', but passed connection string for mongo service='{}'".format( \
            paired_mongo, connected_mongo))

    if not paired_storage:
        mongo_info = {"paired_storage": storage_name, "storage_version": constants.STORAGE_VERSION}
        mongo.set_mongo_info(mongo_info)

    if not paired_mongo:
        storage_info = {"paired_mongo": connected_mongo, "storage_version": constants.STORAGE_VERSION}
        self._set_storage_info(storage_info)

    # only check once (takes ~.5 secs if already imported)
    # remove this check after all XT users have imported (approx. Dec 2019)
    # but keep it around (useful for mongodb repair, if needed)
    self.import_jobs_to_mongo_if_needed(mongo)
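# Illustrative sketch (service names invented): the pairing records written on
# first-time validation look like this, and later runs are cross-checked
# against them in both directions:
#   mongo_info   = {"paired_storage": "mystorageacct", "storage_version": constants.STORAGE_VERSION}
#   storage_info = {"paired_mongo": "mymongosvc", "storage_version": constants.STORAGE_VERSION}
# e.g., reconnecting the same mongo to a different storage account trips the
# paired_storage mismatch error above.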
def import_workspace(self, input_file, new_workspace, job_prefix, overwrite, show_output=True):
    if not job_prefix:
        errors.combo_error("job prefix cannot be blank")

    with tempfile.TemporaryDirectory(prefix="import-") as temp_dir:
        self.import_workspace_core(temp_dir, input_file, new_workspace, job_prefix, overwrite, show_output=show_output)

    if show_output:
        console.print("  import completed")
def write_hparams_to_files(self, job_id, cmds, fake_submit, using_hp, args):
    # write to job-level sweeps-list file
    #console.print("cmds=", cmds)
    cmds_text = json.dumps(cmds)
    if not fake_submit:
        self.store.create_job_file(job_id, constants.HP_SWEEP_LIST_FN, cmds_text)

    boxes, pool_info, service_type = box_information.get_box_list(self, job_id=job_id, args=args)
    num_boxes = len(boxes)

    is_distributed = args["distributed"]
    if is_distributed:
        # check for conflicts
        if using_hp:
            errors.combo_error("Cannot do hyperparameter search on a distributed-training job")

        if service_type != "aml":
            errors.combo_error("Distributed training is currently only supported for AML jobs")

    return boxes, num_boxes
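# The job-level sweep-list file written above is just the JSON-encoded command
# list; a minimal sketch with invented commands:
import json
cmds = ["python train.py --lr=0.01", "python train.py --lr=0.001"]
assert json.loads(json.dumps(cmds)) == cmds    # round-trips unchanged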
def calc_actual_layout(self, count, layout):
    if "x" not in layout:
        errors.syntax_error("layout string must be of form RxC (R=# rows, C=# cols)")

    r, c = layout.split("x", 1)

    if r:
        r = int(r)
        c = int(c) if c else math.ceil(count / r)
    elif c:
        c = int(c)
        r = int(r) if r else math.ceil(count / c)

    full_count = r * c
    if full_count < count:
        errors.combo_error("too many plots ({}) for layout cells ({})".format(count, full_count))

    return r, c
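# Standalone check of the RxC layout rules above (same math, error paths
# omitted; example counts invented):
import math

def _layout(count, layout):
    r, c = layout.split("x", 1)
    if r:
        r = int(r)
        c = int(c) if c else math.ceil(count / r)
    else:
        c = int(c)
        r = math.ceil(count / c)
    return r, c

assert _layout(6, "2x3") == (2, 3)   # fully specified
assert _layout(5, "2x") == (2, 3)    # cols inferred: ceil(5/2)
assert _layout(7, "x3") == (3, 3)    # rows inferred: ceil(7/3)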
def import_workspace_core(self, temp_dir, input_file, new_workspace, job_prefix, overwrite, show_output):
    # unzip files and use contents.json
    file_helper.unzip_files(input_file, temp_dir)

    fn_contents = os.path.join(temp_dir, "contents.json")
    text = file_utils.read_text_file(fn_contents)
    contents = json.loads(text)

    workspaces = contents["workspaces"]
    if len(workspaces) > 1:
        errors.combo_error("import of archive files with multiple workspaces not yet supported")

    workspace = workspaces[0]
    jobs = contents["jobs"]

    if not new_workspace:
        new_workspace = workspace

    if self.store.does_workspace_exist(new_workspace):
        errors.combo_error("cannot import to an existing workspace name: {}".format(new_workspace))

    if show_output:
        console.print("\nimporting workspace {} ({} jobs) as {} from: {}".format(workspace, len(jobs), new_workspace, input_file))

    if not overwrite:
        # before making any changes, verify all job names are available
        job_ids = []
        for jc in jobs:
            prev_job_id = jc["job_id"]
            prev_base = prev_job_id.split("_")[-1]
            new_job_id = "{}_{}".format(job_prefix, prev_base)
            job_ids.append(new_job_id)

        filter_dict = {"job_id": {"$in": job_ids}}
        records = self.store.mongo.get_info_for_jobs(filter_dict, {"_id": 1})
        if records:
            existing_id = records[0]["_id"]
            errors.general_error("at least 1 job ID with prefix already exists: {}".format(existing_id))

    # create the new workspace
    self.store.create_workspace(new_workspace)

    # now, import each JOB
    max_run_seen = 0
    max_end_seen = 0

    for jc in jobs:
        prev_job_id = jc["job_id"]
        prev_base = prev_job_id.split("_")[-1]
        new_job_id = "{}_{}".format(job_prefix, prev_base)
        runs = jc["runs"]

        if show_output:
            console.print("  importing: {} => {} ({} runs)".format(prev_job_id, new_job_id, len(runs)))

        # create MONGO JOB document
        mongo_job_fn = os.path.join(temp_dir, "mongo/jobs/{}/mongo_job.json".format(prev_job_id))
        self.import_job_mongo_document(mongo_job_fn, new_workspace, prev_job_id, new_job_id)

        # create STORAGE JOB blobs
        storage_job_path = os.path.join(temp_dir, "storage/jobs/{}".format(prev_job_id))
        self.import_job_storage_blobs(storage_job_path, new_workspace, prev_job_id, new_job_id)

        # for each run in job
        for run_name in runs:
            run_number = run_helper.get_parent_run_number(run_name)
            max_run_seen = max(max_run_seen, run_number)

            # copy MONGO RUN document
            mongo_run_fn = os.path.join(temp_dir, "mongo/workspaces/{}/runs/{}/mongo_run.json".format(workspace, run_name))
            end_id = self.import_run_mongo_document(mongo_run_fn, workspace, new_workspace, prev_job_id, new_job_id, run_name)
            max_end_seen = max(max_end_seen, end_id)

            # copy STORAGE RUN blobs
            storage_run_path = os.path.join(temp_dir, "storage/workspaces/{}/runs/{}".format(workspace, run_name))
            self.import_run_storage_blobs(storage_run_path, workspace, new_workspace, prev_job_id, new_job_id, run_name)

    # update MONGO counters for new workspace
    self.store.mongo.init_workspace_counters(new_workspace, 1 + max_run_seen, 1 + max_end_seen)
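# How imported job IDs are renamed above (values invented): the prefix
# replaces everything before the final "_" segment of the old ID.
job_prefix = "proj2"
prev_job_id = "job_1532"
new_job_id = "{}_{}".format(job_prefix, prev_job_id.split("_")[-1])
assert new_job_id == "proj2_1532"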
def export_workspace_core(self, temp_dir, output_file, workspace, tags_all, tags_any, jobs, experiment, show_output):
    # get specified jobs from workspace (by job name, or by workspace name)
    args = {"job_list": jobs, "tags_all": tags_all, "tags_any": tags_any, "workspace": workspace,
        "all": True, "target": None, "available": None, "experiment": experiment, "service_type": None,
        "username": None, "filter": None, "columns": ["job", "workspace"]}

    job_list, _, _, _, _ = job_helper.get_list_jobs_records(self.store, self.config, args)

    if show_output:
        console.print("\nexporting workspace {} ({} jobs) to: {}".format(workspace, len(job_list), output_file))

    # build a table-of-contents structure describing this archive
    archive_version = "1"
    build = constants.BUILD
    username = self.config.get("general", "username")
    dt = datetime.datetime.now()
    dt_text = str(dt)
    storage_name = self.store.get_name()
    mongo_name = self.store.mongo.get_service_name()

    workspaces = []
    jobs = []

    contents = {"user": username, "export_date": dt_text, "archive_version": archive_version,
        "xt_build": build, "storage": storage_name, "mongo": mongo_name, "workspaces": workspaces,
        "jobs": jobs}

    first_job = None
    first_ws = None

    # for each job in workspace
    for jr in job_list:
        job_id = jr["job"]
        job_ws = jr["workspace"]

        mongo_runs = self.store.mongo.get_info_for_runs(job_ws, {"job_id": job_id}, None)
        run_names = [mr["run_name"] for mr in mongo_runs]

        if show_output:
            console.print("  exporting: {} ({} runs)".format(job_id, len(mongo_runs)))

        job_content = {"job_id": job_id, "workspace": job_ws, "runs": run_names}
        jobs.append(job_content)

        if first_job is None:
            first_job = job_id
            first_ws = job_ws

        if job_ws not in workspaces:    # avoid duplicate entries for the same workspace
            workspaces.append(job_ws)

        if job_ws != first_ws:
            errors.combo_error("can only export jobs from a single workspace (job {} has ws={}, job {} has ws={})". \
                format(first_job, first_ws, job_id, job_ws))

        # copy MONGO JOB document
        temp_mongo_path = os.path.join(temp_dir, "mongo/jobs/{}".format(job_id))
        self.export_job_mongo_document(job_id, temp_mongo_path)

        # copy STORAGE JOB blobs
        temp_store_path = os.path.join(temp_dir, "storage/jobs/{}".format(job_id))
        self.export_job_storage_blobs(job_id, temp_store_path)

        # for each run in job
        for mr in mongo_runs:
            # copy MONGO RUN document
            run_name = mr["run_name"]
            temp_mongo_path = os.path.join(temp_dir, "mongo/workspaces/{}/runs/{}".format(job_ws, run_name))
            self.export_run_mongo_document(mr, temp_mongo_path)

            # copy STORAGE RUN blobs
            temp_store_path = os.path.join(temp_dir, "storage/workspaces/{}/runs/{}".format(job_ws, run_name))
            self.export_run_storage_blobs(job_ws, run_name, temp_store_path)

    # add contents file
    text = json.dumps(contents, indent=4)
    fn_contents = os.path.join(temp_dir, "contents.json")
    file_utils.write_text_file(fn_contents, text)

    # create zip file
    filenames, local_path = file_utils.get_local_filenames(temp_dir + "/**")
    prefix_len = 1 + len(temp_dir)

    file_helper.zip_up_filenames(output_file, filenames, compress=True, remove_prefix_len=prefix_len)
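# Sketch of the contents.json table of contents written above (all field
# values invented):
example_contents = {
    "user": "alice",
    "export_date": "2019-12-01 10:30:00.000000",
    "archive_version": "1",
    "xt_build": "<constants.BUILD>",
    "storage": "mystorageacct",
    "mongo": "mymongosvc",
    "workspaces": ["ws1"],
    "jobs": [
        {"job_id": "job1001", "workspace": "ws1", "runs": ["run23", "run23.1"]},
    ],
}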
def emit_mount_cmds(self, cmds, storage_name, storage_key, container, store_path, mnt_path, is_writable,
    install_blobfuse, sudo_available, use_username, use_allow_other, env_var_name, env_var_name2,
    nonempty=False, cleanup_needed=False):

    if cleanup_needed:
        # on pool machines, for any action, always UNMOUNT mnt_dir
        # also, always zap the folder in case it was used in downloading files
        if self.is_windows:
            self.append(cmds, "rd /s /q {}".format(mnt_path))
        else:
            sudo = "sudo " if sudo_available else ""
            self.append(cmds, "{}fusermount -u -q {}".format(sudo, mnt_path))
            # do NOT call rm as it can delete cloud data if fusermount -u failed
            #self.append(cmds, "{}rm -rf {}".format(sudo, mnt_path))

    if self.is_windows:
        # TODO: provide pseudo-mount for local machine by using data-local and store-local config properties
        errors.combo_error("Mounting of Azure storage (for '{}') not supported by target OS (Windows)".format(store_path))

    # for now, all commands can assume linux form
    self.append(cmds, "echo MOUNTING {} to container {}".format(mnt_path, container))

    full_mnt_path = mnt_path + "/" + store_path
    self.append(cmds, "echo running export {}={}".format(env_var_name, full_mnt_path))
    self.append(cmds, 'echo setting {}="{}"'.format(env_var_name, full_mnt_path))
    self.append(cmds, 'export {}="{}"'.format(env_var_name, full_mnt_path))
    self.append(cmds, 'export {}="{}"'.format(env_var_name2, full_mnt_path))

    requests = [{"container": container, "mnt_dir": mnt_path, "readonly": not is_writable}]

    sub_cmds = self.create_blobfuse_commands(storage_name, storage_key, sudo_available, requests,
        install_blobfuse=install_blobfuse, use_username=use_username, use_allow_other=use_allow_other,
        nonempty=nonempty)
    cmds += sub_cmds
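# Hypothetical trace of what one non-Windows mount appends to cmds (paths,
# container, and env var names invented; the actual mount commands come from
# create_blobfuse_commands):
#   sudo fusermount -u -q /mnt/xt_data            (only when cleanup_needed)
#   echo MOUNTING /mnt/xt_data to container data-store
#   echo running export MY_DATA_DIR=/mnt/xt_data/train
#   echo setting MY_DATA_DIR="/mnt/xt_data/train"
#   export MY_DATA_DIR="/mnt/xt_data/train"
#   export MY_DATA_DIR2="/mnt/xt_data/train"
#   ...blobfuse install/mount commands...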