def main(arg_list=None):
    """Run the interactive XT demo: build the command list from parsed
    options, optionally show the interactive banner, then walk the commands.

    Returns the module-level command count for caller inspection.
    """
    utils.init_logging(constants.FN_XT_EVENTS, logger, "XT Demo")

    args = parse_args(arg_list)
    auto = args.auto

    # translate parsed options into the module-level demo command list
    build_cmds(auto, args.quick_test, args.nomonitor, args.nogui, philly=args.philly)
    steps = parse_steps(args.steps)

    key = ""
    if not auto:
        # interactive mode: show banner and wait for a keypress before starting
        print()
        print("This demonstrates how to run common XT commands")
        print("Press ENTER to execute each command (or s=SKIP, b=BACK, q=QUIT)")
        print()
        print("hit any key to continue: ", end="", flush=True)
        key = wait_for_any_key(auto)

    # "q" at the banner quits before any command runs
    if key != "q":
        navigate(cmds, auto, steps)

    # clean-up
    file_utils.ensure_dir_deleted(ARCHIVES_DIR)
    print("end of xt_demo")

    return cmd_count
def main():
    """Exercise each storage provider implementation inside a scratch
    directory and return the number of assertions that were checked."""
    # init environment
    config = xt_config.get_merged_config()
    file_utils.ensure_dir_exists(TEST_DIR)

    with DirChange(TEST_DIR):
        tester = StorageProviderTests()

        # run the same test suite against each provider
        for provider_name in ("xtsandboxstorage", "filestorage"):
            tester.test_impl(provider_name)

    # scratch dir is only needed while the tests run
    file_utils.ensure_dir_deleted(TEST_DIR)

    return tester._assert_count
def cleanup():
    """Remove quick-test artifact directories and fail the test run if any
    XT run ended in an error state.

    Raises (via errors.internal_error) when 'xt list runs --status=error'
    reports one or more matching runs.
    """
    set_test_group("cleanup...")

    file_utils.ensure_dir_deleted("upload_testing")
    file_utils.ensure_dir_deleted("download_testing")

    # check for errors in runs; the command's first output line reads
    # "no matching runs found" when every run completed cleanly
    text = xt("xt list runs --status=error", capture_output=True)

    if "no matching runs found" not in text[0]:
        errors.internal_error("quick-test: above 'list runs' contains errors")
def create_demo(self, destination, response, overwrite):
    ''' This command will remove the specified destination directory if it
    exists (prompting the user for approval).  Specifying the current
    directory as the destination will produce an error.

    :param destination: directory to create the demo files in (required)
    :param response: canned answer for the delete-confirmation prompt
        (passed through to pc_utils.input_response for non-interactive use)
    :param overwrite: passed to import_workspace when importing the
        xt-demo workspace archive
    '''
    # set up from_dir (demo files ship inside the xtlib package)
    from_dir = file_utils.get_xtlib_dir() + "/demo_files"

    # set up dest_dir
    dest_dir = destination
    if not dest_dir:
        errors.syntax_error("An output directory must be specified")

    create = True
    console.print("creating demo files at: {}".format(os.path.abspath(dest_dir)))

    if os.path.exists(dest_dir):
        # confirm before deleting an existing directory
        answer = pc_utils.input_response("'{}' already exists; OK to delete? (y/n): ".format(dest_dir), response)
        if answer != "y":
            create = False

    if create:
        file_utils.ensure_dir_deleted(dest_dir)
        shutil.copytree(from_dir, dest_dir)

    if not self.store.does_workspace_exist("xt-demo"):
        # import xt-demo workspace from archive file
        console.print("importing xt-demo workspace (usually takes about 30 seconds)")

        impl_storage_api = ImplStorageApi(self.config, self.store)

        fn_archive = os.path.join(file_utils.get_xtlib_dir(), "demo_files", "xt-demo-archive.zip")
        impl_storage_api.import_workspace(fn_archive, "xt-demo", "xtd", overwrite=overwrite,
            show_output=False)
def snapshot_all_code(self, snapshot_dir, cmd_parts, args):
    ''' make local snapshot of each code_dir (and xtlib, if needed) '''
    code_dirs = args["code_dirs"]
    xtlib_capture = args["xtlib_upload"]
    code_omit = args["code_omit"]
    code_upload = args["code_upload"]

    # this step should always be done so that script_dir is removed from cmd_parts
    script_dir = self.remove_script_dir_from_parts(cmd_parts)

    if not code_upload:
        script_dir = snapshot_dir
    else:
        for entry in code_dirs:
            # fixup "$scriptdir" relative paths
            if "$scriptdir" in entry:
                entry = entry.replace("$scriptdir", script_dir)

            # an entry may carry an explicit destination as "src::dest"
            if "::" in entry:
                src_dir, dest_dir = entry.split("::")
            else:
                src_dir, dest_dir = entry, "."

            self.make_local_snapshot(snapshot_dir, src_dir, dest_dir, code_omit)

    if xtlib_capture:
        # copy XTLIB directory to "xtlib" subdir of temp,
        # skipping the "demo_files" directory
        xtlib_src = file_utils.get_xtlib_dir()
        xtlib_dest = snapshot_dir + "/xtlib"

        file_utils.ensure_dir_deleted(xtlib_dest)
        shutil.copytree(xtlib_src, xtlib_dest, ignore=shutil.ignore_patterns("demo_files"))

    console.diag("after create local snapshot")

    return script_dir
def process_run_command(self, args):
    """Main driver for submitting a run/job to a compute backend.

    Pipeline: preprocess args -> create backend -> snapshot code ->
    build commands (with optional hparam search) -> create job record ->
    build per-node runs -> adjust commands for controller/direct run ->
    upload code/data -> log job info -> submit to backend -> log service info.

    Returns (cmds, run_specs, using_hp, using_aml_hparam, sweeps_text,
    pool_info, job_id) for API support; returns None early on dry_run.
    NOTE(review): reconstructed from collapsed source — nesting of some
    conditionals is inferred; confirm against the original file.
    """
    self.args = args

    # ensure workspace exists
    workspace = args['workspace']
    dry_run = args['dry_run']
    fake_submit = args["fake_submit"]

    if not fake_submit:
        self.store.ensure_workspace_exists(workspace, flag_as_error=False)

    # PRE-PROCESS ARGS
    service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, compute, compute_def = \
        self.process_args(args)

    # create backend helper (pool, philly, batch, aml)
    cluster = utils.safe_value(compute_def, "cluster")
    vc = utils.safe_value(compute_def, "vc")
    self.backend = self.core.create_backend(compute, cluster=cluster, vc=vc, username=None)

    # add conda_packages and pip_packages from SETUP to ARGS
    setup_def = self.config.get_setup_from_target_def(compute_def)
    conda_packages = utils.safe_value(setup_def, "conda-packages")
    pip_packages = utils.safe_value(setup_def, "pip-packages")

    args["conda_packages"] = conda_packages if conda_packages else []
    args["pip_packages"] = pip_packages if pip_packages else []

    self.adjust_pip_packages(args)

    snapshot_dir = self.temp_dir
    if fake_submit:
        # fake submit: skip snapshotting, reuse the temp dir as-is
        script_dir = snapshot_dir
    else:
        # note: always create a snapshot dir for backends to add needed files
        file_utils.ensure_dir_deleted(snapshot_dir)
        script_dir = self.snapshot_all_code(snapshot_dir, cmd_parts, args)

    self.script_dir = script_dir

    direct_run = args["direct_run"]

    # do we need to start the xt controller?
    use_controller = not dire_run if False else not direct_run
    adjustment_scripts = None

    # create a job_secret that can later be used to authenticate with the XT controller
    # NOTE: we currently log this secret as a job property, which allows all team members to view and control this job
    job_secret = str(uuid.uuid4())

    # do we need to build a "docker run" command?
    if not self.backend.provides_container_support():
        env = args["docker"]
        if not env:
            docker_name = utils.safe_value(compute_def, "docker")
            if docker_name and docker_name != "none":
                cmd_parts = self.build_docker_cmd(docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args)
                args["docker"] = docker_name    # for use in building run context info

    # BUILD CMDS (from static hparam search, user multi cmds, or single user cmd)
    cmds, total_run_count, repeat_count, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, search_style = \
        self.build_cmds_with_search(service_type, cmd_parts, parent_script, run_script, run_cmd_from_script, use_controller, dry_run, args)

    if dry_run:
        return

    # make new values available
    args["search_style"] = search_style
    args["total_run_count"] = total_run_count

    resume_name = args['resume_name']
    keep_name = False    # args['keep_name']
    experiment = args['experiment']
    is_distributed = args['distributed']
    direct_run = args["direct_run"]

    # CREATE JOB to hold all runs
    if fake_submit:
        # use lastrun/lastjob info to get a fast incremental fake job number
        xtd = xt_dict.read_xt_dict()
        fake_job_num = xtd["fake_job_num"] if "fake_job_num" in xtd else 1
        xtd["fake_job_num"] = fake_job_num + 1
        xt_dict.write_xt_dict(xtd)

        job_id = "fake_job" + str(fake_job_num)
    else:
        job_id = self.store.create_job()

    # start the feedback (by parts)
    fb.feedback(job_id)
    fb.feedback("{}: {}".format("target", compute))

    # write hparams to FILES
    boxes, num_boxes = self.write_hparams_to_files(job_id, cmds, fake_submit, using_hp, args)

    if sweeps_text and not fake_submit:
        self.upload_sweep_data(sweeps_text, experiment, job_id, args=args)

    # if num_boxes > 1 and service_type != "batch":
    #     fb.feedback("", is_final=True)

    parent_name = None

    # BUILD RUNS, by box
    job_runs = []
    run_count = 1 if is_distributed else len(boxes)
    secrets_by_node = {}
    remote_control = args["remote_control"]

    for i in range(run_count):
        box_name = boxes[i]

        # generate a box secret for talking to XT controller for this node
        box_secret = str(uuid.uuid4()) if remote_control else ""

        # build runs for box_name
        run_data = self.build_first_run_for_node(i, boxes[i], target_file, ps_path, using_hp, using_aml_hparam,
            run_specs, job_id, parent_name, cmds, pool_info, repeat_count, fake_submit, search_style,
            box_secret, args)

        # for now, adhere to the more general design of multiple runs per box
        box_runs = [run_data]
        job_runs.append(box_runs)

        node_id = utils.node_id(i)
        secrets_by_node[node_id] = box_secret

        # FEEDBACK
        ptype = "single " if search_style == "single" else "parent "
        if is_distributed:
            ptype = "master "

        if run_count == 1:
            node_msg = "creating {}run".format(ptype)
        else:
            node_msg = "creating {}runs: {}/{}".format(ptype, i+1, run_count)

        if service_type == "pool":
            node_msg += ", box: " + box_name

        fb.feedback(node_msg, id="node_msg")    # , add_seperator=is_last)
        last_msg = node_msg

    # run the job
    # build box: runs dict for job info file
    runs_by_box, last_run = self.build_runs_by_box(job_runs, workspace)

    # now that we have run names for all static run names for all nodes, we can adjust cmds (and before files) for using the controller
    if use_controller:
        # we will create 2 temp. controller files in the CURRENT DIRECTORY (that will be captured to JOB)
        # this will also adjust commands for each node to run the XT controller
        adjustment_scripts = self.core.adjust_job_for_controller_run(job_id, job_runs, cmds, using_hp, experiment,
            service_type, snapshot_dir, search_style, args=args)
    else:
        adjustment_scripts = self.core.adjust_job_for_direct_run(job_id, job_runs, cmds, using_hp, experiment,
            service_type, snapshot_dir, search_style, args=args)

    # add env vars used by both controller and runs
    env_vars = args["env_vars"]

    # create a job guid to uniquely identify this job across all XT instances
    job_guid = str(uuid.uuid4())

    # we add with "node0" and "job_secret", but backend service will override for each node
    scriptor.add_controller_env_vars(env_vars, self.config, None, "node0")

    # expand "$scriptdir" in the local data path, if present
    data_local = args["data_local"]
    if "$scriptdir" in data_local:
        data_local = os.path.realpath(data_local.replace("$scriptdir", script_dir))
        args["data_local"] = data_local

    # expand "$scriptdir" in the local model path, if present
    model_local = args["model_local"]
    if "$scriptdir" in model_local:
        model_local = os.path.realpath(model_local.replace("$scriptdir", script_dir))
        args["model_local"] = model_local

    # ADJUST CMDS: this allows backend to write scripts to snapshot dir, if needed, as a way of adjusting/wrapping run commands
    self.backend.adjust_run_commands(job_id, job_runs, using_hp, experiment, service_type, snapshot_dir, args=args)

    # upload CODE from snapshot_dir
    code_upload = args["code_upload"]
    code_omit = args["code_omit"]
    code_zip = args["code_zip"]

    if not fake_submit:
        if code_upload:
            self.core.upload_before_files_to_job(job_id, snapshot_dir, "before/code", code_omit, code_zip, "code", args)

        # upload DATA from data_local (do we need to keep this?  should we upload to normal DATA location, vs. job?)
        data_upload = args["data_upload"]
        if data_upload:
            if not data_local:
                errors.config_error("cannot do data-upload because no data-local path is defined in the XT config file")

            data_omit = args["data_omit"]
            data_zip = "none"

            self.core.upload_before_files_to_job(job_id, data_local, "before/data", data_omit, data_zip, "data", args)

    # dispatch to BACKEND submitters
    '''
    Note: backend submitter functions are responsible for:
        - submitting the job (for each node, queue runs for that node)
        - return service job id (or list of them if per node)

    NOTE: there is a timing issue where submitted job needs access to job info, but
    final piece of job info (service info) is only return after job is submitted.
    Therefore, we structure steps as follows:
        - primary job info is logged
        - job is submitted thru backend
        - service info for job is logged
    '''

    # LOG PRIMARY JOB INFO
    dd = {}
    if not fake_submit:
        # mark runs as QUEUED
        for runs in runs_by_box.values():
            first_run = runs[0]
            self.store.log_run_event(workspace, first_run["run_name"], "status-change", {"status": "queued"})

        # write the job info file (now that backend has had a chance to update it)
        job_num = int(job_id[3:])
        xt_cmd = args["xt_cmd"]
        schedule = args["schedule"]
        concurrent = args["concurrent"]

        # this job property is used to ensure we don't exceed the specified # of runs when using repeat_count on each node
        dynamic_runs_remaining = None if search_style == "single" else total_run_count
        node_count = len(runs_by_box)

        # static_runs_by_node = None
        # if schedule == "static":
        #     static_runs_by_node = self.build_static_runs_by_node(total_run_count, node_count)
        #console.diag("static_runs_by_node=", static_runs_by_node)

        active_runs = mongo_run_index.build_active_runs(schedule, total_run_count, node_count)

        dd = {"job_id": job_id, "job_num": job_num, "compute": compute, "ws_name": workspace,
            "exper_name": experiment, "pool_info": compute_def, "runs_by_box": runs_by_box,
            "primary_metric": args["primary_metric"], "run_count": total_run_count,
            "repeat": repeat_count, "search_type": args["search_type"],
            "username": args["username"], "hold": args["hold"], "started": utils.get_time(),
            "job_status": "submitted", "running_nodes": 0, "running_runs": 0,
            "error_runs": 0, "completed_runs": 0, "job_guid": job_guid,
            "job_secret": job_secret, "dynamic_runs_remaining": dynamic_runs_remaining,
            "search_style": search_style, "active_runs": active_runs,
            "connect_info_by_node": {}, "secrets_by_node": secrets_by_node,
            "xt_cmd": xt_cmd, "schedule": schedule, "node_count": node_count,
            "concurrent": concurrent, "service_job_info": None,
            "service_info_by_node": None,
        }

        self.store.log_job_info(job_id, dd)

    # SUBMIT JOB
    # NOTE: we use "pool_info" here (vs. compute_def, which has not been updated with explicit args)
    service_job_info, service_info_by_node = self.backend.submit_job(job_id, job_runs, workspace, pool_info,
        resume_name, repeat_count, using_hp, runs_by_box, experiment, snapshot_dir, adjustment_scripts, args)

    # POST SUBMIT processing
    # update job info
    if not fake_submit:
        dd["service_job_info"] = service_job_info
        dd["service_info_by_node"] = service_info_by_node
        self.store.log_job_info(job_id, dd)

    # update lastrun/lastjob info
    xtd = xt_dict.read_xt_dict()
    xtd["last_run"] = last_run
    xtd["last_job"] = job_id
    xt_dict.write_xt_dict(xtd)

    # return values for API support (X)
    return cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id