def create_group_directory(self, campaign_name, app_dir, group_name,
                           runs, max_nprocs, nodes, launch_mode,
                           component_subdirs, walltime, node_exclusive,
                           timeout, machine,
                           sosd_path=None,
                           sos_analysis_path=None,
                           tau_profiling=False,
                           tau_tracing=False,
                           kill_on_partial_failure=False,
                           run_post_process_script=None,
                           run_post_process_stop_on_failure=False,
                           scheduler_options=None,
                           run_dir_setup_script=None):
    """Copy scripts for the appropriate scheduler to the group directory,
    and write environment configuration. Returns the required number of
    nodes, which is calculated if the passed nodes is None."""
    script_dir = os.path.join(config.CHEETAH_PATH_SCHEDULER,
                              self.scheduler_name, 'group')
    if not os.path.isdir(script_dir):
        raise ValueError("scheduler '%s' is not yet supported (path '%s')"
                         % (self.scheduler_name, script_dir))
    if scheduler_options is None:
        scheduler_options = {}
    copytree_to_dir(script_dir, self.output_directory)
    fobs_path = os.path.join(self.output_directory, 'fobs.json')
    min_nodes = 1
    f = open(fobs_path, 'w')
    fob_list = []
    for i, run in enumerate(runs):
        # TODO: abstract this to higher levels
        os.makedirs(run.run_path, exist_ok=True)

        # Create working dir for each component
        for rc in run.run_components:
            os.makedirs(rc.working_dir, exist_ok=True)

        if run.sosflow_profiling:
            run.insert_sosflow(sosd_path, sos_analysis_path,
                               run.run_path, machine.processes_per_node)

        # Copy the global input files common to all components
        for input_rpath in run.inputs:
            copy_to_dir(input_rpath, run.run_path)

        # Copy input files requested by each component,
        # and save working dirs for later use
        working_dirs = {}  # map component name to path
        for rc in run.run_components:
            working_dirs[rc.name] = rc.working_dir

            # if rc has an adios xml file, copy it to working dir
            if rc.adios_xml_file:
                copy_to_dir(rc.adios_xml_file, rc.working_dir)

            # now copy other inputs marked under component_inputs
            if rc.component_inputs is not None:
                for input_file in rc.component_inputs:
                    dest = os.path.join(rc.working_dir,
                                        os.path.basename(input_file))
                    # input type is a symlink
                    if type(input_file) == SymLink:
                        os.symlink(input_file, dest)
                    # input type is a regular file
                    elif os.path.isfile(input_file):
                        copy_to_dir(input_file, rc.working_dir)
                    # input type is a directory
                    elif os.path.isdir(input_file):
                        copytree_to_dir(input_file, dest)
                    else:
                        raise exc.CheetahException(
                            "Could not copy component input {}"
                            .format(input_file))

        # ADIOS XML param support
        adios_xml_params = \
            run.instance.get_parameter_values_by_type(ParamAdiosXML) or \
            run.instance.get_parameter_values_by_type(ParamADIOS2XML)
        for pv in adios_xml_params:
            working_dir = working_dirs[pv.target]

            # dirty way of getting the adios xml filename of the rc
            # that is represented by pv.target
            rc_adios_xml = self._get_rc_adios_xml_filename(run, pv.target)
            xml_filepath = os.path.join(working_dir,
                                        os.path.basename(rc_adios_xml))

            # Check if this is adios1 or adios2
            adios_version = get_adios_version(rc_adios_xml)

            if adios_version == 1:
                if pv.param_type == "adios_transform":
                    adios_params.adios_xml_transform(
                        xml_filepath, pv.group_name, pv.var_name, pv.value)
                elif pv.param_type == "adios_transport":
                    # value could be
                    # "MPI_AGGREGATE:num_aggregators=64;num_osts"
                    # extract the method name and the method options
                    method_name = pv.value
                    method_opts = ""
                    if ":" in pv.value:
                        value_tokens = pv.value.split(":", 1)
                        method_name = value_tokens[0]
                        method_opts = value_tokens[1]
                    adios_params.adios_xml_transport(
                        xml_filepath, pv.group_name, method_name,
                        method_opts)
                else:
                    raise exc.CheetahException("Unrecognized adios param")
            else:  # adios version == 2
                operation_value = list(pv.value.keys())[0]
                if pv.operation_name in ('engine', 'transport'):
                    parameters = list(pv.value.values())[0]
                    if pv.operation_name == 'engine':
                        adios2.set_engine(xml_filepath, pv.io_name,
                                          operation_value, parameters)
                    else:
                        adios2.set_transport(xml_filepath, pv.io_name,
                                             operation_value, parameters)
                else:  # operation_name == 'var_operation'
                    # pv.value is nested as
                    # {var_name: {operation: {param: value, ...}}}
                    var_name = list(pv.value.keys())[0]
                    var_name_dict = pv.value[var_name]
                    var_operation_value = list(var_name_dict.keys())[0]
                    var_op_dict = var_name_dict[var_operation_value]
                    parameters = var_op_dict
                    adios2.set_var_operation(xml_filepath, pv.io_name,
                                             var_name, var_operation_value,
                                             parameters)

        # Calculate the no. of nodes required by this run
        if run.total_nodes > min_nodes:
            min_nodes = run.total_nodes

        # Generic config file support. Note: slurps entire config file
        # into memory; requires adding the file to the campaign 'inputs'
        # option.
        config_params = \
            run.instance.get_parameter_values_by_type(ParamConfig)
        for pv in config_params:
            working_dir = working_dirs[pv.target]
            src_filepath = relative_or_absolute_path(app_dir,
                                                     pv.config_filename)
            # Allow for relative pathnames in the spec
            src_filename = pv.config_filename
            if pv.config_filename[0] == '/':
                src_filename = os.path.basename(src_filepath)
            config_filepath = os.path.join(working_dir, src_filename)
            if not os.path.isfile(config_filepath):
                copy_to_path(src_filepath, config_filepath)

            lines = []
            # read and modify lines
            # hack: handle json files. currently works only on singly
            # nested json files
            if config_filepath.endswith(".json"):
                json_config_set_option(config_filepath, pv.match_string,
                                       pv.value)
            else:  # handle other file types
                with open(config_filepath) as config_f:
                    for line in config_f:
                        line = line.replace(pv.match_string, pv.value)
                        lines.append(line)
                # rewrite file with modified lines
                with open(config_filepath, 'w') as config_f:
                    config_f.write("".join(lines))

        # Key-value config file support. Note: slurps entire config file
        # into memory; requires adding the file to the campaign 'inputs'
        # option.
        kv_params = \
            run.instance.get_parameter_values_by_type(ParamKeyValue)
        for pv in kv_params:
            working_dir = working_dirs[pv.target]
            src_filepath = relative_or_absolute_path(app_dir,
                                                     pv.config_filename)
            # Allow for relative pathnames in the spec
            src_filename = pv.config_filename
            if pv.config_filename[0] == '/':
                src_filename = os.path.basename(src_filepath)
            kv_filepath = os.path.join(working_dir, src_filename)
            if not os.path.isfile(kv_filepath):
                copy_to_path(src_filepath, kv_filepath)

            lines = []
            # read and modify lines
            key_found = False
            with open(kv_filepath) as kv_f:
                for line in kv_f:
                    parts = line.split('=', 1)
                    if len(parts) == 2:
                        k = parts[0].strip()
                        if k == pv.key_name:
                            # assume all k=v type formats will
                            # support no spaces around equals
                            line = k + '=' + str(pv.value)
                            # preserve a user comment if it exists
                            if '!' in parts[1]:
                                line = line + " !" + \
                                    parts[1].strip().split('!')[1]
                            line = line + '\n'
                            key_found = True
                    lines.append(line)
            assert key_found, \
                "Issue parsing a ParamKeyValue: could not find key {} " \
                "in config file {}".format(pv.key_name, src_filepath)
            # rewrite file with modified lines
            with open(kv_filepath, 'w') as kv_f:
                kv_f.write("".join(lines))

        # Env var parameter values
        env_params = run.instance.get_parameter_values_by_type(ParamEnvVar)
        for pv in env_params:
            rc = run._get_rc_by_name(pv.target)
            rc.env[pv.option] = str(pv.value)

        # save code commands as text
        params_path_txt = os.path.join(run.run_path,
                                       self.run_command_name)
        with open(params_path_txt, 'w') as params_f:
            for rc in run.run_components:
                params_f.write(' '.join(map(shlex.quote,
                                            [rc.exe] + rc.args)))
                params_f.write('\n')

        # save params as JSON for use in post-processing; more useful
        # for post-processing scripts than the command text
        params_path_json = os.path.join(run.run_path,
                                        self.run_json_name)
        run_data = run.get_app_param_dict()
        with open(params_path_json, 'w') as params_f:
            json.dump(run_data, params_f, indent=2)

        fob_runs = []
        for j, rc in enumerate(run.run_components):
            if timeout is not None:
                rc.timeout = parse_timedelta_seconds(timeout)
            fob_runs.append(rc.as_fob_data())

        fob = dict(id=run.run_id,
                   launch_mode=launch_mode,
                   runs=fob_runs,
                   working_dir=run.run_path,
                   kill_on_partial_failure=kill_on_partial_failure,
                   post_process_script=run_post_process_script,
                   post_process_stop_on_failure=
                   run_post_process_stop_on_failure,
                   post_process_args=[params_path_json],
                   node_layout=run.node_layout.serialize_to_dict(),
                   total_nodes=run.total_nodes,
                   machine_name=machine.name,
                   tau_profiling=tau_profiling,
                   tau_tracing=tau_tracing)
        fob_list.append(fob)

        # write the fob to a file in the run dir
        run_fob_path = os.path.join(run.run_path,
                                    "codar.cheetah.fob.json")
        with open(run_fob_path, "w") as runf:
            runf.write(json.dumps(fob, sort_keys=True, indent=4))
            runf.write("\n")

        if run_dir_setup_script is not None:
            self._execute_run_dir_setup_script(run.run_path,
                                               run_dir_setup_script)

        # Get the size of the run dir. This should be the last step
        # in the creation of the run dir.
        self._get_pre_submit_dir_size(run)

    # Write fob_list to the group-level json file
    f.write(json.dumps(fob_list, sort_keys=True, indent=4))
    f.close()

    if nodes is None:
        nodes = min_nodes
    elif nodes < min_nodes:
        raise exc.CheetahException(
            "nodes for group is too low, need at least %d, got %d"
            % (min_nodes, nodes))

    # TODO: what case does this handle? should have a test case for it.
    if machine.node_exclusive:
        group_ppn = machine.processes_per_node
    else:
        group_ppn = math.ceil(max_nprocs / nodes)

    env_path = os.path.join(self.output_directory, 'group-env.sh')
    group_env = templates.GROUP_ENV_TEMPLATE.format(
        walltime=parse_timedelta_seconds(walltime),
        max_procs=max_nprocs,
        processes_per_node=group_ppn,
        nodes=nodes,
        node_exclusive=node_exclusive,
        account=scheduler_options.get('project', ''),
        queue=scheduler_options.get('queue', ''),
        reservation=scheduler_options.get('reservation', ''),
        # TODO: require name be valid for all schedulers
        campaign_name='codar.cheetah.' + campaign_name,
        group_name=group_name,
        constraint=scheduler_options.get('constraint', ''),
        license=scheduler_options.get('license', ''),
        machine_name=machine.name
    )
    with open(env_path, 'w') as f:
        f.write(group_env)

    return nodes

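# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the shape of the
# per-run "fob" record assembled above, written to codar.cheetah.fob.json in
# each run dir and collected into the group-level fobs.json. All field
# values below are hypothetical examples; the keys come from the dict()
# call above.
#
# {
#     "id": "run-0.iteration-0",
#     "launch_mode": "default",
#     "runs": [ ...one entry per run component, from rc.as_fob_data()... ],
#     "working_dir": "<campaign>/<user>/<group>/run-0.iteration-0",
#     "kill_on_partial_failure": false,
#     "post_process_script": null,
#     "post_process_stop_on_failure": false,
#     "post_process_args": ["<run dir>/<run_json_name>"],
#     "node_layout": { ...from run.node_layout.serialize_to_dict()... },
#     "total_nodes": 2,
#     "machine_name": "summit",
#     "tau_profiling": false,
#     "tau_tracing": false
# }
# ---------------------------------------------------------------------------
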
def make_experiment_run_dir(self, output_dir, _check_code_paths=True):
    """Produce scripts and directory structure for running the experiment.

    The directory structure is a subdirectory for each scheduler group,
    and within each scheduler group directory, a subdirectory for each
    run."""

    # set to False for unit tests
    if _check_code_paths:
        self._check_code_paths()

    if self.umask:
        umask_int = int(self.umask, 8)
        if (umask_int & stat.S_IXUSR) or (umask_int & stat.S_IRUSR):
            raise exc.CheetahException(
                'bad umask, user r-x must be allowed')
        os.umask(umask_int)

    # Get the sweep groups for this machine
    if type(self.sweeps) == dict:
        _sweeps_this_mc = self.sweeps.get(self.machine.name, None) or []
        _sweeps_any_mc = self.sweeps.get(sweeps_any_machine, None) or []
        self.sweeps = []
        self.sweeps.extend(_sweeps_this_mc)
        self.sweeps.extend(_sweeps_any_mc)
        assert len(self.sweeps) > 0, "No sweep groups found."

    # Create the top-level campaign directory
    _output_dir = os.path.abspath(output_dir)
    os.makedirs(_output_dir, exist_ok=True)

    # Write the campaign id file at the top-level campaign directory
    id_fpath = os.path.join(_output_dir, self._id_file)
    Path(id_fpath).touch()

    # Create a directory for the user and set it as the campaign location
    output_dir = os.path.join(_output_dir, getpass.getuser())
    run_all_script = os.path.join(config.CHEETAH_PATH_SCHEDULER,
                                  self.machine.scheduler_name,
                                  'run-all.sh')
    os.makedirs(output_dir, exist_ok=True)

    # Check if campaign dir already has groups with the same name
    self._assert_unique_group_names(output_dir)

    # Create run script and campaign environment info file
    copy_to_dir(run_all_script, output_dir)

    campaign_env = templates.CAMPAIGN_ENV_TEMPLATE.format(
        experiment_dir=output_dir,
        machine_config=config.machine_submit_env_path(self.machine.name),
        app_config=self.machine_app_config_script or "",
        workflow_script_path=config.WORKFLOW_SCRIPT,
        workflow_runner=self.machine.runner_name,
        workflow_debug_level="DEBUG",
        umask=(self.umask or ""),
        codar_python=self.python_path,
    )
    campaign_env_path = os.path.join(output_dir, 'campaign-env.sh')
    with open(campaign_env_path, 'w') as f:
        f.write(campaign_env)

    # Traverse the sweep groups
    for group_i, group in enumerate(self.sweeps):
        # Validate component inputs:
        # ensure all keys are valid code names
        code_names = list(self.codes.keys())
        if group.component_inputs is not None:
            c_input_keys = list(group.component_inputs.keys())
            for key in c_input_keys:
                assert key in code_names, \
                    "Error in component_inputs for {}. '{}' is not a " \
                    "valid code name".format(group.name, key)

        # each scheduler group gets its own subdir
        # TODO: support alternate template for dirs?
        group_name = group.name
        group_output_dir = os.path.join(output_dir, group_name)
        launcher = machine_launchers.get_launcher(self.machine,
                                                  group_output_dir,
                                                  len(self.codes))
        group_runs = []
        for repeat_index in range(0, group.run_repetitions + 1):
            group_run_offset = 0
            for sweep in group.parameter_groups:
                # node layout is a map of machine name to the layout for
                # that machine. If unspecified, or if this machine is not
                # in the map, use the default layout.
                if sweep.node_layout is None:
                    node_layout = None
                else:
                    node_layout = sweep.node_layout.get(self.machine.name)

                # Summit requires a node layout
                if self.machine.name.lower() == "summit":
                    assert node_layout is not None, \
                        "Must provide a node layout for a Sweep on Summit"

                if node_layout is None:
                    node_layout = NodeLayout.default_no_share_layout(
                        self.machine.processes_per_node,
                        self.codes.keys())
                else:
                    node_layout = NodeLayout(node_layout)
                    # TODO: validate node layout against machine model

                sweep_runs = [
                    Run(inst, self.codes, self.app_dir,
                        os.path.join(
                            group_output_dir,
                            'run-{}.iteration-{}'.format(
                                group_run_offset + i, repeat_index)),
                        self.inputs,
                        self.machine,
                        node_layout,
                        sweep.rc_dependency,
                        group.component_subdirs,
                        group.sosflow_profiling,
                        group.sosflow_analysis,
                        group.component_inputs)
                    for i, inst in enumerate(sweep.get_instances())
                ]

                # we don't support mpmd mode with dependencies
                try:
                    if group.launch_mode.lower() == 'mpmd':
                        assert sweep.rc_dependency is None, \
                            "Dependencies in MPMD mode not supported"
                except AttributeError:
                    pass

                # we don't support mpmd on deepthought2
                try:
                    if self.machine.machine_name.lower() == 'deepthought2':
                        assert group.launch_mode.lower() != 'mpmd', \
                            "mpmd mode not implemented for deepthought2"
                except AttributeError:
                    pass

                group_runs.extend(sweep_runs)
                group_run_offset += len(sweep_runs)
        self.runs.extend(group_runs)

        if group.max_procs is None:
            max_procs = max([r.get_total_nprocs() for r in group_runs])
        else:
            procs_per_run = max([r.get_total_nprocs()
                                 for r in group_runs])
            if group.max_procs < procs_per_run:
                # TODO: improve error message, specifying which
                # group and by how much it's off etc
                raise exc.CheetahException(
                    "max_procs for group is too low")
            max_procs = group.max_procs

        if group.per_run_timeout:
            per_run_seconds = parse_timedelta_seconds(
                group.per_run_timeout)
            walltime_guess = (per_run_seconds * len(group_runs)) + 60
            walltime_group = parse_timedelta_seconds(group.walltime)
            if walltime_group < walltime_guess:
                warnings.warn('group "%s" walltime %d is less than '
                              '(per_run_timeout * nruns) + 60 = %d; '
                              'it is recommended to set it higher to '
                              'avoid problems with the workflow engine '
                              'being killed before it can write all '
                              'status information'
                              % (group.name, walltime_group,
                                 walltime_guess))

        # TODO: refactor so we can just pass the campaign and group
        # objects, i.e. add methods so the launcher can get all the info
        # it needs, and simplify this loop.
        group.nodes = launcher.create_group_directory(
            self.name, self.app_dir, group_name,
            group_runs,
            max_procs,
            nodes=group.nodes,
            launch_mode=group.launch_mode,
            component_subdirs=group.component_subdirs,
            walltime=group.walltime,
            timeout=group.per_run_timeout,
            node_exclusive=self.machine.node_exclusive,
            tau_profiling=group.tau_profiling,
            tau_tracing=group.tau_tracing,
            machine=self.machine,
            sosd_path=self.sosd_path,
            sos_analysis_path=self.sos_analysis_path,
            kill_on_partial_failure=self.kill_on_partial_failure,
            run_post_process_script=self.run_post_process_script,
            run_post_process_stop_on_failure=
            self.run_post_process_stop_group_on_failure,
            scheduler_options=self.machine_scheduler_options,
            run_dir_setup_script=self.run_dir_setup_script)

    # TODO: track directories and ids and add to this file
    all_params_json_path = os.path.join(output_dir, "params.json")
    with open(all_params_json_path, "w") as f:
        json.dump([run.get_app_param_dict() for run in self.runs],
                  f, indent=2)

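# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the walltime sanity
# check performed above, extracted as a standalone predicate. The helper
# name and standalone form are hypothetical; the heuristic (walltime should
# be at least per_run_timeout * nruns plus 60 s of slack so the workflow
# engine can write its status information) is the one used above.
def _walltime_too_low(walltime_seconds, per_run_timeout_seconds, nruns,
                      slack_seconds=60):
    walltime_guess = per_run_timeout_seconds * nruns + slack_seconds
    return walltime_seconds < walltime_guess

# Example: 10 runs with a 300 s per-run timeout need at least
# 300 * 10 + 60 = 3060 s, so _walltime_too_low(3000, 300, 10) is True.
# ---------------------------------------------------------------------------
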
def create_group_directory(self, campaign_name, group_name, runs,
                           max_nprocs, nodes, component_subdirs, walltime,
                           node_exclusive, timeout, machine,
                           sosd_path=None,
                           sos_analysis_path=None,
                           tau_config=None,
                           kill_on_partial_failure=False,
                           run_post_process_script=None,
                           run_post_process_stop_on_failure=False,
                           scheduler_options=None,
                           run_dir_setup_script=None):
    """Copy scripts for the appropriate scheduler to the group directory,
    and write environment configuration. Returns the required number of
    nodes, which is calculated if the passed nodes is None."""
    script_dir = os.path.join(config.CHEETAH_PATH_SCRIPTS,
                              self.scheduler_name, 'group')
    if not os.path.isdir(script_dir):
        raise ValueError("scheduler '%s' is not yet supported"
                         % self.scheduler_name)
    if scheduler_options is None:
        scheduler_options = {}
    copytree_to_dir(script_dir, self.output_directory)
    fobs_path = os.path.join(self.output_directory, 'fobs.json')
    min_nodes = 1
    with open(fobs_path, 'w') as f:
        for i, run in enumerate(runs):
            # TODO: abstract this to higher levels
            os.makedirs(run.run_path, exist_ok=True)

            # Create working dir for each component
            for rc in run.run_components:
                os.makedirs(rc.working_dir, exist_ok=True)

            if run.sosflow_profiling:
                run.insert_sosflow(sosd_path, sos_analysis_path,
                                   run.run_path,
                                   machine.processes_per_node)

            if tau_config is not None:
                copy_to_dir(tau_config, run.run_path)

            # Copy the global input files common to all components
            for input_rpath in run.inputs:
                copy_to_dir(input_rpath, run.run_path)

            # Copy input files requested by each component,
            # and save working dirs for later use
            working_dirs = {}  # map component name to path
            for rc in run.run_components:
                working_dirs[rc.name] = rc.working_dir

                # if rc has an adios xml file, copy it to working dir
                if rc.adios_xml_file:
                    copy_to_dir(rc.adios_xml_file, rc.working_dir)

                # now copy other inputs marked under component_inputs
                if rc.component_inputs is not None:
                    for input_file in rc.component_inputs:
                        # input type is a symlink
                        if type(input_file) == SymLink:
                            dest = os.path.join(
                                rc.working_dir,
                                os.path.basename(input_file))
                            os.symlink(input_file, dest)
                        # input type is a regular file
                        else:
                            copy_to_dir(input_file, rc.working_dir)

            # ADIOS XML param support
            adios_xml_params = \
                run.instance.get_parameter_values_by_type(ParamAdiosXML)
            for pv in adios_xml_params:
                working_dir = working_dirs[pv.target]
                # dirty way of getting the adios xml filename of the rc
                # that is represented by pv.target
                rc_adios_xml = self._get_rc_adios_xml_filename(
                    run, pv.target)
                xml_filepath = os.path.join(
                    working_dir, os.path.basename(rc_adios_xml))
                if pv.param_type == "adios_transform":
                    adios_params.adios_xml_transform(
                        xml_filepath, pv.group_name, pv.var_name, pv.value)
                elif pv.param_type == "adios_transport":
                    # value could be
                    # "MPI_AGGREGATE:num_aggregators=64;num_osts"
                    # extract the method name and the method options
                    method_name = pv.value
                    method_opts = ""
                    if ":" in pv.value:
                        value_tokens = pv.value.split(":", 1)
                        method_name = value_tokens[0]
                        method_opts = value_tokens[1]
                    adios_params.adios_xml_transport(
                        xml_filepath, pv.group_name, method_name,
                        method_opts)
                else:
                    raise exc.CheetahException("Unrecognized adios param")

            # Insert dataspaces server instances if RCs will couple
            # using dataspaces. This must be called after the ADIOS
            # params are parsed and the final ADIOS XML is generated.
            run.add_dataspaces_support(machine)

            # Calculate the no. of nodes required by this run.
            # This must be done after dataspaces support is added.
            if run.get_total_nodes() > min_nodes:
                min_nodes = run.get_total_nodes()

            # Generic config file support. Note: slurps entire config
            # file into memory; requires adding the file to the campaign
            # 'inputs' option.
            config_params = \
                run.instance.get_parameter_values_by_type(ParamConfig)
            for pv in config_params:
                working_dir = working_dirs[pv.target]
                config_filepath = os.path.join(working_dir,
                                               pv.config_filename)
                lines = []
                # read and modify lines
                with open(config_filepath) as config_f:
                    for line in config_f:
                        line = line.replace(pv.match_string, pv.value)
                        lines.append(line)
                # rewrite file with modified lines
                with open(config_filepath, 'w') as config_f:
                    config_f.write("".join(lines))

            # Key-value config file support. Note: slurps entire config
            # file into memory; requires adding the file to the campaign
            # 'inputs' option.
            kv_params = \
                run.instance.get_parameter_values_by_type(ParamKeyValue)
            for pv in kv_params:
                working_dir = working_dirs[pv.target]
                kv_filepath = os.path.join(working_dir,
                                           pv.config_filename)
                lines = []
                # read and modify lines
                with open(kv_filepath) as kv_f:
                    for line in kv_f:
                        parts = line.split('=', 1)
                        if len(parts) == 2:
                            k = parts[0].strip()
                            if k == pv.key_name:
                                # assume all k=v type formats will
                                # support no spaces around equals
                                line = k + '=' + str(pv.value) + '\n'
                        lines.append(line)
                # rewrite file with modified lines
                with open(kv_filepath, 'w') as kv_f:
                    kv_f.write("".join(lines))

            # save code commands as text
            params_path_txt = os.path.join(run.run_path,
                                           self.run_command_name)
            with open(params_path_txt, 'w') as params_f:
                for rc in run.run_components:
                    params_f.write(' '.join(map(shlex.quote,
                                                [rc.exe] + rc.args)))
                    params_f.write('\n')

            # save params as JSON for use in post-processing; more useful
            # for post-processing scripts than the command text
            params_path_json = os.path.join(run.run_path,
                                            self.run_json_name)
            run_data = run.get_app_param_dict()
            with open(params_path_json, 'w') as params_f:
                json.dump(run_data, params_f, indent=2)

            fob_runs = []
            for j, rc in enumerate(run.run_components):
                tau_profile_dir = os.path.join(
                    run.run_path,
                    TAU_PROFILE_PATTERN.format(code=rc.name))
                os.makedirs(tau_profile_dir)
                rc.env["PROFILEDIR"] = tau_profile_dir
                rc.env["TRACEDIR"] = tau_profile_dir
                if timeout is not None:
                    rc.timeout = parse_timedelta_seconds(timeout)
                fob_runs.append(rc.as_fob_data())

            fob = dict(id=run.run_id,
                       runs=fob_runs,
                       working_dir=run.run_path,
                       kill_on_partial_failure=kill_on_partial_failure,
                       post_process_script=run_post_process_script,
                       post_process_stop_on_failure=
                       run_post_process_stop_on_failure,
                       post_process_args=[params_path_json],
                       node_layout=run.node_layout.as_data_list())
            fob_s = json.dumps(fob)

            # write the fob to a file in the run dir
            run_fob_path = os.path.join(run.run_path,
                                        "codar.cheetah.fob.json")
            with open(run_fob_path, "w") as runf:
                runf.write(fob_s)
                runf.write("\n")

            if run_dir_setup_script is not None:
                self._execute_run_dir_setup_script(run.run_path,
                                                   run_dir_setup_script)

            # append to the fob list file in the group dir,
            # one JSON object per line
            f.write(fob_s)
            f.write("\n")

            # Get the size of the run dir. This should be the last step
            # in the creation of the run dir.
            self._get_pre_submit_dir_size(run)

    if nodes is None:
        nodes = min_nodes
    elif nodes < min_nodes:
        raise exc.CheetahException(
            "nodes for group is too low, need at least %d, got %d"
            % (min_nodes, nodes))

    # TODO: what case does this handle? should have a test case for it.
    if machine.node_exclusive:
        group_ppn = machine.processes_per_node
    else:
        group_ppn = math.ceil(max_nprocs / nodes)

    env_path = os.path.join(self.output_directory, 'group-env.sh')
    group_env = templates.GROUP_ENV_TEMPLATE.format(
        walltime=parse_timedelta_seconds(walltime),
        max_procs=max_nprocs,
        processes_per_node=group_ppn,
        nodes=nodes,
        node_exclusive=node_exclusive,
        account=scheduler_options.get('project', ''),
        queue=scheduler_options.get('queue', ''),
        # TODO: require name be valid for all schedulers
        campaign_name='codar.cheetah.' + campaign_name,
        group_name=group_name,
        constraint=scheduler_options.get('constraint', ''),
        license=scheduler_options.get('license', ''))
    with open(env_path, 'w') as f:
        f.write(group_env)

    return nodes
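
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the ParamKeyValue
# rewrite used in this module, extracted as a standalone, testable helper.
# The function name is hypothetical; the behavior mirrors the newer loop
# above (rewrite `key=value`, tolerate no spaces around '=', preserve a
# trailing '!' comment, and fail if the key is never seen).
def _rewrite_key_value(text, key, value):
    out = []
    key_found = False
    for line in text.splitlines(keepends=True):
        parts = line.split('=', 1)
        if len(parts) == 2 and parts[0].strip() == key:
            line = key + '=' + str(value)
            # preserve a user comment if it exists
            if '!' in parts[1]:
                line += " !" + parts[1].strip().split('!')[1]
            line += '\n'
            key_found = True
        out.append(line)
    assert key_found, "Could not find key {}".format(key)
    return "".join(out)

# Example: _rewrite_key_value("nx=10 ! grid points\n", "nx", 64)
# returns "nx=64 ! grid points\n".
# ---------------------------------------------------------------------------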