Example #1
def _check_code_paths(self):
    if not os.path.isdir(self.app_dir):
        raise exc.CheetahException(
            'specified app directory "%s" does not exist' % self.app_dir)
    for code_name, code in self.codes.items():
        exe_path = code['exe']
        if not os.path.isfile(exe_path):
            raise exc.CheetahException(
                'code "%s" exe at "%s" is not a file' %
                (code_name, exe_path))
        if not os.access(exe_path, os.X_OK):
            raise exc.CheetahException(
                'code "%s" exe at "%s" is not executable by current user' %
                (code_name, exe_path))
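A minimal standalone sketch of the same validation, assuming a plain dict of code definitions; check_exe_paths and ValueError are illustrative stand-ins for Cheetah's codes mapping and exc.CheetahException, not Cheetah API.

import os

# Standalone version of the check above: verify that every configured exe
# exists and is executable by the current user.
def check_exe_paths(codes):
    for code_name, code in codes.items():
        exe_path = code['exe']
        if not os.path.isfile(exe_path):
            raise ValueError('code "%s" exe at "%s" is not a file'
                             % (code_name, exe_path))
        if not os.access(exe_path, os.X_OK):
            raise ValueError('code "%s" exe at "%s" is not executable '
                             'by current user' % (code_name, exe_path))

# check_exe_paths({'simulation': {'exe': '/bin/ls'}})  # passes on most systems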
Example #2
def _get_machine(self, machine_name):
    machine = None
    for m in self.supported_machines:
        if m == machine_name:
            machine = machines.get_by_name(m)
    if machine is None:
        raise exc.CheetahException(
            "machine '%s' not supported by experiment '%s'" %
            (machine_name, self.name))
    return machine
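The same lookup pattern, sketched standalone with an early return; the names here ('supported', 'lookup', find_supported) are hypothetical stand-ins for self.supported_machines and machines.get_by_name, not Cheetah API.

def find_supported(name, supported, lookup):
    for candidate in supported:
        if candidate == name:
            # Return on the first match instead of finishing the loop.
            return lookup(candidate)
    raise ValueError("machine '%s' not supported" % name)

# find_supported('summit', ['summit', 'theta'], str.upper)  # -> 'SUMMIT'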
Example #3
    def _get_rc_adios_xml_filename(self, run, rc_name):
        adios_xml_file = None
        for rc in run.run_components:
            if rc_name == rc.name:
                adios_xml_file = rc.adios_xml_file

        if adios_xml_file is None:
            raise exc.CheetahException("An ADIOS XML file was not found "
                                       "for {}. Set the adios_xml_file "
                                       "option for the component in "
                                       "codes.".format(rc_name))
        return adios_xml_file
Example #4
def require_campaign_directory(path):
    """Raise CheetahException if the specified path is not a top-level
    campaign directory."""
    if not is_campaign_directory(path):
        raise exc.CheetahException("Path '%s' is not a " \
                                   "top-level campaign directory" % path)
Example #5
    def make_experiment_run_dir(self, output_dir, _check_code_paths=True):
        """Produce scripts and directory structure for running the experiment.

        Directory structure will be a subdirectory for each scheduler group,
        and within each scheduler group directory, a subdirectory for each
        run."""

        # set to False for unit tests
        if _check_code_paths:
            self._check_code_paths()

        if self.umask:
            umask_int = int(self.umask, 8)
            if ((umask_int & stat.S_IXUSR) or (umask_int & stat.S_IRUSR)):
                raise exc.CheetahException(
                    'bad umask, user r-x must be allowed')
            os.umask(umask_int)

        # Get the sweep groups for this machine
        if type(self.sweeps) == dict:
            _sweeps_this_mc = self.sweeps.get(self.machine.name, None) or []
            _sweeps_any_mc = self.sweeps.get(sweeps_any_machine, None) or []

            self.sweeps = []
            self.sweeps.extend(_sweeps_this_mc)
            self.sweeps.extend(_sweeps_any_mc)

            assert len(self.sweeps) > 0, "No sweep groups found."

        # Create the top level campaign directory
        _output_dir = os.path.abspath(output_dir)
        os.makedirs(_output_dir, exist_ok=True)

        # Write campaign id file at the top-level campaign directory
        id_fpath = os.path.join(_output_dir, self._id_file)
        Path(id_fpath).touch()

        # Create a directory for the user and set it as the campaign location
        output_dir = os.path.join(_output_dir, getpass.getuser())
        run_all_script = os.path.join(config.CHEETAH_PATH_SCHEDULER,
                                      self.machine.scheduler_name,
                                      'run-all.sh')
        os.makedirs(output_dir, exist_ok=True)

        # Check if campaign dir already has groups with the same name
        self._assert_unique_group_names(output_dir)

        # Create run script and campaign environment info file
        copy_to_dir(run_all_script, output_dir)

        campaign_env = templates.CAMPAIGN_ENV_TEMPLATE.format(
            experiment_dir=output_dir,
            machine_config=config.machine_submit_env_path(self.machine.name),
            app_config=self.machine_app_config_script or "",
            workflow_script_path=config.WORKFLOW_SCRIPT,
            workflow_runner=self.machine.runner_name,
            workflow_debug_level="DEBUG",
            umask=(self.umask or ""),
            codar_python=self.python_path,
        )
        campaign_env_path = os.path.join(output_dir, 'campaign-env.sh')
        with open(campaign_env_path, 'w') as f:
            f.write(campaign_env)

        # Traverse through sweep groups
        for group_i, group in enumerate(self.sweeps):
            # Validate component inputs.
            #   1. Ensure all keys are valid code names
            code_names = list(self.codes.keys())
            if group.component_inputs is not None:
                c_input_keys = list(group.component_inputs.keys())
                for key in c_input_keys:
                    assert key in code_names, \
                        "Error in component_inputs for {}. '{}' not a valid " \
                        "code name".format(group.name, key)

            # each scheduler group gets its own subdir
            # TODO: support alternate template for dirs?
            group_name = group.name
            group_output_dir = os.path.join(output_dir, group_name)
            launcher = machine_launchers.get_launcher(self.machine,
                                                      group_output_dir,
                                                      len(self.codes))
            group_runs = []
            for repeat_index in range(0, group.run_repetitions + 1):
                group_run_offset = 0
                for sweep in group.parameter_groups:
                    # node layout is a map of machine names to the layout for
                    # each machine. If unspecified, or if this machine is not
                    # in the map, use the default.
                    if sweep.node_layout is None:
                        node_layout = None
                    else:
                        node_layout = sweep.node_layout.get(self.machine.name)

                    # Summit requires a node layout
                    if self.machine.name.lower() == "summit":
                        assert node_layout is not None, \
                            "Must provide a node layout for a Sweep on Summit"

                    if node_layout is None:
                        node_layout = NodeLayout.default_no_share_layout(
                            self.machine.processes_per_node, self.codes.keys())
                    else:
                        node_layout = NodeLayout(node_layout)

                    # TODO: validate node layout against machine model

                    sweep_runs = [
                        Run(
                            inst, self.codes, self.app_dir,
                            os.path.join(
                                group_output_dir, 'run-{}.iteration-{}'.format(
                                    group_run_offset + i,
                                    repeat_index)), self.inputs, self.machine,
                            node_layout, sweep.rc_dependency,
                            group.component_subdirs, group.sosflow_profiling,
                            group.sosflow_analysis, group.component_inputs)
                        for i, inst in enumerate(sweep.get_instances())
                    ]

                    # we don't support MPMD mode with dependencies
                    try:
                        if group.launch_mode.lower() == 'mpmd':
                            assert sweep.rc_dependency is None, \
                                "Dependencies in MPMD mode not supported"
                    except AttributeError:
                        pass

                    # we don't support MPMD on deepthought2
                    try:
                        if self.machine.machine_name.lower() == 'deepthought2':
                            assert group.launch_mode.lower() != 'mpmd', \
                                "mpmd mode not implemented for deepthought2"
                    except AttributeError:
                        pass

                    group_runs.extend(sweep_runs)
                    group_run_offset += len(sweep_runs)
            self.runs.extend(group_runs)

            if group.max_procs is None:
                max_procs = max([r.get_total_nprocs() for r in group_runs])
            else:
                procs_per_run = max([r.get_total_nprocs() for r in group_runs])
                if group.max_procs < procs_per_run:
                    # TODO: improve error message, specifying which
                    # group and by how much it's off etc
                    raise exc.CheetahException(
                        "max_procs for group is too low")
                max_procs = group.max_procs

            if group.per_run_timeout:
                per_run_seconds = parse_timedelta_seconds(
                    group.per_run_timeout)
                walltime_guess = (per_run_seconds * len(group_runs)) + 60
                walltime_group = parse_timedelta_seconds(group.walltime)
                if walltime_group < walltime_guess:
                    warnings.warn('group "%s" walltime %d is less than '
                                  '(per_run_timeout * nruns) + 60 = %d, '
                                  'it is recommended to set it higher to '
                                  'avoid problems with the workflow '
                                  'engine being killed before it can write '
                                  'all status information' %
                                  (group.name, walltime_group, walltime_guess))

            # TODO: refactor so we can just pass the campaign and group
            # objects, i.e. add methods so launcher can get all info it needs
            # and simplify this loop.
            group.nodes = launcher.create_group_directory(
                self.name,
                self.app_dir,
                group_name,
                group_runs,
                max_procs,
                nodes=group.nodes,
                launch_mode=group.launch_mode,
                component_subdirs=group.component_subdirs,
                walltime=group.walltime,
                timeout=group.per_run_timeout,
                node_exclusive=self.machine.node_exclusive,
                tau_profiling=group.tau_profiling,
                tau_tracing=group.tau_tracing,
                machine=self.machine,
                sosd_path=self.sosd_path,
                sos_analysis_path=self.sos_analysis_path,
                kill_on_partial_failure=self.kill_on_partial_failure,
                run_post_process_script=self.run_post_process_script,
                run_post_process_stop_on_failure=(
                    self.run_post_process_stop_group_on_failure),
                scheduler_options=self.machine_scheduler_options,
                run_dir_setup_script=self.run_dir_setup_script)

        # TODO: track directories and ids and add to this file
        all_params_json_path = os.path.join(output_dir, "params.json")
        with open(all_params_json_path, "w") as f:
            json.dump([run.get_app_param_dict() for run in self.runs],
                      f,
                      indent=2)
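A small standalone sketch of the umask check at the top of this method: the string is parsed as octal and rejected if it would strip the owner's read or execute permission. validate_umask and ValueError are illustrative stand-ins, not Cheetah API.

import stat

def validate_umask(umask_str):
    # Parse the octal string, e.g. "027" -> 0o027.
    umask_int = int(umask_str, 8)
    # A set bit in the umask removes that permission, so the owner's read
    # and execute bits must not appear in the mask.
    if (umask_int & stat.S_IXUSR) or (umask_int & stat.S_IRUSR):
        raise ValueError('bad umask, user r-x must be allowed')
    return umask_int

# validate_umask("027")  # ok, returns 23 (0o027)
# validate_umask("177")  # raises: would mask the owner's execute bit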
Example #6
    def __init__(self, machine_name, app_dir):
        # check that subclasses set configuration
        # TODO: better errors
        # TODO: are class variables the best way to model this?
        assert self.name is not None
        assert len(self.codes) > 0
        assert len(self.supported_machines) > 0
        assert len(self.sweeps) > 0
        self.machine = self._get_machine(machine_name)
        self.app_dir = os.path.abspath(app_dir)
        self.runs = []

        # allow inputs to be either absolute paths or relative to
        # app_dir
        self.inputs = relative_or_absolute_path_list(self.app_dir, self.inputs)

        if not isinstance(self.codes, OrderedDict):
            self.codes = OrderedDict(self.codes)

        conflict_names = set(self.codes.keys()) & RESERVED_CODE_NAMES
        if conflict_names:
            raise exc.CheetahException(
                'Code names conflict with reserved names: ' +
                ", ".join(str(name) for name in conflict_names))

        # Resolve relative code exe paths. Checking for existence is not
        # done until make_experiment_run_dir is called to simplify unit
        # testing.
        for code_name, code in self.codes.items():
            exe_path = code['exe']
            if not exe_path.startswith('/'):
                exe_path = os.path.join(self.app_dir, exe_path)
                code['exe'] = exe_path

        if self.run_post_process_script is not None:
            self.run_post_process_script = self._experiment_relative_path(
                self.run_post_process_script)

        if self.sosd_path is None:
            self.sosd_path = os.path.join(self.app_dir, 'sosd')
        elif not self.sosd_path.startswith('/'):
            self.sosd_path = os.path.join(self.app_dir, self.sosd_path)

        if self.sos_analysis_path is None:
            self.sos_analysis_path = os.path.join(self.app_dir,
                                                  'sos_wrapper.sh')
        elif not self.sos_analysis_path.startswith('/'):
            self.sos_analysis_path = os.path.join(self.app_dir,
                                                  self.sos_analysis_path)

        o = self.scheduler_options.get(machine_name, {})
        # TODO: deeper validation with knowledge of scheduler
        self.machine_scheduler_options = self.machine.get_scheduler_options(o)

        if self.run_dir_setup_script is not None:
            self.run_dir_setup_script = self._experiment_relative_path(
                self.run_dir_setup_script)

        self.machine_app_config_script = None
        if self.app_config_scripts is not None:
            assert isinstance(self.app_config_scripts, dict)
            script = self.app_config_scripts.get(machine_name)
            if script is not None:
                self.machine_app_config_script = \
                    self._experiment_relative_path(script)
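A minimal sketch of the relative exe-path resolution applied to the codes above, assuming a plain dict of code definitions; resolve_exe_paths is an illustrative name, not Cheetah API.

import os

def resolve_exe_paths(app_dir, codes):
    # Leave absolute paths alone; join relative ones onto app_dir.
    for code in codes.values():
        exe_path = code['exe']
        if not exe_path.startswith('/'):
            code['exe'] = os.path.join(app_dir, exe_path)
    return codes

# resolve_exe_paths('/apps/demo', {'sim': {'exe': 'bin/sim'}})
# -> {'sim': {'exe': '/apps/demo/bin/sim'}}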
Example #7
    def _insert_dataspaces_rc(self, client_rcs, machine):
        """
        Add dataspaces support for this run.
        Creates a new RC with dataspaces server as the exe.
        :param client_rcs: Dict of sets of client RCs coupling using
        dataspaces or dimes
        :param machine: Current machine
        :return:
        """

        # Sanity check. rc list for coupling must have >1 RCs
        for transport_type in client_rcs:
            if len(client_rcs[transport_type]) == 1:
                raise exc.CheetahException("Atleast 2 codes needed for "
                                           "coupling with DATASPACES/DIMES. "
                                           "Found 1.")

        # Check that codes has dataspaces_server exe
        ds_server = None
        sleep_after = 0
        for code in self.codes:
            exe = self.codes[code]['exe']
            if 'dataspaces_server' in exe:
                ds_server = exe
                ds_rc_name = code
                sleep_after = self.codes[code].get('sleep_after', 0)

        if not ds_server:
            raise exc.CheetahException("Dataspaces server needs to be "
                                       "specified in codes")

        # Copy the configuration file dataspaces.conf
        ds_conf = os.path.join(self.codes_path, "dataspaces.conf")
        if not os.path.isfile(ds_conf):
            raise exc.CheetahException("Could not find dataspaces.conf in " +
                                       self.codes_path)
        dst = os.path.join(self.run_path, "dataspaces.conf")
        copy_to_path(ds_conf, dst)

        # Get the no. of dataspaces and dimes clients.
        # RCs that have both must be counted as dataspaces clients
        num_ds_clients = sum(rc.nprocs for rc in client_rcs['dataspaces'])
        unique_dimes_rcs = client_rcs['dimes'] - client_rcs['dataspaces']
        num_dimes_clients = sum(rc.nprocs for rc in unique_dimes_rcs)

        num_servers = config.get_dataspaces_num_servers(
            num_dimes_clients, num_ds_clients)
        assert num_servers > 0

        rc_name = "dataspaces_server"
        args = [
            '-s',
            str(num_servers), '-c',
            str(num_ds_clients + num_dimes_clients)
        ]

        # Get the node layout
        node_layout = None
        for d in self.node_layout.layout_list:
            if ds_rc_name == list(d.keys())[0]:
                node_layout = d[ds_rc_name]
        if node_layout is None:
            node_layout = machine.dataspaces_servers_per_node

        rc = RunComponent(rc_name,
                          ds_server,
                          args,
                          nprocs=num_servers,
                          sleep_after=sleep_after,
                          working_dir=self.run_path)

        self.node_layout.add_node({rc_name: node_layout})
        self.run_components.insert(0, rc)
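A small sketch of the client counting above: a component listed under both transports is counted once, as a DATASPACES client, and DIMES counts only the remainder. count_clients and the (name, nprocs) tuples are illustrative; the real code holds RunComponent objects.

def count_clients(client_rcs):
    # client_rcs maps transport name -> set of (component_name, nprocs) pairs.
    ds = client_rcs['dataspaces']
    dimes_only = client_rcs['dimes'] - ds
    num_ds = sum(nprocs for _, nprocs in ds)
    num_dimes = sum(nprocs for _, nprocs in dimes_only)
    return num_ds, num_dimes

# count_clients({'dataspaces': {('heat', 4)},
#                'dimes': {('heat', 4), ('viz', 2)}})  # -> (4, 2)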
Example #8
    def create_group_directory(self, campaign_name, app_dir, group_name, runs,
                               max_nprocs, nodes, launch_mode,
                               component_subdirs, walltime, node_exclusive,
                               timeout, machine,
                               sosd_path=None,
                               sos_analysis_path=None,
                               tau_profiling=False, tau_tracing=False,
                               kill_on_partial_failure=False,
                               run_post_process_script=None,
                               run_post_process_stop_on_failure=False,
                               scheduler_options=None,
                               run_dir_setup_script=None):
        """Copy scripts for the appropriate scheduler to group directory,
        and write environment configuration. Returns required number of nodes,
        which will be calculated if the passed nodes is None"""
        script_dir = os.path.join(config.CHEETAH_PATH_SCHEDULER,
                                  self.scheduler_name, 'group')
        if not os.path.isdir(script_dir):
            raise ValueError("scheduler '%s' is not yet supported (path '%s')"
                             % (self.scheduler_name, script_dir))
        if scheduler_options is None:
            scheduler_options = {}
        copytree_to_dir(script_dir, self.output_directory)

        fobs_path = os.path.join(self.output_directory, 'fobs.json')
        min_nodes = 1

        f = open(fobs_path, 'w')
        fob_list = []
        for i, run in enumerate(runs):
            # TODO: abstract this to higher levels
            os.makedirs(run.run_path, exist_ok=True)

            # Create working dir for each component
            for rc in run.run_components:
                os.makedirs(rc.working_dir, exist_ok=True)

            if run.sosflow_profiling:
                run.insert_sosflow(sosd_path, sos_analysis_path,
                                   run.run_path,
                                   machine.processes_per_node)

            # Copy the global input files common to all components
            for input_rpath in run.inputs:
                copy_to_dir(input_rpath, run.run_path)

            # Copy input files requested by each component
            # save working dirs for later use
            working_dirs = {}  # map component name to path
            for rc in run.run_components:
                working_dirs[rc.name] = rc.working_dir

                # if rc has an adios xml file, copy it to working dir
                if rc.adios_xml_file:
                    copy_to_dir(rc.adios_xml_file, rc.working_dir)

                # now copy other inputs marked under component_inputs
                if rc.component_inputs is not None:
                    for input_file in rc.component_inputs:
                        dest = os.path.join(rc.working_dir,
                                            os.path.basename(
                                                input_file))
                        # input type is symlink
                        if type(input_file) == SymLink:
                            os.symlink(input_file, dest)

                        # input type is a regular file
                        elif os.path.isfile(input_file):
                            copy_to_dir(input_file, rc.working_dir)

                        # Input file is a directory
                        elif os.path.isdir(input_file):
                            copytree_to_dir(input_file, dest)

                        else:
                            raise exc.CheetahException(
                                "Could not copy component input {}"
                                .format(input_file))

            # ADIOS XML param support
            adios_xml_params = \
                run.instance.get_parameter_values_by_type(ParamAdiosXML) or \
                run.instance.get_parameter_values_by_type(ParamADIOS2XML)
            for pv in adios_xml_params:
                working_dir = working_dirs[pv.target]

                # dirty way of getting the adios xml filename of the rc
                # that is represented by pv.target
                rc_adios_xml = self._get_rc_adios_xml_filename(
                    run, pv.target)
                xml_filepath = os.path.join(working_dir,
                                            os.path.basename(rc_adios_xml))

                # Check if this is adios1 or adios2
                adios_version = get_adios_version(rc_adios_xml)

                if adios_version == 1:
                    if pv.param_type == "adios_transform":
                        adios_params.adios_xml_transform(
                            xml_filepath, pv.group_name, pv.var_name, pv.value)
                    elif pv.param_type == "adios_transport":
                        # value could be
                        # "MPI_AGGREGATE:num_aggregators=64;num_osts"
                        # extract the method name and the method options
                        method_name = pv.value
                        method_opts = ""
                        if ":" in pv.value:
                            value_tokens = pv.value.split(":", 1)
                            method_name = value_tokens[0]
                            method_opts = value_tokens[1]

                        adios_params.adios_xml_transport(
                            xml_filepath, pv.group_name, method_name,
                            method_opts)
                    else:
                        raise exc.CheetahException("Unrecognized adios param")

                else:   # adios version == 2
                    operation_value = list(pv.value.keys())[0]
                    if pv.operation_name in ('engine', 'transport'):
                        parameters = list(pv.value.values())[0]
                        if pv.operation_name == 'engine':
                            adios2.set_engine(xml_filepath, pv.io_name,
                                              operation_value, parameters)
                        else:
                            adios2.set_transport(xml_filepath, pv.io_name,
                                                 operation_value, parameters)
                    else:   # operation_name == 'var_operation'
                        var_name = list(pv.value.keys())[0]
                        var_name_dict = pv.value[var_name]
                        var_operation_value = list(var_name_dict.keys())[0]
                        var_op_dict = var_name_dict[var_operation_value]
                        parameters = var_op_dict
                        adios2.set_var_operation(xml_filepath, pv.io_name,
                                                 var_name,
                                                 var_operation_value,
                                                 parameters)

            # Calculate the no. of nodes required by this run.
            # This must be done after dataspaces support is added.
            if run.total_nodes > min_nodes:
                min_nodes = run.total_nodes

            # Generic config file support. Note: slurps entire
            # config file into memory, requires adding file to
            # campaign 'inputs' option.
            config_params = \
                run.instance.get_parameter_values_by_type(ParamConfig)
            for pv in config_params:
                working_dir = working_dirs[pv.target]
                src_filepath = relative_or_absolute_path(app_dir,
                                                         pv.config_filename)
                # Allow for relative pathnames in the spec
                src_filename = pv.config_filename
                if pv.config_filename[0] == '/':
                    src_filename = os.path.basename(src_filepath)
                config_filepath = os.path.join(working_dir,
                                               src_filename)
                if not os.path.isfile(config_filepath):
                    copy_to_path(src_filepath, config_filepath)
                lines = []
                # read and modify lines
                # hack: handle json files. currently works only on singly
                # nested json files
                if config_filepath.endswith(".json"):
                    json_config_set_option(config_filepath, pv.match_string,
                                           pv.value)
                else:  # handle other file types
                    with open(config_filepath) as config_f:
                        for line in config_f:
                            line = line.replace(pv.match_string, pv.value)
                            lines.append(line)
                    # rewrite file with modified lines
                    with open(config_filepath, 'w') as config_f:
                        config_f.write("".join(lines))

            # Key value config file support. Note: slurps entire
            # config file into memory, requires adding file to
            # campaign 'inputs' option.
            kv_params = \
                run.instance.get_parameter_values_by_type(ParamKeyValue)
            for pv in kv_params:
                working_dir = working_dirs[pv.target]
                src_filepath = relative_or_absolute_path(app_dir,
                                                         pv.config_filename)
                # Allow for relative pathnames in the spec
                src_filename = pv.config_filename
                if pv.config_filename[0] == '/':
                    src_filename = os.path.basename(src_filepath)
                kv_filepath = os.path.join(working_dir, src_filename)
                if not os.path.isfile(kv_filepath):
                    copy_to_path(src_filepath, kv_filepath)
                lines = []
                # read and modify lines
                key_found = False
                with open(kv_filepath) as kv_f:
                    for line in kv_f:
                        parts = line.split('=', 1)
                        if len(parts) == 2:
                            k = parts[0].strip()
                            if k == pv.key_name:
                                # assume all k=v type formats will
                                # support no spaces around equals
                                line = k + '=' + str(pv.value)
                                # preserve a user comment if it exists
                                if '!' in parts[1]:
                                    line = line + " !" + \
                                           parts[1].strip().split('!')[1]
                                line = line + '\n'
                                key_found = True
                        lines.append(line)
                    assert key_found, \
                        "Issue parsing a ParamKeyValue: Could not find key {}"\
                        " in config file {}".format(pv.key_name, src_filepath)
                # rewrite file with modified lines
                with open(kv_filepath, 'w') as kv_f:
                    kv_f.write("".join(lines))

            # Env var parameter values
            kv_params = run.instance.get_parameter_values_by_type(ParamEnvVar)
            for pv in kv_params:
                rc = run._get_rc_by_name(pv.target)
                rc.env[pv.option] = str(pv.value)

            # save code commands as text
            params_path_txt = os.path.join(run.run_path,
                                           self.run_command_name)
            with open(params_path_txt, 'w') as params_f:
                for rc in run.run_components:
                    params_f.write(' '.join(map(shlex.quote,
                                                [rc.exe] + rc.args)))
                    params_f.write('\n')

            # save params as JSON for use in post-processing, more
            # useful for post-processing scripts than the command
            # text
            params_path_json = os.path.join(run.run_path,
                                            self.run_json_name)
            run_data = run.get_app_param_dict()
            with open(params_path_json, 'w') as params_f:
                json.dump(run_data, params_f, indent=2)

            fob_runs = []
            for j, rc in enumerate(run.run_components):
                if timeout is not None:
                    rc.timeout = parse_timedelta_seconds(timeout)

                fob_runs.append(rc.as_fob_data())

            fob = dict(id=run.run_id, launch_mode=launch_mode, runs=fob_runs,
                       working_dir=run.run_path,
                       kill_on_partial_failure=kill_on_partial_failure,
                       post_process_script=run_post_process_script,
                       post_process_stop_on_failure=(
                           run_post_process_stop_on_failure),
                       post_process_args=[params_path_json],
                       node_layout=run.node_layout.serialize_to_dict(),
                       total_nodes=run.total_nodes,
                       machine_name=machine.name,
                       tau_profiling=tau_profiling, tau_tracing=tau_tracing)
            fob_list.append(fob)

            # write to file run dir
            run_fob_path = os.path.join(run.run_path,
                                        "codar.cheetah.fob.json")
            with open(run_fob_path, "w") as runf:
                runf.write(json.dumps(fob, sort_keys=True, indent=4))
                runf.write("\n")

            if run_dir_setup_script is not None:
                self._execute_run_dir_setup_script(run.run_path,
                                                   run_dir_setup_script)

            # Get the size of the run dir. This should be the last step
            # in the creation of the run dir.
            self._get_pre_submit_dir_size(run)

        # Write fob_list to group-level json file
        f.write(json.dumps(fob_list, sort_keys=True, indent=4))
        f.close()

        if nodes is None:
            nodes = min_nodes
        elif nodes < min_nodes:
            raise exc.CheetahException(
                "nodes for group is too low, need at least %d, got %d"
                % (min_nodes, nodes))

        # TODO: what case does this handle? should have a test case for
        # it.
        if machine.node_exclusive:
            group_ppn = machine.processes_per_node
        else:
            group_ppn = math.ceil((max_nprocs) / nodes)

        env_path = os.path.join(self.output_directory, 'group-env.sh')
        group_env = templates.GROUP_ENV_TEMPLATE.format(
            walltime=parse_timedelta_seconds(walltime),
            max_procs=max_nprocs,
            processes_per_node=group_ppn,
            nodes=nodes,
            node_exclusive=node_exclusive,
            account=scheduler_options.get('project', ''),
            queue=scheduler_options.get('queue', ''),
            reservation=scheduler_options.get('reservation', ''),
            # TODO: require name be valid for all schedulers
            campaign_name='codar.cheetah.'+campaign_name,
            group_name=group_name,
            constraint=scheduler_options.get('constraint', ''),
            license=scheduler_options.get('license', ''),
            machine_name=machine.name
        )
        with open(env_path, 'w') as f:
            f.write(group_env)

        return nodes
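A hedged sketch of how the node count and processes-per-node are resolved at the end of this method; resolve_group_nodes and ValueError are illustrative stand-ins, not Cheetah API.

import math

def resolve_group_nodes(requested_nodes, min_nodes, max_nprocs,
                        node_exclusive, processes_per_node):
    # Default to the minimum required; reject explicit values that are too low.
    if requested_nodes is None:
        requested_nodes = min_nodes
    elif requested_nodes < min_nodes:
        raise ValueError("nodes for group is too low, need at least %d, got %d"
                         % (min_nodes, requested_nodes))
    # On node-exclusive machines the per-node process count is fixed;
    # otherwise spread the largest run's processes over the nodes.
    if node_exclusive:
        group_ppn = processes_per_node
    else:
        group_ppn = math.ceil(max_nprocs / requested_nodes)
    return requested_nodes, group_ppn

# resolve_group_nodes(None, 3, 40, False, 32)  # -> (3, 14)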
Example #9
    def create_group_directory(self,
                               campaign_name,
                               group_name,
                               runs,
                               max_nprocs,
                               nodes,
                               component_subdirs,
                               walltime,
                               node_exclusive,
                               timeout,
                               machine,
                               sosd_path=None,
                               sos_analysis_path=None,
                               tau_config=None,
                               kill_on_partial_failure=False,
                               run_post_process_script=None,
                               run_post_process_stop_on_failure=False,
                               scheduler_options=None,
                               run_dir_setup_script=None):
        """Copy scripts for the appropriate scheduler to group directory,
        and write environment configuration. Returns required number of nodes,
        which will be calculated if the passed nodes is None"""
        script_dir = os.path.join(config.CHEETAH_PATH_SCRIPTS,
                                  self.scheduler_name, 'group')
        if not os.path.isdir(script_dir):
            raise ValueError("scheduler '%s' is not yet supported" %
                             self.scheduler_name)
        if scheduler_options is None:
            scheduler_options = {}
        copytree_to_dir(script_dir, self.output_directory)

        fobs_path = os.path.join(self.output_directory, 'fobs.json')
        min_nodes = 1
        with open(fobs_path, 'w') as f:
            for i, run in enumerate(runs):
                # TODO: abstract this to higher levels
                os.makedirs(run.run_path, exist_ok=True)

                # Create working dir for each component
                for rc in run.run_components:
                    os.makedirs(rc.working_dir, exist_ok=True)

                if run.sosflow_profiling:
                    run.insert_sosflow(sosd_path, sos_analysis_path,
                                       run.run_path,
                                       machine.processes_per_node)

                if tau_config is not None:
                    copy_to_dir(tau_config, run.run_path)

                # Copy the global input files common to all components
                for input_rpath in run.inputs:
                    copy_to_dir(input_rpath, run.run_path)

                # Copy input files requested by each component
                # save working dirs for later use
                working_dirs = {}  # map component name to path
                for rc in run.run_components:
                    working_dirs[rc.name] = rc.working_dir

                    # if rc has an adios xml file, copy it to working dir
                    if rc.adios_xml_file:
                        copy_to_dir(rc.adios_xml_file, rc.working_dir)

                    # now copy other inputs marked under component_inputs
                    if rc.component_inputs is not None:
                        for input_file in rc.component_inputs:
                            # input type is symlink
                            if type(input_file) == SymLink:
                                dest = os.path.join(
                                    rc.working_dir,
                                    os.path.basename(input_file))
                                os.symlink(input_file, dest)

                            # input type is a regular file
                            else:
                                copy_to_dir(input_file, rc.working_dir)

                # ADIOS XML param support
                adios_xml_params = \
                    run.instance.get_parameter_values_by_type(ParamAdiosXML)
                for pv in adios_xml_params:
                    working_dir = working_dirs[pv.target]

                    # dirty way of getting the adios xml filename of the rc
                    # that is represented by pv.target
                    rc_adios_xml = self._get_rc_adios_xml_filename(
                        run, pv.target)
                    xml_filepath = os.path.join(working_dir,
                                                os.path.basename(rc_adios_xml))
                    if pv.param_type == "adios_transform":
                        adios_params.adios_xml_transform(
                            xml_filepath, pv.group_name, pv.var_name, pv.value)
                    elif pv.param_type == "adios_transport":
                        # value could be
                        # "MPI_AGGREGATE:num_aggregators=64;num_osts"
                        # extract the method name and the method options
                        method_name = pv.value
                        method_opts = ""
                        if ":" in pv.value:
                            value_tokens = pv.value.split(":", 1)
                            method_name = value_tokens[0]
                            method_opts = value_tokens[1]

                        adios_params.adios_xml_transport(
                            xml_filepath, pv.group_name, method_name,
                            method_opts)
                    else:
                        raise exc.CheetahException("Unrecognized adios param")

                # Insert dataspaces server instances if RCs will couple
                # using dataspaces.
                # This must be called after the ADIOS params are parsed and
                # the final ADIOS XML is generated
                run.add_dataspaces_support(machine)

                # Calculate the no. of nodes required by this run.
                # This must be done after dataspaces support is added.
                if run.get_total_nodes() > min_nodes:
                    min_nodes = run.get_total_nodes()

                # Generic config file support. Note: slurps entire
                # config file into memory, requires adding file to
                # campaign 'inputs' option.
                config_params = \
                    run.instance.get_parameter_values_by_type(ParamConfig)
                for pv in config_params:
                    working_dir = working_dirs[pv.target]
                    config_filepath = os.path.join(working_dir,
                                                   pv.config_filename)
                    lines = []
                    # read and modify lines
                    with open(config_filepath) as config_f:
                        for line in config_f:
                            line = line.replace(pv.match_string, pv.value)
                            lines.append(line)
                    # rewrite file with modified lines
                    with open(config_filepath, 'w') as config_f:
                        config_f.write("".join(lines))

                # Key value config file support. Note: slurps entire
                # config file into memory, requires adding file to
                # campaign 'inputs' option.
                kv_params = \
                    run.instance.get_parameter_values_by_type(ParamKeyValue)
                for pv in kv_params:
                    working_dir = working_dirs[pv.target]
                    kv_filepath = os.path.join(working_dir, pv.config_filename)
                    lines = []
                    # read and modify lines
                    with open(kv_filepath) as kv_f:
                        for line in kv_f:
                            parts = line.split('=', 1)
                            if len(parts) == 2:
                                k = parts[0].strip()
                                if k == pv.key_name:
                                    # assume all k=v type formats will
                                    # support no spaces around equals
                                    line = k + '=' + str(pv.value) + '\n'
                            lines.append(line)
                    # rewrite file with modified lines
                    with open(kv_filepath, 'w') as kv_f:
                        kv_f.write("".join(lines))

                # save code commands as text
                params_path_txt = os.path.join(run.run_path,
                                               self.run_command_name)
                with open(params_path_txt, 'w') as params_f:
                    for rc in run.run_components:
                        params_f.write(' '.join(
                            map(shlex.quote, [rc.exe] + rc.args)))
                        params_f.write('\n')

                # save params as JSON for use in post-processing, more
                # useful for post-processing scripts than the command
                # text
                params_path_json = os.path.join(run.run_path,
                                                self.run_json_name)
                run_data = run.get_app_param_dict()
                with open(params_path_json, 'w') as params_f:
                    json.dump(run_data, params_f, indent=2)

                fob_runs = []
                for j, rc in enumerate(run.run_components):

                    tau_profile_dir = os.path.join(
                        run.run_path, TAU_PROFILE_PATTERN.format(code=rc.name))
                    os.makedirs(tau_profile_dir)

                    rc.env["PROFILEDIR"] = tau_profile_dir
                    rc.env["TRACEDIR"] = tau_profile_dir

                    if timeout is not None:
                        rc.timeout = parse_timedelta_seconds(timeout)

                    fob_runs.append(rc.as_fob_data())

                fob = dict(id=run.run_id,
                           runs=fob_runs,
                           working_dir=run.run_path,
                           kill_on_partial_failure=kill_on_partial_failure,
                           post_process_script=run_post_process_script,
                           post_process_stop_on_failure=(
                               run_post_process_stop_on_failure),
                           post_process_args=[params_path_json],
                           node_layout=run.node_layout.as_data_list())
                fob_s = json.dumps(fob)

                # write to file run dir
                run_fob_path = os.path.join(run.run_path,
                                            "codar.cheetah.fob.json")
                with open(run_fob_path, "w") as runf:
                    runf.write(fob_s)
                    runf.write("\n")

                if run_dir_setup_script is not None:
                    self._execute_run_dir_setup_script(run.run_path,
                                                       run_dir_setup_script)

                # append to fob list file in group dir
                f.write(fob_s)
                f.write("\n")

                # Get the size of the run dir. This should be the last step
                # in the creation of the run dir.
                self._get_pre_submit_dir_size(run)

        if nodes is None:
            nodes = min_nodes
        elif nodes < min_nodes:
            raise exc.CheetahException(
                "nodes for group is too low, need at least %d, got %d" %
                (min_nodes, nodes))

        # TODO: what case does this handle? should have a test case for
        # it.
        if machine.node_exclusive:
            group_ppn = machine.processes_per_node
        else:
            group_ppn = math.ceil((max_nprocs) / nodes)

        env_path = os.path.join(self.output_directory, 'group-env.sh')
        group_env = templates.GROUP_ENV_TEMPLATE.format(
            walltime=parse_timedelta_seconds(walltime),
            max_procs=max_nprocs,
            processes_per_node=group_ppn,
            nodes=nodes,
            node_exclusive=node_exclusive,
            account=scheduler_options.get('project', ''),
            queue=scheduler_options.get('queue', ''),
            # TODO: require name be valid for all schedulers
            campaign_name='codar.cheetah.' + campaign_name,
            group_name=group_name,
            constraint=scheduler_options.get('constraint', ''),
            license=scheduler_options.get('license', ''))
        with open(env_path, 'w') as f:
            f.write(group_env)

        return nodes
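A minimal sketch of the key=value substitution used in both create_group_directory variants, operating on a list of lines rather than a file; set_key_value is an illustrative name, and the comment-preserving and key-found assertion logic from Example #8 is omitted.

def set_key_value(lines, key, value):
    # Rewrite 'key=value' lines, leaving everything else untouched.
    out = []
    for line in lines:
        parts = line.split('=', 1)
        if len(parts) == 2 and parts[0].strip() == key:
            line = '%s=%s\n' % (key, value)
        out.append(line)
    return out

# set_key_value(['nx=10\n', '# comment\n', 'ny=20\n'], 'nx', 64)
# -> ['nx=64\n', '# comment\n', 'ny=20\n']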