def _runjob(self):
    """Run a job."""
    # check for valid user/group
    try:
        userid, groupid = pwd.getpwnam(self.user)[2:4]
    except KeyError:
        raise ProcessGroupCreationError("error getting uid/gid")

    try:
        os.setgid(groupid)
        os.setuid(userid)
    except OSError:
        logger.error("failed to change userid/groupid for process group %s" % (self.id))
        os._exit(1)

    if self.umask is not None:
        try:
            os.umask(self.umask)
        except:
            logger.error("Failed to set umask to %s" % self.umask)

    try:
        partition = self.location[0]
    except IndexError:
        raise ProcessGroupCreationError("no location")

    kerneloptions = self.kerneloptions

    # export a subset of MPIRUN_* variables to mpirun's environment;
    # we explicitly state the ones we want since some are "dangerous"
    exportenv = ['MPIRUN_CONNECTION', 'MPIRUN_KERNEL_OPTIONS',
                 'MPIRUN_MAPFILE', 'MPIRUN_START_GDBSERVER',
                 'MPIRUN_LABEL', 'MPIRUN_NW', 'MPIRUN_VERBOSE',
                 'MPIRUN_ENABLE_TTY_REPORTING', 'MPIRUN_STRACE']
    app_envs = []
    for key, value in self.env.iteritems():
        if key in exportenv:
            os.environ[key] = value
        else:
            app_envs.append((key, value))
    envs = " ".join(["%s=%s" % x for x in app_envs])

    # clear atexit handlers inherited from the parent so the child won't run them
    atexit._atexit = []

    try:
        stdin = open(self.stdin, 'r')
    except (IOError, OSError, TypeError), e:
        logger.error("process group %s: error opening stdin file %s: %s (stdin will be /dev/null)"
                     % (self.id, self.stdin, e))
        stdin = open("/dev/null", 'r')

def add_process_groups(self, specs):
    """Create process groups.

    Arguments:
    specs -- list of dictionaries, each specifying a process group to start

    """
    self.logger.info("add_process_groups(%r)" % (specs))

    script_specs = []
    other_specs = []
    for spec in specs:
        if spec.get('mode', False) == "script":
            script_specs.append(spec)
        else:
            other_specs.append(spec)

    # start up script jobs
    script_pgroups = []
    if script_specs:
        for spec in script_specs:
            try:
                self._set_kernel(spec.get('location')[0], spec.get('kernel', "default"))
            except Exception, e:
                new_pgroup = self.process_groups.q_add([spec])
                pgroup = new_pgroup[0]
                pgroup.nodect = self._partitions[pgroup.location[0]].size
                pgroup.exit_status = 1
                self.logger.info("process group %s: job %s/%s failed to set the kernel; %s",
                                 pgroup.id, pgroup.jobid, pgroup.user, e)
            else:
                try:
                    script_pgroup = ComponentProxy("script-manager").add_jobs([spec])
                except (ComponentLookupError, xmlrpclib.Fault):
                    self._clear_kernel(spec.get('location')[0])
                    # FIXME: jobs that were already started are not reported
                    raise ProcessGroupCreationError(
                        "system::add_process_groups failed to communicate with script-manager")
                new_pgroup = self.process_groups.q_add([spec])
                pgroup = new_pgroup[0]
                pgroup.script_id = script_pgroup[0]['id']
                pgroup.nodect = self._partitions[pgroup.location[0]].size
                self.logger.info("job %s/%s: process group %s created to track script",
                                 pgroup.jobid, pgroup.user, pgroup.id)
                self.reserve_resources_until(spec['location'],
                                             time.time() + 60 * float(spec['walltime']),
                                             pgroup.jobid)
                if pgroup.kernel != "default":
                    self.logger.info("process group %s: job %s/%s using kernel %s",
                                     pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel)
                script_pgroups.append(pgroup)

def prefork(self):
    ret = {}

    # check for valid user/group
    try:
        userid, groupid = pwd.getpwnam(self.user)[2:4]
    except KeyError:
        raise ProcessGroupCreationError("error getting uid/gid")
    ret["userid"] = userid
    ret["primary_group"] = groupid

    self.nodefile = "/var/tmp/cobalt.%s" % self.jobid

    # get supplementary groups
    supplementary_group_ids = []
    for g in grp.getgrall():
        if self.user in g.gr_mem:
            supplementary_group_ids.append(g.gr_gid)
    ret["other_groups"] = supplementary_group_ids

    ret["umask"] = self.umask

    try:
        rank0 = self.location[0].split(":")[0]
    except IndexError:
        raise ProcessGroupCreationError("no location")

    kerneloptions = self.kerneloptions

    ret["postfork_env"] = self.env
    ret["stdin"] = self.stdin
    ret["stdout"] = self.stdout
    ret["stderr"] = self.stderr

    cmd_string = "/usr/bin/cobalt-launcher.py --nf %s --jobid %s --cwd %s --exe %s" % \
        (self.nodefile, self.jobid, self.cwd, self.executable)
    cmd = ("/usr/bin/ssh", "/usr/bin/ssh", rank0, cmd_string)

    ret["id"] = self.id
    ret["jobid"] = self.jobid
    ret["cobalt_log_file"] = self.cobalt_log_file
    ret["cmd"] = cmd

    return ret

def prefork(self):
    ret = ProcessGroup.prefork(self)

    sim_mode = get_orcm_system_config("simulation_mode", 'false').lower() in config_true_values
    if not sim_mode:
        nodefile_dir = get_orcm_system_config("nodefile_dir", "/var/tmp")
        self.nodefile = os.path.join(nodefile_dir, "cobalt.%s" % self.jobid)
    else:
        self.nodefile = "fake"

    try:
        # this is the head node; return it to the user
        rank0 = self.location[0].split(":")[0]
    except IndexError:
        raise ProcessGroupCreationError("no location")

    split_args = self.args
    cmd_args = ['--nf', str(self.nodefile),
                '--jobid', str(self.jobid),
                '--cwd', str(self.cwd)]
    qsub_env_list = ["%s=%s" % (key, val) for key, val in self.env.iteritems()]
    for env in qsub_env_list:
        cmd_args.extend(['--env', env])
    cmd_args.append(self.executable)

    cmd_exe = None
    if sim_mode:
        logger.debug("Setting up job in simulation mode.")
        cmd_exe = get_cluster_system_config("simulation_executable", None)
        if cmd_exe is None:
            logger.critical("Job: %s/%s: Executable for simulator not specified!  This job will not run!",
                            self.jobid, self.user)
            raise RuntimeError("Unspecified simulation_executable in cobalt config")
    else:
        cmd_exe = get_orcm_system_config('launcher', '/usr/bin/cobalt-launcher.py')

    # run the user script remotely on the compute node rather than on the login node
    if (get_orcm_system_config("run_remote", 'true').lower() in config_true_values
            and not sim_mode):
        cmd = ("/usr/bin/ssh", rank0, cmd_exe) + tuple(cmd_args) + tuple(split_args)
    else:
        cmd = (cmd_exe,) + tuple(cmd_args) + tuple(split_args)

    ret["cmd"] = cmd
    ret["args"] = cmd[1:]
    ret["executable"] = cmd[0]
    self.executable = ret["executable"]
    self.cmd = ret["cmd"]
    self.args = list(ret["args"])
    return ret

def _get_argv(self, config_files=None):
    """Get the argv list used to launch this process group."""
    if config_files is None:
        config_files = Cobalt.CONFIG_FILES
    config = ConfigParser()
    config.read(config_files)

    argv = [
        config.get("bgpm", "mpirun"),
        os.path.basename(config.get("bgpm", "mpirun")),
    ]

    if self.true_mpi_args is not None:
        # arguments have been passed along in a special attribute; they have
        # already been modified to include the partition that cobalt has
        # selected for the process group
        argv.extend(self.true_mpi_args)
        return argv

    argv.extend([
        "-np", str(self.size),
        "-mode", self.mode,
        "-cwd", self.cwd,
        "-exe", self.executable,
    ])

    try:
        partition = self.location[0]
    except (KeyError, IndexError):
        raise ProcessGroupCreationError("location")
    argv.extend(["-partition", partition])

    if self.kerneloptions:
        argv.extend(['-kernel_options', self.kerneloptions])

    if self.args:
        argv.extend(["-args", " ".join(self.args)])

    if self.env:
        env_kvstring = " ".join(["%s=%s" % (key, value)
                                 for key, value in self.env.iteritems()])
        argv.extend(["-env", env_kvstring])

    return argv

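# A minimal usage sketch, not part of the component: one plausible way the argv
# list returned by _get_argv() could be consumed with a plain fork/exec, assuming
# argv[0] is the mpirun path and argv[1:] forms the child's own argv (with the
# basename as the child's argv[0]).  The helper name _spawn_from_argv is hypothetical.
import os

def _spawn_from_argv(argv):
    """Fork and exec mpirun from an argv list shaped like _get_argv()'s result."""
    pid = os.fork()
    if pid == 0:
        # child: exec the mpirun binary with the remaining elements as its argv
        os.execv(argv[0], argv[1:])
        os._exit(1)  # only reached if exec fails
    return pid
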
def prefork(self):
    ret = {}

    sim_mode = get_cluster_system_config("simulation_mode", 'false').lower() in config_true_values
    if not sim_mode:
        nodefile_dir = get_cluster_system_config("nodefile_dir", "/var/tmp")
        self.nodefile = os.path.join(nodefile_dir, "cobalt.%s" % self.jobid)
    else:
        self.nodefile = "fake"

    try:
        rank0 = self.location[0].split(":")[0]
    except IndexError:
        raise ProcessGroupCreationError("no location")

    split_args = self.args.split()
    cmd_args = ('--nf', str(self.nodefile),
                '--jobid', str(self.jobid),
                '--cwd', str(self.cwd),
                '--exe', str(self.executable))

    cmd_exe = None
    if sim_mode:
        cmd_exe = get_cluster_system_config("simulation_executable", None)
        if cmd_exe is None:
            logger.critical("Job: %s/%s: Executable for simulator not specified!  This job will not run!",
                            self.jobid, self.user)
            raise RuntimeError("Unspecified simulation_executable in cobalt config")
    else:
        # FIXME: need to put the launcher location into the config
        cmd_exe = '/usr/bin/cobalt-launcher.py'

    # run the user script remotely on the compute node rather than on the login node
    if (get_cluster_system_config("run_remote", 'true').lower() in config_true_values
            and not sim_mode):
        cmd = ("/usr/bin/ssh", rank0, cmd_exe) + cmd_args + tuple(split_args)
    else:
        cmd = (cmd_exe,) + cmd_args + tuple(split_args)

    ret["cmd"] = cmd
    ret["args"] = split_args
    return ret

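# Purely illustrative: a hedged example of the command tuple the prefork()
# above might assemble when run_remote is enabled and simulation_mode is off.
# The host name, job id, paths and trailing user arguments are made up.
#
#   ("/usr/bin/ssh", "cc123",
#    "/usr/bin/cobalt-launcher.py",
#    "--nf", "/var/tmp/cobalt.12345",
#    "--jobid", "12345",
#    "--cwd", "/home/someuser/run",
#    "--exe", "/home/someuser/a.out",
#    "-v")          # anything after the --exe value comes from self.args.split()
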
def _get_argv(self, config_files=None):
    """Get the argv list used to launch this process group."""
    if config_files is None:
        config_files = Cobalt.CONFIG_FILES
    config = ConfigParser()
    config.read(config_files)

    argv = [
        config.get("bgpm", "mpirun"),
        os.path.basename(config.get("bgpm", "mpirun")),
    ]
    argv.extend([
        "-np", str(self.size),
        "-mode", self.mode,
        "-cwd", self.cwd,
        "-exe", self.executable,
    ])

    try:
        partition = self.location[0]
    except (KeyError, IndexError):
        raise ProcessGroupCreationError("location")
    argv.extend(["-partition", partition])

    if self.kerneloptions:
        argv.extend(['-kernel_options', self.kerneloptions])

    if self.args:
        argv.extend(["-args", " ".join(self.args)])

    if self.env:
        env_kvstring = " ".join(["%s=%s" % (key, value)
                                 for key, value in self.env.iteritems()])
        argv.extend(["-env", env_kvstring])

    return argv

def add_process_groups(self, specs):
    """Create simulated process groups.

    Arguments:
    specs -- list of dictionaries, each specifying a process group to start

    """
    self.logger.info("add_process_groups(%r)" % (specs))

    script_specs = []
    other_specs = []
    for spec in specs:
        if spec.get('mode') == "script":
            script_specs.append(spec)
        else:
            other_specs.append(spec)

    # start up script jobs
    new_pgroups = []
    if script_specs:
        try:
            for spec in script_specs:
                script_pgroup = ComponentProxy("script-manager").add_jobs([spec])
                new_pgroup = self.process_groups.q_add([spec])
                new_pgroup[0].script_id = script_pgroup[0]['id']
                self.reserve_resources_until(spec['location'],
                                             time.time() + 60 * float(spec['walltime']),
                                             new_pgroup[0].jobid)
                new_pgroups.append(new_pgroup[0])
        except (ComponentLookupError, xmlrpclib.Fault):
            raise ProcessGroupCreationError(
                "system::add_process_groups failed to communicate with script-manager")

    process_groups = self.process_groups.q_add(other_specs)
    for process_group in process_groups:
        self.start(process_group)

    return new_pgroups + process_groups

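# Illustrative only: a hedged sketch of what a script-mode spec handed to
# add_process_groups() might look like.  The key set is inferred from the
# fields these methods actually read (mode, location, kernel, walltime, jobid,
# user); the values themselves are hypothetical.
example_script_spec = {
    'mode': 'script',
    'location': ['ANL-R00-M0-512'],   # hypothetical partition / node list
    'kernel': 'default',
    'walltime': '30',                 # minutes; converted to seconds above
    'jobid': 12345,
    'user': 'someuser',
}
# e.g. system.add_process_groups([example_script_spec])
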
def prefork(self):
    ret = {}

    # check for valid user/group
    try:
        userid, groupid = pwd.getpwnam(self.user)[2:4]
    except KeyError:
        raise ProcessGroupCreationError("error getting uid/gid")
    ret["userid"] = userid
    ret["primary_group"] = groupid

    # get supplementary groups
    supplementary_group_ids = []
    for g in grp.getgrall():
        if self.user in g.gr_mem:
            supplementary_group_ids.append(g.gr_gid)
    ret["other_groups"] = supplementary_group_ids

    ret["umask"] = self.umask

    try:
        partition = self.location[0]
    except IndexError:
        raise ProcessGroupCreationError("no location")

    kerneloptions = self.kerneloptions

    # export a subset of MPIRUN_* variables to mpirun's environment;
    # we explicitly state the ones we want since some are "dangerous"
    exportenv = ['MPIRUN_CONNECTION', 'MPIRUN_KERNEL_OPTIONS',
                 'MPIRUN_MAPFILE', 'MPIRUN_START_GDBSERVER',
                 'MPIRUN_LABEL', 'MPIRUN_NW', 'MPIRUN_VERBOSE',
                 'MPIRUN_ENABLE_TTY_REPORTING', 'MPIRUN_STRACE']
    postfork_env = {}
    app_envs = []
    for key, value in self.env.iteritems():
        if key in exportenv:
            postfork_env[key] = value
        else:
            app_envs.append((key, value))
    envs = " ".join(["%s=%s" % x for x in app_envs])
    ret["postfork_env"] = postfork_env
    ret["stdin"] = self.stdin
    ret["stdout"] = self.stdout
    ret["stderr"] = self.stderr

    cmd = (self.config['mpirun'], os.path.basename(self.config['mpirun']),
           '-host', self.config['mmcs_server_ip'], '-np', str(self.size),
           '-partition', partition, '-mode', self.mode, '-cwd', self.cwd,
           '-exe', self.executable)
    if self.args:
        cmd = cmd + ('-args', self.args)
    if envs:
        cmd = cmd + ('-env', envs)
    if kerneloptions:
        cmd = cmd + ('-kernel_options', kerneloptions)

    # If this mpirun command originated from a user script, its arguments have
    # been passed along in a special attribute.  These arguments have already
    # been modified to include the partition that cobalt has selected for the
    # job, and can just replace the arguments built above.
    if self.true_mpi_args:
        cmd = (self.config['mpirun'],
               os.path.basename(self.config['mpirun'])) + tuple(self.true_mpi_args)

    ret["id"] = self.id
    ret["jobid"] = self.jobid
    ret["cobalt_log_file"] = self.cobalt_log_file
    ret["cmd"] = cmd
    return ret

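# A hedged sketch, not the component's actual postfork code: one plausible way
# the dictionary returned by prefork() could be applied in a forked child
# before exec'ing mpirun.  The helper name _apply_prefork is hypothetical, and
# stdio redirection via ret["stdin"]/["stdout"]/["stderr"] is omitted for brevity.
import os

def _apply_prefork(ret):
    os.setgroups(ret["other_groups"])        # supplementary groups while still privileged
    os.setgid(ret["primary_group"])          # then the primary gid
    os.setuid(ret["userid"])                 # finally drop to the target uid
    if ret["umask"] is not None:
        os.umask(ret["umask"])
    os.environ.update(ret["postfork_env"])   # whitelisted MPIRUN_* variables
    # cmd[0] is the mpirun path; cmd[1:] becomes the child's argv
    os.execv(ret["cmd"][0], list(ret["cmd"][1:]))
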