def __init__(self, spec):
    """Initialize a new partition from the ``spec`` mapping."""
    Data.__init__(self, spec)

    # All lookups go through a private copy, so popping keys never
    # touches the mapping the caller handed in.
    opts = spec.copy()
    take = opts.pop

    self.scheduled = take("scheduled", False)
    self.name = take("name", None)
    self.functional = take("functional", False)
    self.queue = take("queue", "default")
    self.size = take("size", None)

    # Relationship sets; these hold Partition objects.
    self._parents = set()
    self._children = set()
    self._all_children = set()

    self.state = take("state", "idle")
    self.tag = opts.get("tag", "partition")
    self.bridge_partition = None
    self.node_cards = opts.get("node_cards", [])
    self.switches = opts.get("switches", [])

    # Nothing is reserved or in use on a freshly built partition.
    self.reserved_until = False
    self.reserved_by = None
    self.used_by = None
    self.cleanup_pending = False

    # Unlike the sets above, this one holds partition names.
    self._wiring_conflicts = set()
    self.backfill_time = None
    self.draining = False

    self._update_node_cards()
def test_Sync(self):
    # After Sync() every field of the ForeignData instance must equal
    # the same-named attribute on the Data object it was synced from.
    source = Data({'tag': "somedata"})
    mirror = ForeignData({})
    mirror.Sync(source.to_rx())
    for name in mirror.fields:
        expected = getattr(source, name)
        actual = getattr(mirror, name)
        assert expected == actual
def __init__(self, spec):
    """Create a job record from ``spec``.

    ``spec["jobid"]`` must be convertible with ``int()``; the other keys
    read here fall back to defaults when absent.
    """
    Data.__init__(self, spec)
    self.tag = 'job'
    lookup = spec.get

    # Fields initialized at the beginning of the simulation.
    self.jobid = int(lookup("jobid"))
    self.queue = lookup("queue", "default")
    self.submittime = lookup("submittime")  # in seconds
    self.walltime = lookup("walltime")  # in minutes
    self.user = lookup("user", "unknown")
    self.project = lookup("project", "unknown")
    self.nodes = lookup("nodes", 0)
    self.runtime = lookup("runtime", 0)
    self.remain_time = float(self.runtime)
    self.start_time = lookup('start_time', '0')
    self.end_time = lookup('end_time', '0')
    self.state = lookup("state", "invisible")

    # Bookkeeping that always starts from fixed values.
    self.system_state = ''
    self.starttime = 0
    self.arrival_time = 0
    self.failure_time = 0
    self.has_resources = False
    self.is_runnable = False
    self.is_visible = False
    self.args = []
    self.progress = 0
    self.recovery_opt = lookup("recovery_opt", RECOVERYOPT)
    self.checkpoint = 1
    self.location = []
def update(self, spec):
    """Apply the attribute changes in ``spec`` to this reservation.

    A ``users`` change is pushed to the queue manager first; if that
    component cannot be reached the error is logged and re-raised before
    anything on the reservation is modified.  The control keys
    ``__cmd_user`` and ``defer`` are stripped before the remaining keys
    are handed to ``Data.update``.

    The caller's ``spec`` is left unmodified: all edits are made on a
    shallow copy, so one dict can safely be reused for several updates.

    Raises:
        ComponentLookupError: the queue manager could not be contacted.
    """
    if "users" in spec:
        qm = ComponentProxy("queue-manager")
        try:
            qm.set_queues([{'name': self.queue}], {'users': spec['users']},
                          "bgsched")
        except ComponentLookupError:
            logger.error("unable to contact queue manager when updating reservation users")
            raise
    # try the above first -- if we can't contact the queue-manager,
    # don't update the users

    # Everything below edits the spec, so switch to a private copy.
    spec = spec.copy()

    if 'cycle' in spec and not self.cycle:
        # we have just turned this into a cyclic reservation and need a
        # cycle_id.
        spec['cycle_id'] = self.cycle_id_gen.get()

    # user name of whoever issued the command (None when not supplied)
    user_name = spec.pop('__cmd_user', None)

    # if we're deferring, pull out the 'defer' entry and send a cobalt db
    # message; there really isn't a corresponding field to update
    deferred = 'defer' in spec
    if deferred:
        logger.info("Res %s/%s: Deferring cyclic reservation: %s",
                    self.res_id, self.cycle_id, self.name)
        dbwriter.log_to_db(user_name, "deferred", "reservation", self)
        del spec['defer']

    Data.update(self, spec)

    # NOTE(review): the original comment says the "modifying" record is
    # only wanted when we are not deferring (the cycle creates the new
    # data object otherwise), yet this condition also fires for a
    # deferred reservation that is not running -- confirm the intent.
    if not deferred or not self.running:
        dbwriter.log_to_db(user_name, "modifying", "reservation", self)
def __init__(self, spec):
    """Populate a process group's attributes from ``spec``."""
    Data.__init__(self, spec)
    self.tag = "process group"
    lookup = spec.get

    # What to run and where.
    self.args = lookup("args", [])
    self.cobalt_log_file = lookup("cobalt_log_file")
    self.cwd = lookup("cwd")
    self.env = lookup("env", {})
    self.executable = lookup("executable")

    # Not taken from spec; both start out unset.
    self.exit_status = None
    self.head_pid = None

    self.id = lookup("id")
    self.jobid = lookup("jobid")
    self.kernel = lookup("kernel")
    self.kerneloptions = lookup("kerneloptions")
    self.location = lookup("location", [])
    self.mode = lookup("mode")
    self.nodefile = None
    self.size = lookup("size")

    # Standard stream targets and process environment.
    self.stderr = lookup("stderr")
    self.stdin = lookup("stdin")
    self.stdout = lookup("stdout")
    self.umask = lookup("umask")
    self.user = lookup("user", "")

    # Timing and identification of the run.
    self.starttime = lookup("starttime")
    self.walltime = lookup("walltime")
    self.killtime = lookup("killtime")
    self.resid = lookup("resid", None)
    self.runid = lookup("runid", None)
    self.forker = lookup("forker", None)
def __init__(self, spec):
    """Initialize a new partition from the ``spec`` mapping."""
    Data.__init__(self, spec)

    # All lookups go through a private copy, so popping keys never
    # touches the mapping the caller handed in.
    opts = spec.copy()
    take = opts.pop

    self.scheduled = take("scheduled", False)
    self.name = take("name", None)
    self.functional = take("functional", False)
    self.queue = take("queue", "default")
    self.size = take("size", None)

    # Relationship sets; these hold Partition objects.
    self._parents = set()
    self._children = set()
    self._all_children = set()

    self.state = take("state", "idle")
    self.tag = opts.get("tag", "partition")
    self.bridge_partition = None
    self.node_cards = opts.get("node_cards", [])
    self.switches = opts.get("switches", [])

    # Nothing is reserved or in use on a freshly built partition.
    self.reserved_until = False
    self.reserved_by = None
    self.used_by = None
    self.cleanup_pending = False

    # Unlike the sets above, this one holds partition names.
    self._wiring_conflicts = set()
    self.backfill_time = None
    self.draining = False

    self._update_node_cards()
def __init__(self, spec, logger):
    """Populate a process group from ``spec`` and remember ``logger``.

    ``args`` is stored as a single space-joined string.
    """
    Data.__init__(self, spec)
    self.tag = "process group"
    lookup = spec.get

    argument_list = lookup("args", [])
    self.args = " ".join(argument_list)
    self.cobalt_log_file = lookup("cobalt_log_file")
    self.cwd = lookup("cwd")
    self.env = lookup("env", {})
    self.executable = lookup("executable")

    # Not taken from spec; both start out unset.
    self.exit_status = None
    self.head_pid = None

    self.id = lookup("id")
    self.jobid = lookup("jobid")
    self.kernel = lookup("kernel")
    self.kerneloptions = lookup("kerneloptions")
    self.location = lookup("location", [])
    self.mode = lookup("mode")
    self.nodefile = None
    self.size = lookup("size")

    self.stderr = lookup("stderr")
    self.stdin = lookup("stdin")
    self.stdout = lookup("stdout")
    self.true_mpi_args = lookup("true_mpi_args")
    self.umask = lookup("umask")
    self.user = lookup("user", "")

    self.logger = logger
def test_Sync(self):
    # After Sync() every field of the ForeignData instance must equal
    # the same-named attribute on the Data object it was synced from.
    source = Data({'tag': "somedata"})
    mirror = ForeignData({})
    mirror.Sync(source.to_rx())
    for name in mirror.fields:
        expected = getattr(source, name)
        actual = getattr(mirror, name)
        assert expected == actual
def __init__(self, spec):
    """Create a service entry; ``spec`` must supply ``name`` and ``location``.

    Raises:
        KeyError: if either required key is missing from ``spec``.
    """
    Data.__init__(self, spec)
    # Pop from a copy so the caller's mapping stays intact.
    fields = spec.copy()
    self.name = fields.pop("name")
    self.location = fields.pop("location")
    self.tag = fields.get("tag", "service")
    # Record the creation time of this entry.
    self.stamp = time.time()
def __init__(self, spec):
    """Create a service entry; ``spec`` must supply ``name`` and ``location``.

    Raises:
        KeyError: if either required key is missing from ``spec``.
    """
    Data.__init__(self, spec)
    # Pop from a copy so the caller's mapping stays intact.
    fields = spec.copy()
    self.name = fields.pop("name")
    self.location = fields.pop("location")
    self.tag = fields.get("tag", "service")
    # Record the creation time of this entry.
    self.stamp = time.time()
def __init__(self, spec):
    """Create a job record from ``spec``.

    ``spec["jobid"]`` must be convertible with ``int()`` and ``score``
    (when given) with ``float()``; the other keys read here fall back
    to defaults when absent.
    """
    Data.__init__(self, spec)
    self.tag = 'job'
    lookup = spec.get

    # Fields initialized at the beginning of the simulation.
    self.jobid = int(lookup("jobid"))
    self.queue = lookup("queue", "default")
    self.submittime = lookup("submittime")  # in seconds
    self.walltime = lookup("walltime")  # in minutes
    self.walltime_p = lookup("walltime_p")  # *AdjEst*
    self.user = lookup("user", "unknown")
    self.project = lookup("project", "unknown")
    self.nodes = lookup("nodes", 0)
    self.runtime = lookup("runtime", 0)
    self.remain_time = float(self.runtime)
    self.start_time = lookup('start_time', '0')
    self.end_time = lookup('end_time', '0')

    # Coscheduling only:
    #   last_hold  - time (unix sec) the job started its latest hold
    #   hold_time  - length of the period during which the job is holding
    #   yield_time - time the job first yields (spec key 'first_yield')
    self.last_hold = lookup('last_hold', 0)
    self.hold_time = 0
    self.yield_time = lookup('first_yield', 0)

    self.state = lookup("state", "invisible")
    self.system_state = ''
    self.starttime = 0
    self.has_resources = False
    self.is_runnable = lookup("is_runnable", False)
    self.is_visible = False
    self.score = float(lookup("score", 0.0))
    self.attrs = lookup("attrs", {})
    self.args = []
    self.progress = 0
    self.recovering = False
    # original location read from job trace, used for job reservation
    self.location = lookup('location', '')

    # samnickolay: pricing / slowdown-quote state, all starting unset.
    self.pricing_queue_position = -1
    self.original_pricing_queue_position = -1
    self.max_price = -1
    self.max_slowdown = -1
    self.price_slowdown_quotes = []
    self.originally_realtime = False
    self.quoted_price = -1
    self.quoted_slowdown = -1
    self.quoted_slowdown_time = 1.1
    self.estimated_slowdown_at_runtime = -1
    self.in_high_priority_queue = False
    self.original_log_runtime = lookup("original_log_runtime", 0)
def __init__(self, spec):
    """Create a resource from ``spec``; its ``size`` is always 1."""
    Data.__init__(self, spec)
    self.tag = "Resource"
    lookup = spec.get
    self.functional = lookup("functional", False)
    self.name = lookup("name", None)
    self.queue = lookup("queue", "default")
    self.scheduled = lookup("scheduled", False)
    # Fixed value, not read from spec.
    self.size = 1
    self.state = lookup("state", "idle")
    self.attributes = lookup("attributes", {})
def __init__(self, spec):
    """Initialize the record's fields from ``spec``.

    Reads ``machine``, ``type``, ``datetime``, ``unixtime``, ``jobid``
    and ``location``; each has a default when the key is absent.
    """
    # NOTE(review): the previous docstring said "Initialize a new
    # partition", which does not match the fields set here -- it looks
    # copied from another class; confirm what this record represents.
    Data.__init__(self, spec)
    fields = spec.copy()
    lookup = fields.get
    self.machine = lookup("machine", 0)
    self.type = lookup("type", "I")
    self.datetime = lookup("datetime", None)
    self.unixtime = lookup("unixtime", None)
    self.jobid = lookup("jobid", 0)
    self.location = lookup("location", {})
def update(self, spec):
    """Apply ``spec`` to this reservation, syncing ``users`` first.

    When ``spec`` changes ``users``, the new value is pushed to the
    queue manager before anything is stored locally.

    Raises:
        ComponentLookupError: the queue manager could not be contacted;
            the reservation is left unmodified in that case.
    """
    if "users" in spec:
        queue_manager = ComponentProxy(self.COMP_QUEUE_MANAGER)
        try:
            queue_manager.set_queues(
                [{'name': self.queue}],
                {'users': spec['users']},
                "bgsched",
            )
        except ComponentLookupError:
            logger.error("unable to contact queue manager when updating reservation users")
            raise
    # Reached only if the queue manager accepted the change (or there
    # was none to push), so the local users never drift from the queue.
    Data.update(self, spec)
def __init__(self, spec):
    """Create a reservation; ``spec`` must supply ``name`` and ``start``.

    Raises:
        KeyError: if ``name`` or ``start`` is missing from ``spec``.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = lookup("tag", "reservation")
    self.cycle = lookup("cycle")
    self.users = lookup("users", "")
    self.createdQueue = False
    self.partitions = lookup("partitions", "")

    # Required keys: indexing (not .get) so a missing one raises.
    self.name = spec['name']
    self.start = spec['start']

    # The queue name defaults to "R.<reservation name>".
    default_queue = "R.%s" % self.name
    self.queue = lookup("queue", default_queue)
    self.duration = lookup("duration")
def __init__(self, spec):
    """Create a reservation; ``spec`` must supply ``name`` and ``start``.

    Raises:
        KeyError: if ``name`` or ``start`` is missing from ``spec``.
    """
    Data.__init__(self, spec)
    lookup = spec.get
    self.tag = lookup("tag", "reservation")
    self.cycle = lookup("cycle")
    self.users = lookup("users", "")
    self.createdQueue = False
    self.partitions = lookup("partitions", "")

    # Required keys: indexing (not .get) so a missing one raises.
    self.name = spec['name']
    self.start = spec['start']

    # The queue name defaults to "R.<reservation name>".
    default_queue = "R.%s" % self.name
    self.queue = lookup("queue", default_queue)
    self.duration = lookup("duration")
def update(self, spec):
    """Apply ``spec`` to this reservation, syncing ``users`` first.

    When ``spec`` changes ``users``, the new value is pushed to the
    queue manager before anything is stored locally.

    Raises:
        ComponentLookupError: the queue manager could not be contacted;
            the reservation is left unmodified in that case.
    """
    if "users" in spec:
        queue_manager = ComponentProxy(self.COMP_QUEUE_MANAGER)
        try:
            queue_manager.set_queues(
                [{'name': self.queue}],
                {'users': spec['users']},
                "bgsched",
            )
        except ComponentLookupError:
            logger.error("unable to contact queue manager when updating reservation users")
            raise
    # Reached only if the queue manager accepted the change (or there
    # was none to push), so the local users never drift from the queue.
    Data.update(self, spec)
def __init__(self, spec):
    """Create a job record from ``spec``.

    ``spec["jobid"]`` must be convertible with ``int()`` and ``score``
    (when given) with ``float()``; the other keys read here fall back
    to defaults when absent.
    """
    Data.__init__(self, spec)
    self.tag = 'job'
    lookup = spec.get

    # Fields initialized at the beginning of the simulation.
    self.jobid = int(lookup("jobid"))
    self.queue = lookup("queue", "default")
    self.submittime = lookup("submittime")  # in seconds
    self.walltime = lookup("walltime")  # in minutes
    self.walltime_p = lookup("walltime_p")  # *AdjEst*
    self.user = lookup("user", "unknown")
    self.project = lookup("project", "unknown")
    self.nodes = lookup("nodes", 0)
    self.runtime = lookup("runtime", 0)
    self.remain_time = float(self.runtime)
    self.start_time = lookup('start_time', '0')
    self.end_time = lookup('end_time', '0')

    # Coscheduling only:
    #   last_hold  - time (unix sec) the job started its latest hold
    #   hold_time  - length of the period during which the job is holding
    #   yield_time - time the job first yields (spec key 'first_yield')
    self.last_hold = lookup('last_hold', 0)
    self.hold_time = 0
    self.yield_time = lookup('first_yield', 0)

    self.state = lookup("state", "invisible")
    self.system_state = ''
    self.starttime = 0
    self.has_resources = False
    self.is_runnable = lookup("is_runnable", False)
    self.is_visible = False
    self.score = float(lookup("score", 0.0))
    self.attrs = lookup("attrs", {})
    self.args = []
    self.progress = 0
    self.recovering = False
    self.location = []
    self.torus = lookup("torus", False)
def __init__(self, spec):
    """Create a reservation; ``spec`` must supply ``name`` and ``start``.

    A cyclic reservation (truthy ``cycle``) keeps the ``cycle_id`` given
    in ``spec`` or, when none is given, draws a new one from the shared
    generator.  Non-cyclic reservations get ``cycle_id = None``.

    Raises:
        KeyError: if ``name`` or ``start`` is missing from ``spec``.
    """
    Data.__init__(self, spec)
    self.tag = spec.get("tag", "reservation")
    self.cycle = spec.get("cycle")
    self.users = spec.get("users", "")
    self.createdQueue = False
    self.partitions = spec.get("partitions", "")
    self.name = spec['name']
    self.start = spec['start']
    # The queue name defaults to "R.<reservation name>".
    self.queue = spec.get("queue", "R.%s" % self.name)
    self.duration = spec.get("duration")
    self.res_id = spec.get("res_id")
    self.cycle_id_gen = bgsched_cycle_id_gen

    if not self.cycle:
        self.cycle_id = None
    elif "cycle_id" in spec:
        self.cycle_id = spec["cycle_id"]
    else:
        # Only draw from the generator when an id is really needed.
        # Passing the call as the default of spec.get() evaluated it
        # eagerly and used up an id even when spec already had one.
        self.cycle_id = self.cycle_id_gen.get()

    self.running = False
    self.project = spec.get("project", None)
def __init__(self, spec):
    """Create a reservation; ``spec`` must supply ``name`` and ``start``.

    A cyclic reservation (truthy ``cycle``) keeps the ``cycle_id`` given
    in ``spec`` or, when none is given, draws a new one from the shared
    generator.  Non-cyclic reservations get ``cycle_id = None``.

    Raises:
        KeyError: if ``name`` or ``start`` is missing from ``spec``.
    """
    Data.__init__(self, spec)
    self.tag = spec.get("tag", "reservation")
    self.cycle = spec.get("cycle")
    self.users = spec.get("users", "")
    self.createdQueue = False
    self.partitions = spec.get("partitions", "")
    self.name = spec['name']
    self.start = spec['start']
    # The queue name defaults to "R.<reservation name>".
    self.queue = spec.get("queue", "R.%s" % self.name)
    self.duration = spec.get("duration")
    self.res_id = spec.get("res_id")
    self.cycle_id_gen = bgsched_cycle_id_gen

    if not self.cycle:
        self.cycle_id = None
    elif "cycle_id" in spec:
        self.cycle_id = spec["cycle_id"]
    else:
        # Only draw from the generator when an id is really needed.
        # Passing the call as the default of spec.get() evaluated it
        # eagerly and used up an id even when spec already had one.
        self.cycle_id = self.cycle_id_gen.get()

    self.running = False
    self.project = spec.get("project", None)
def __init__(self, spec):
    """Create a job record from ``spec``.

    ``spec["jobid"]`` must be convertible with ``int()`` and ``score``
    (when given) with ``float()``; the other keys read here fall back
    to defaults when absent.
    """
    Data.__init__(self, spec)
    self.tag = 'job'
    lookup = spec.get

    # Fields initialized at the beginning of the simulation.
    self.jobid = int(lookup("jobid"))
    self.queue = lookup("queue", "default")
    self.submittime = lookup("submittime")  # in seconds
    self.walltime = lookup("walltime")  # in minutes
    self.walltime_p = lookup("walltime_p")  # *AdjEst*
    self.user = lookup("user", "unknown")
    self.project = lookup("project", "unknown")
    self.nodes = lookup("nodes", 0)
    self.runtime = lookup("runtime", 0)
    self.remain_time = float(self.runtime)
    self.start_time = lookup('start_time', '0')
    self.end_time = lookup('end_time', '0')

    # Coscheduling only:
    #   last_hold  - time (unix sec) the job started its latest hold
    #   hold_time  - length of the period during which the job is holding
    #   yield_time - time the job first yields (spec key 'first_yield')
    self.last_hold = lookup('last_hold', 0)
    self.hold_time = 0
    self.yield_time = lookup('first_yield', 0)

    self.state = lookup("state", "invisible")
    self.system_state = ''
    self.starttime = 0
    self.has_resources = False
    self.is_runnable = lookup("is_runnable", False)
    self.is_visible = False
    self.score = float(lookup("score", 0.0))
    self.attrs = lookup("attrs", {})
    self.args = []
    self.progress = 0
    self.recovering = False
    # original location read from job trace, used for job reservation
    self.location = lookup('location', '')

    # I/O figures from spec, each defaulting to 0.
    self.io_cnt = lookup('io_cnt', 0)
    self.io_size = lookup('io_size', 0)
    self.io_frac = lookup('io_frac', 0)
def update(self, spec):
    """Apply the attribute changes in ``spec`` to this reservation.

    A ``users`` change is pushed to the queue manager first; if that
    component cannot be reached the error is logged and re-raised before
    anything on the reservation is modified.  The control keys
    ``__cmd_user`` and ``defer`` are stripped before the remaining keys
    are handed to ``Data.update``.

    The caller's ``spec`` is left unmodified: all edits are made on a
    shallow copy, so one dict can safely be reused for several updates.

    Raises:
        ComponentLookupError: the queue manager could not be contacted.
    """
    if "users" in spec:
        qm = ComponentProxy("queue-manager")
        try:
            qm.set_queues([{'name': self.queue}], {'users': spec['users']},
                          "bgsched")
        except ComponentLookupError:
            logger.error("unable to contact queue manager when updating reservation users")
            raise
    # try the above first -- if we can't contact the queue-manager,
    # don't update the users

    # Everything below edits the spec, so switch to a private copy.
    spec = spec.copy()

    if 'cycle' in spec and not self.cycle:
        # just turned this into a cyclic reservation and need a cycle_id
        spec['cycle_id'] = self.cycle_id_gen.get()

    # user name of whoever issued the command (None when not supplied)
    user_name = spec.pop('__cmd_user', None)

    # if we're deferring, pull out the 'defer' entry and send a cobalt db
    # message; there really isn't a corresponding field to update
    deferred = 'defer' in spec
    if deferred:
        logger.info("Res %s/%s: Deferring cyclic reservation: %s",
                    self.res_id, self.cycle_id, self.name)
        dbwriter.log_to_db(user_name, "deferred", "reservation", self)
        del spec['defer']

    Data.update(self, spec)

    # NOTE(review): the original comment says the "modifying" record is
    # only wanted when we are not deferring (the cycle creates the new
    # data object otherwise), yet this condition also fires for a
    # deferred reservation that is not running -- confirm the intent.
    if not deferred or not self.running:
        dbwriter.log_to_db(user_name, "modifying", "reservation", self)
def __init__(self, spec):
    """Populate a process group's attributes from ``spec``."""
    Data.__init__(self, spec)
    self.tag = "process group"
    lookup = spec.get

    # What to run and where.
    self.args = lookup("args", [])
    self.cobalt_log_file = lookup("cobalt_log_file")
    self.cwd = lookup("cwd")
    self.env = lookup("env", {})
    self.executable = lookup("executable")

    # Not taken from spec; both start out unset.
    self.exit_status = None
    self.head_pid = None

    self.id = lookup("id")
    self.jobid = lookup("jobid")

    # Kernel selection; the ion_* keys have explicit defaults.
    self.kernel = lookup("kernel")
    self.kerneloptions = lookup("kerneloptions")
    self.ion_kernel = lookup("ion_kernel", "default")
    self.ion_kerneloptions = lookup("ion_kerneloptions", None)

    self.location = lookup("location", [])
    self.mode = lookup("mode")
    self.nodefile = None
    self.size = lookup("size")

    # Standard stream targets and process environment.
    self.stderr = lookup("stderr")
    self.stdin = lookup("stdin")
    self.stdout = lookup("stdout")
    self.umask = lookup("umask")
    self.user = lookup("user", "")

    # Timing and identification of the run.
    self.starttime = lookup("starttime")
    self.walltime = lookup("walltime")
    self.killtime = lookup("killtime")
    self.resid = lookup("resid", None)
    self.runid = lookup("runid", None)
    self.forker = lookup("forker", None)

    # Layout-related keys, all optional.
    self.ranks_per_node = lookup("ranks_per_node", None)
    self.subblock = lookup("subblock", False)
    self.subblock_parent = lookup("subblock_parent", None)
    self.corner = lookup("corner", None)
    self.extents = lookup("extents", None)
    self.attrs = lookup("attrs", {})
def test_match(self):
    record = Data({'tag': "somedata"})

    # A wildcard and the exact value must both match.
    for accepted in ({'tag': "*"}, {'tag': "somedata"}):
        assert record.match(accepted)

    # A different value, or an attribute the object lacks, must not.
    rejected_specs = (
        {'tag': "someotherdata"},
        {'tag': "somedata", 'not_an_attribute': "someotherdata"},
    )
    for rejected in rejected_specs:
        assert not record.match(rejected)
def test_get_default(self):
    # Keep DeprecationWarning from interfering with the assertions
    # (presumably raised by Data.get -- not visible from here).
    warnings.simplefilter("ignore", DeprecationWarning)
    record = Data({'tag': "somedata"})
    fallback = "default_value"
    # An existing attribute wins over the default ...
    assert record.get("tag", fallback) == "somedata"
    # ... and an unknown one yields the default.
    assert record.get("not_an_attribute", fallback) == fallback
def test_get_default(self):
    # Keep DeprecationWarning from interfering with the assertions
    # (presumably raised by Data.get -- not visible from here).
    warnings.simplefilter("ignore", DeprecationWarning)
    record = Data({'tag': "somedata"})
    fallback = "default_value"
    # An existing attribute wins over the default ...
    assert record.get("tag", fallback) == "somedata"
    # ... and an unknown one yields the default.
    assert record.get("not_an_attribute", fallback) == fallback
def test_update(self):
    # Silence both warning categories before exercising update().
    for category in (DeprecationWarning, RuntimeWarning):
        warnings.simplefilter("ignore", category)
    record = Data({'tag': "somedata"})
    replacement = {'tag': "someotherdata"}
    record.update(replacement)
    assert record.tag == "someotherdata"
def test_get(self):
    # Silence both warning categories before exercising get().
    for category in (DeprecationWarning, RuntimeWarning):
        warnings.simplefilter("ignore", category)
    record = Data({'tag': "somedata"})
    fetched = record.get("tag")
    assert fetched == "somedata"
def __init__(self, spec): Data.__init__(self, spec) spec = spec.copy() self.tag = spec.get("tag", "process-group") self.umask = spec.get('umask', 022) self.name = spec.pop("name", None) self.location = spec.pop("location", None) self.state = spec.pop("state", 'running') self.user = spec.pop("user", None) self.stdout = spec.pop("stdout", None) self.stderr = spec.pop("stderr", None) self.cobalt_log_file = spec.get('cobalt_log_file') self.executable = spec.pop("executable", None) self.jobid = spec.pop("jobid", None) self.path = spec.pop("path", None) self.cwd = spec.pop("cwd", None) self.args = spec.pop("args", []) self.env = spec.pop("env", None) self.stdin = spec.pop("stdin", None) self.kerneloptions = spec.pop("kerneloptions", None) self.job_size = spec.pop("size", None) self.id = spec.get("id") self.mpi_system_id = None self.exit_status = None self.log = logging.getLogger('pg') try: tmp_info = pwd.getpwnam(self.user) userid = tmp_info[2] groupid = tmp_info[3] home_dir = tmp_info[5] except KeyError: raise ProcessGroupCreationError, "user/group" if self.stdout is not None: self.outlog = self.stdout else: self.outlog = tempfile.mktemp() if self.stderr is not None: self.errlog = self.stderr else: self.errlog = tempfile.mktemp() self.pid = os.fork() if not self.pid: program = self.executable self.t = tempfile.NamedTemporaryFile() self.t.write("\n".join(self.location) + '\n') self.t.flush() # create a nodefile in /tmp os.environ['COBALT_NODEFILE'] = self.t.name os.environ["COBALT_JOBID"] = str(self.jobid) os.environ["COBALT_PARTNAME"] = self.location[0] os.environ["COBALT_JOBSIZE"] = str(self.job_size) os.environ['USER'] = self.user os.environ['HOME'] = home_dir # get supplementary groups supplementary_group_ids = [] for g in grp.getgrall(): if self.user in g.gr_mem: supplementary_group_ids.append(g.gr_gid) try: os.setgroups([]) os.setgroups(supplementary_group_ids) except: self.log.error("Failed to set supplementary groups for PG %s", self.jobid, exc_info=True) try: 
os.setgid(groupid) os.setuid(userid) except OSError: self.log.error("Failed to change userid/groupid for PG %s" % (self.jobid)) sys.exit(0) try: os.umask(self.umask) except: self.log.error("Failed to set umask to %s" % self.umask) try: err = open(self.errlog, 'a') os.dup2(err.fileno(), sys.__stderr__.fileno()) except IOError: self.log.error("Job %s/%s: Failed to open stderr file %s. Stderr will be lost" % (self.jobid, self.user, self.errlog)) except OSError: self.log.error("Job %s/%s: Failed to chmod or dup2 file %s. Stderr will be lost" % (self.jobid, self.user, self.errlog)) try: out = open(self.outlog, 'a') os.dup2(out.fileno(), sys.__stdout__.fileno()) except IOError: self.log.error("Job %s/%s: Failed to open stdout file %s. Stdout will be lost" % (self.jobid, self.user, self.outlog)) except OSError: self.log.error("Job %s/%s: Failed to chmod or dup2 file %s. Stdout will be lost" % (self.jobid, self.user, self.errlog)) cmd = [self.executable, self.executable] + self.args chdir_error = "" try: os.chdir(self.cwd) except: self.log.error("Job %s/%s: unable to set cwd to %s" % (self.jobid, self.user, self.cwd)) chdir_error = "unable to set cwd to %s" % self.cwd try: cobalt_log_file = open(self.cobalt_log_file or "/dev/null", "a") if chdir_error: print >> cobalt_log_file, chdir_error + "\n" print >> cobalt_log_file, "%s\n" % " ".join(cmd[1:]) print >> cobalt_log_file, "called with environment:\n" for key in os.environ: print >> cobalt_log_file, "%s=%s" % (key, os.environ[key]) print >> cobalt_log_file, "\n" cobalt_log_file.close() except: self.log.error("Job %s/%s: unable to open cobaltlog file %s" % (self.jobid, self.user, self.cobalt_log_file)) try: os.execl(*cmd) except Exception, e: self.log.error("Job %s/%s: Something went wrong in starting the script job." % (self.jobid, self.user), exc_info=1) os._exit(1)
def test_to_rx(self):
    # to_rx() returns exactly the requested keys, with None standing in
    # for an attribute the object does not have.
    requested = ("tag", "otherattribute")
    record = Data({'tag': "somedata"})
    exported = record.to_rx(list(requested))
    assert set(exported.keys()) == set(requested)
    assert exported['tag'] == "somedata"
    assert exported['otherattribute'] is None
def test_match(self):
    record = Data({'tag': "somedata"})

    # A wildcard and the exact value must both match.
    for accepted in ({'tag': "*"}, {'tag': "somedata"}):
        assert record.match(accepted)

    # A different value, or an attribute the object lacks, must not.
    rejected_specs = (
        {'tag': "someotherdata"},
        {'tag': "somedata", 'not_an_attribute': "someotherdata"},
    )
    for rejected in rejected_specs:
        assert not record.match(rejected)
def test_get(self):
    # Silence both warning categories before exercising get().
    for category in (DeprecationWarning, RuntimeWarning):
        warnings.simplefilter("ignore", category)
    record = Data({'tag': "somedata"})
    fetched = record.get("tag")
    assert fetched == "somedata"
def test_to_rx(self):
    # to_rx() returns exactly the requested keys, with None standing in
    # for an attribute the object does not have.
    requested = ("tag", "otherattribute")
    record = Data({'tag': "somedata"})
    exported = record.to_rx(list(requested))
    assert set(exported.keys()) == set(requested)
    assert exported['tag'] == "somedata"
    assert exported['otherattribute'] is None
def test_update(self):
    # Silence both warning categories before exercising update().
    for category in (DeprecationWarning, RuntimeWarning):
        warnings.simplefilter("ignore", category)
    record = Data({'tag': "somedata"})
    replacement = {'tag': "someotherdata"}
    record.update(replacement)
    assert record.tag == "someotherdata"
def __init__(self, spec): Data.__init__(self, spec) spec = spec.copy() self.tag = spec.get("tag", "process-group") self.umask = spec.get('umask', 022) self.name = spec.pop("name", None) self.location = spec.pop("location", None) self.state = spec.pop("state", 'running') self.user = spec.pop("user", None) self.stdout = spec.pop("stdout", None) self.stderr = spec.pop("stderr", None) self.cobalt_log_file = spec.get('cobalt_log_file') self.executable = spec.pop("executable", None) self.jobid = spec.pop("jobid", None) self.path = spec.pop("path", None) self.cwd = spec.pop("cwd", None) self.args = spec.pop("args", []) self.env = spec.pop("env", None) self.stdin = spec.pop("stdin", None) self.kerneloptions = spec.pop("kerneloptions", None) self.job_size = spec.pop("size", None) self.id = spec.get("id") self.mpi_system_id = None self.exit_status = None self.log = logging.getLogger('pg') try: tmp_info = pwd.getpwnam(self.user) userid = tmp_info[2] groupid = tmp_info[3] home_dir = tmp_info[5] except KeyError: raise ProcessGroupCreationError, "user/group" if self.stdout is not None: self.outlog = self.stdout else: self.outlog = tempfile.mktemp() if self.stderr is not None: self.errlog = self.stderr else: self.errlog = tempfile.mktemp() self.pid = os.fork() if not self.pid: program = self.executable self.t = tempfile.NamedTemporaryFile() self.t.write("\n".join(self.location) + '\n') self.t.flush() # create a nodefile in /tmp os.environ['COBALT_NODEFILE'] = self.t.name os.environ["COBALT_JOBID"] = str(self.jobid) os.environ["COBALT_PARTNAME"] = self.location[0] os.environ["COBALT_JOBSIZE"] = str(self.job_size) os.environ['USER'] = self.user os.environ['HOME'] = home_dir # get supplementary groups supplementary_group_ids = [] for g in grp.getgrall(): if self.user in g.gr_mem: supplementary_group_ids.append(g.gr_gid) try: os.setgroups([]) os.setgroups(supplementary_group_ids) except: self.log.error("Failed to set supplementary groups for PG %s", self.jobid, exc_info=True) try: 
os.setgid(groupid) os.setuid(userid) except OSError: self.log.error("Failed to change userid/groupid for PG %s" % (self.jobid)) sys.exit(0) try: os.umask(self.umask) except: self.log.error("Failed to set umask to %s" % self.umask) try: err = open(self.errlog, 'a') os.dup2(err.fileno(), sys.__stderr__.fileno()) except IOError: self.log.error( "Job %s/%s: Failed to open stderr file %s. Stderr will be lost" % (self.jobid, self.user, self.errlog)) except OSError: self.log.error( "Job %s/%s: Failed to chmod or dup2 file %s. Stderr will be lost" % (self.jobid, self.user, self.errlog)) try: out = open(self.outlog, 'a') os.dup2(out.fileno(), sys.__stdout__.fileno()) except IOError: self.log.error( "Job %s/%s: Failed to open stdout file %s. Stdout will be lost" % (self.jobid, self.user, self.outlog)) except OSError: self.log.error( "Job %s/%s: Failed to chmod or dup2 file %s. Stdout will be lost" % (self.jobid, self.user, self.errlog)) cmd = [self.executable, self.executable] + self.args chdir_error = "" try: os.chdir(self.cwd) except: self.log.error("Job %s/%s: unable to set cwd to %s" % (self.jobid, self.user, self.cwd)) chdir_error = "unable to set cwd to %s" % self.cwd try: cobalt_log_file = open(self.cobalt_log_file or "/dev/null", "a") if chdir_error: print >> cobalt_log_file, chdir_error + "\n" print >> cobalt_log_file, "%s\n" % " ".join(cmd[1:]) print >> cobalt_log_file, "called with environment:\n" for key in os.environ: print >> cobalt_log_file, "%s=%s" % (key, os.environ[key]) print >> cobalt_log_file, "\n" cobalt_log_file.close() except: self.log.error("Job %s/%s: unable to open cobaltlog file %s" % (self.jobid, self.user, self.cobalt_log_file)) try: os.execl(*cmd) except Exception, e: self.log.error( "Job %s/%s: Something went wrong in starting the script job." % (self.jobid, self.user), exc_info=1) os._exit(1)