class BGSystem (BGBaseSystem):
    """Blue Gene system component.

    Methods:
    configure -- load partitions from the bridge API
    add_process_groups -- add (start) an mpirun process on the system (exposed, ~query)
    get_process_groups -- retrieve mpirun processes (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- update partition state from the bridge API (runs as a thread)
    """

    name = "system"
    implementation = "bgsystem"

    logger = logger

    _configfields = ['diag_script_location', 'diag_log_file', 'kernel']
    _config = ConfigParser.ConfigParser()
    _config.read(Cobalt.CONFIG_FILES)
    if not _config._sections.has_key('bgsystem'):
        print '''"bgsystem" section missing from cobalt config file'''
        sys.exit(1)
    config = _config._sections['bgsystem']
    mfields = [field for field in _configfields if not config.has_key(field)]
    if mfields:
        print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (" ".join(mfields))
        sys.exit(1)
    if config.get('kernel') == "true":
        _kernel_configfields = ['bootprofiles', 'partitionboot']
        mfields = [field for field in _kernel_configfields if not config.has_key(field)]
        if mfields:
            print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (" ".join(mfields))
            sys.exit(1)

    def __init__ (self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        sys.setrecursionlimit(5000)
        self.process_groups.item_cls = BGProcessGroup
        self.diag_pids = dict()
        self.configure()
        # initiate the process before starting any threads
        thread.start_new_thread(self.update_partition_state, tuple())

    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] = (sched, func, queue)
        return {'managed_partitions': self._managed_partitions, 'version': 1,
                'partition_flags': flags}

    def __setstate__(self, state):
        sys.setrecursionlimit(5000)
        self._managed_partitions = state['managed_partitions']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.diag_pids = dict()
        self.pending_script_waits = sets.Set()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []

        self.configure()
        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        # initiate the process before starting any threads
        thread.start_new_thread(self.update_partition_state, tuple())
        self.lock = threading.Lock()
        self.statistics = Statistics()

    def save_me(self):
        Component.save(self)
    save_me = automatic(save_me)

    def _get_node_card(self, name, state):
        if not self.node_card_cache.has_key(name):
            self.node_card_cache[name] = NodeCard(name, state)

        return self.node_card_cache[name]
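
    # The node card cache gives every partition that references the same
    # physical node card a single shared NodeCard object, so a state or
    # used_by update made while polling the bridge is immediately visible to
    # all containing partitions.  A minimal sketch (card name hypothetical):
    #
    #     nc1 = self._get_node_card("R00-M0-N00", "RM_NODECARD_UP")
    #     nc2 = self._get_node_card("R00-M0-N00", "RM_NODECARD_UP")
    #     assert nc1 is nc2    # one shared object, not two copies
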
    def _new_partition_dict(self, partition_def, bp_cache={}):
        # NOTE: this 32 is not really a constant -- it needs to either be read
        # from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        node_list = []
        if partition_def.small:
            bp_name = partition_def.base_partitions[0].id
            for nc in partition_def._node_cards:
                node_list.append(self._get_node_card(bp_name + "-" + nc.id, nc.state))
        else:
            try:
                for bp in partition_def.base_partitions:
                    if bp.id not in bp_cache:
                        bp_cache[bp.id] = []
                        for nc in Cobalt.bridge.NodeCardList.by_base_partition(bp):
                            bp_cache[bp.id].append(self._get_node_card(bp.id + "-" + nc.id, nc.state))
                    node_list += bp_cache[bp.id]
            except BridgeException:
                print "Error communicating with the bridge during initial config. Terminating."
                sys.exit(1)

        d = dict(
            name = partition_def.id,
            queue = "default",
            size = NODES_PER_NODECARD * len(node_list),
            bridge_partition = partition_def,
            node_cards = node_list,
            switches = [s.id for s in partition_def.switches],
            state = _get_state(partition_def),
        )
        return d

    def _detect_wiring_deps(self, partition, wiring_cache={}):
        def _kernel():
            s2 = sets.Set(p.switches)
            if s1.intersection(s2):
                p._wiring_conflicts.add(partition.name)
                partition._wiring_conflicts.add(p.name)
                self.logger.debug("%s and %s have a wiring conflict" % (partition.name, p.name))

        s1 = sets.Set(partition.switches)

        if wiring_cache.has_key(partition.size):
            for p in wiring_cache[partition.size]:
                if partition.name != p.name:
                    _kernel()
        else:
            wiring_cache[partition.size] = [partition]
            for p in self._partitions.values():
                if p.size == partition.size and partition.name != p.name:
                    wiring_cache[partition.size].append(p)
                    _kernel()

    def configure(self):
        """Read partition data from the bridge."""
        self.logger.info("configure()")
        try:
            system_def = Cobalt.bridge.PartitionList.by_filter()
        except BridgeException:
            print "Error communicating with the bridge during initial config. Terminating."
            sys.exit(1)

        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()

        tmp_list = []
        wiring_cache = {}
        bp_cache = {}

        for partition_def in system_def:
            tmp_list.append(self._new_partition_dict(partition_def, bp_cache))

        partitions.q_add(tmp_list)

        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)

        # find the wiring deps
        start = time.time()
        for p in self._partitions.values():
            self._detect_wiring_deps(p, wiring_cache)
        end = time.time()
        self.logger.info("took %f seconds to find wiring deps" % (end - start))

        # update state information
        for p in self._partitions.values():
            if p.state != "busy":
                for nc in p.node_cards:
                    if nc.used_by:
                        p.state = "blocked (%s)" % nc.used_by
                        break
                for dep_name in p._wiring_conflicts:
                    if self._partitions[dep_name].state == "busy":
                        p.state = "blocked-wiring (%s)" % dep_name
                        break
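
    # Two partitions conflict when they are the same size and their switch
    # sets intersect; _detect_wiring_deps records the conflict symmetrically
    # in both partitions' _wiring_conflicts sets.  A minimal sketch, with
    # hypothetical switch ids:
    #
    #     s1 = sets.Set(["R00-M0-S0", "R00-M0-S1"])   # partition A
    #     s2 = sets.Set(["R00-M0-S1", "R00-M1-S0"])   # partition B
    #     s1.intersection(s2)   # -> Set(["R00-M0-S1"]): A and B conflict
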
    def update_partition_state(self):
        """Use the quicker bridge method that doesn't return nodecard
        information to update the states of the partitions."""

        def _start_partition_cleanup(p):
            self.logger.info("partition %s: marking partition for cleaning", p.name)
            p.cleanup_pending = True
            partitions_cleanup.append(p)
            _set_partition_cleanup_state(p)
            p.reserved_until = False
            p.reserved_by = None
            p.used_by = None

        def _set_partition_cleanup_state(p):
            p.state = "cleanup"
            for part in p._children:
                if part.bridge_partition.state == "RM_PARTITION_FREE":
                    part.state = "blocked (%s)" % (p.name,)
                else:
                    part.state = "cleanup"
            for part in p._parents:
                if part.state == "idle":
                    part.state = "blocked (%s)" % (p.name,)

        while True:
            try:
                system_def = Cobalt.bridge.PartitionList.info_by_filter()
            except BridgeException:
                self.logger.error("Error communicating with the bridge to update partition state information.")
                self.bridge_in_error = True
                time.sleep(5) # wait a little bit...
                continue # then try again

            try:
                bg_object = Cobalt.bridge.BlueGene.by_serial()
                for bp in bg_object.base_partitions:
                    for nc in Cobalt.bridge.NodeCardList.by_base_partition(bp):
                        self.node_card_cache[bp.id + "-" + nc.id].state = nc.state
            except:
                self.logger.error("Error communicating with the bridge to update nodecard state information.")
                self.bridge_in_error = True
                time.sleep(5) # wait a little bit...
                continue # then try again

            self.bridge_in_error = False
            busted_switches = []
            for s in bg_object.switches:
                if s.state != "RM_SWITCH_UP":
                    busted_switches.append(s.id)

            # set all of the nodecards to not busy
            for nc in self.node_card_cache.values():
                nc.used_by = ''

            # update the state of each partition
            self._partitions_lock.acquire()
            now = time.time()
            partitions_cleanup = []
            self.offline_partitions = []
            missing_partitions = set(self._partitions.keys())
            new_partitions = []
            try:
                for partition in system_def:
                    missing_partitions.discard(partition.id)
                    if self._partitions.has_key(partition.id):
                        p = self._partitions[partition.id]
                        p.state = _get_state(partition)
                        p.bridge_partition = partition
                        p._update_node_cards()
                        if p.reserved_until and now > p.reserved_until:
                            p.reserved_until = False
                            p.reserved_by = None
                    else:
                        new_partitions.append(partition)

                # remove the missing partitions and their wiring relations
                for pname in missing_partitions:
                    self.logger.info("missing partition removed: %s", pname)
                    p = self._partitions[pname]
                    for dep_name in p._wiring_conflicts:
                        self.logger.debug("removing wiring dependency from: %s", dep_name)
                        self._partitions[dep_name]._wiring_conflicts.discard(p.name)
                    if p.name in self._managed_partitions:
                        self._managed_partitions.discard(p.name)
                    del self._partitions[p.name]

                bp_cache = {}
                wiring_cache = {}
                # throttle the adding of new partitions so updating of
                # machine state doesn't get bogged down
                for partition in new_partitions[:8]:
                    self.logger.info("new partition found: %s", partition.id)
                    bridge_p = Cobalt.bridge.Partition.by_id(partition.id)
                    self._partitions.q_add([self._new_partition_dict(bridge_p, bp_cache)])
                    p = self._partitions[bridge_p.id]
                    p.bridge_partition = partition
                    self._detect_wiring_deps(p, wiring_cache)

                # if partitions were added or removed, then update the relationships between partitions
                if len(missing_partitions) > 0 or len(new_partitions) > 0:
                    self.update_relatives()

                for p in self._partitions.values():
                    if p.cleanup_pending:
                        if p.used_by:
                            # if the partition has a pending cleanup request, then set the state
                            # so that cleanup will be performed
                            _start_partition_cleanup(p)
                        else:
                            # if the cleanup has already been initiated, then see how it's going
                            busy = []
                            parts = list(p._all_children)
                            parts.append(p)
                            for part in parts:
                                if part.bridge_partition.state != "RM_PARTITION_FREE":
                                    busy.append(part.name)
                            if len(busy) > 0:
                                _set_partition_cleanup_state(p)
                                self.logger.info("partition %s: still cleaning; busy partition(s): %s",
                                    p.name, ", ".join(busy))
                            else:
                                p.cleanup_pending = False
                                self.logger.info("partition %s: cleaning complete", p.name)
                    if p.state == "busy":
                        # when the partition becomes busy, if a script job isn't reserving it,
                        # then release the reservation
                        if not p.reserved_by:
                            p.reserved_until = False
                    elif p.state != "cleanup":
                        if p.reserved_until:
                            p.state = "allocated"
                            for part in p._parents:
                                if part.state == "idle":
                                    part.state = "blocked (%s)" % (p.name,)
                            for part in p._children:
                                if part.state == "idle":
                                    part.state = "blocked (%s)" % (p.name,)
                        elif p.bridge_partition.state == "RM_PARTITION_FREE" and p.used_by:
                            # if the job assigned to the partition has completed, then set the state
                            # so that cleanup will be performed
                            _start_partition_cleanup(p)
                            continue
                        for diag_part in self.pending_diags:
                            if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children:
                                p.state = "blocked by pending diags"
                        for nc in p.node_cards:
                            if nc.used_by:
                                p.state = "blocked (%s)" % nc.used_by
                            if nc.state != "RM_NODECARD_UP":
                                p.state = "hardware offline: nodecard %s" % nc.id
                                self.offline_partitions.append(p.name)
                        for s in p.switches:
                            if s in busted_switches:
                                p.state = "hardware offline: switch %s" % s
                                self.offline_partitions.append(p.name)
                        for dep_name in p._wiring_conflicts:
                            if self._partitions[dep_name].state in ["busy", "allocated", "cleanup"]:
                                p.state = "blocked-wiring (%s)" % dep_name
                                break
                        for part_name in self.failed_diags:
                            part = self._partitions[part_name]
                            if p.name == part.name:
                                p.state = "failed diags"
                            elif p.name in part.parents or p.name in part.children:
                                p.state = "blocked by failed diags"
            except:
                self.logger.error("error in update_partition_state", exc_info=True)

            self._partitions_lock.release()

            # cleanup partitions and set their kernels back to the default
            # (while _not_ holding the lock)
            pnames_cleaned = []
            for p in partitions_cleanup:
                self.logger.info("partition %s: starting partition destruction", p.name)
                pnames_destroyed = []
                parts = list(p._all_children)
                parts.append(p)
                for part in parts:
                    pnames_cleaned.append(part.name)
                    try:
                        bpart = part.bridge_partition
                        if bpart.state != "RM_PARTITION_FREE":
                            bpart.destroy()
                            pnames_destroyed.append(part.name)
                    except Cobalt.bridge.IncompatibleState:
                        pass
                    except:
                        self.logger.info("partition %s: an exception occurred while attempting to destroy partition %s",
                            p.name, part.name)
                if len(pnames_destroyed) > 0:
                    self.logger.info("partition %s: partition destruction initiated for %s",
                        p.name, ", ".join(pnames_destroyed))
                else:
                    self.logger.info("partition %s: no partition destruction was required", p.name)
                try:
                    self._clear_kernel(p.name)
                    self.logger.info("partition %s: kernel settings cleared", p.name)
                except:
                    self.logger.error("partition %s: failed to clear kernel settings", p.name)
            job_filter = Cobalt.bridge.JobFilter()
            job_filter.job_type = Cobalt.bridge.JOB_TYPE_ALL_FLAG
            jobs = Cobalt.bridge.JobList.by_filter(job_filter)
            for job in jobs:
                if job.partition_id in pnames_cleaned:
                    try:
                        job.cancel()
                        self.logger.info("partition %s: task %d canceled", job.partition_id, job.db_id)
                    except (Cobalt.bridge.IncompatibleState, Cobalt.bridge.JobNotFound):
                        pass

            time.sleep(10)
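
    # Each pass of the update_partition_state() thread follows the same shape:
    #   1. poll the bridge for partition and nodecard state (on error, set
    #      bridge_in_error, sleep 5 seconds, and retry);
    #   2. update the partition table while holding _partitions_lock;
    #   3. destroy partitions marked for cleanup and cancel their jobs while
    #      *not* holding the lock, so slow bridge calls don't block readers;
    #   4. sleep 10 seconds.
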
    def _mark_partition_for_cleaning(self, pname, jobid):
        self._partitions_lock.acquire()
        try:
            p = self._partitions[pname]
            if p.used_by == jobid:
                p.cleanup_pending = True
                self.logger.info("partition %s: partition marked for cleanup", pname)
            elif p.used_by != None:
                self.logger.info("partition %s: job %s was not the current partition user (%s); partition not marked " + \
                    "for cleanup", pname, jobid, p.used_by)
        except:
            self.logger.exception("partition %s: unexpected exception while marking the partition for cleanup", pname)
        self._partitions_lock.release()

    def _validate_kernel(self, kernel):
        if self.config.get('kernel') != 'true':
            return True
        kernel_dir = "%s/%s" % (os.path.expandvars(self.config.get('bootprofiles')), kernel)
        return os.path.exists(kernel_dir)

    def _set_kernel(self, partition, kernel):
        '''Set the kernel to be used by jobs run on the specified partition'''
        if self.config.get('kernel') != 'true':
            if kernel != "default":
                raise Exception("custom kernel capabilities disabled")
            return
        partition_link = "%s/%s" % (os.path.expandvars(self.config.get('partitionboot')), partition)
        kernel_dir = "%s/%s" % (os.path.expandvars(self.config.get('bootprofiles')), kernel)
        try:
            current = os.readlink(partition_link)
        except OSError:
            self.logger.error("partition %s: failed to read partitionboot location %s" % (partition, partition_link))
            raise Exception("failed to read partitionboot location %s" % (partition_link,))
        if current != kernel_dir:
            if not self._validate_kernel(kernel):
                self.logger.error("partition %s: kernel directory \"%s\" does not exist" % (partition, kernel_dir))
                raise Exception("kernel directory \"%s\" does not exist" % (kernel_dir,))
            self.logger.info("partition %s: updating boot image; currently set to \"%s\"" % (partition, current.split('/')[-1]))
            try:
                os.unlink(partition_link)
                os.symlink(kernel_dir, partition_link)
            except OSError:
                self.logger.error("partition %s: failed to reset boot location" % (partition,))
                raise Exception("failed to reset boot location for partition %s" % (partition,))
            self.logger.info("partition %s: boot image updated; now set to \"%s\"" % (partition, kernel))
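
    # _set_kernel() works by re-pointing a per-partition symlink in the
    # partitionboot area at a profile directory under bootprofiles.  With
    # hypothetical config values
    #
    #     partitionboot = /bgsys/partitionboot
    #     bootprofiles  = /bgsys/bootprofiles
    #
    # setting kernel "profile2" on partition "PART1" amounts to:
    #
    #     /bgsys/partitionboot/PART1 -> /bgsys/bootprofiles/profile2
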
    def _clear_kernel(self, partition):
        '''Set the kernel to be used by a partition to the default value'''
        if self.config.get('kernel') == 'true':
            try:
                self._set_kernel(partition, "default")
            except:
                logger.error("partition %s: failed to reset boot location" % (partition,))

    def generate_xml(self):
        """Produce an XML description of the managed partitions,
        suitable for use with the simulator."""
        ret = "<BG>\n"
        ret += "<PartitionList>\n"
        for p_name in self._managed_partitions:
            p = self._partitions[p_name]
            ret += "   <Partition name='%s'>\n" % p.name
            for nc in p.node_cards:
                ret += "      <NodeCard id='%s' />\n" % nc.id
            for s in p.switches:
                ret += "      <Switch id='%s' />\n" % s
            ret += "   </Partition>\n"
        ret += "</PartitionList>\n"
        ret += "</BG>\n"
        return ret
    generate_xml = exposed(generate_xml)
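
    # For a managed partition with two node cards and one switch (all ids
    # hypothetical), generate_xml() returns a string along these lines:
    #
    #     <BG>
    #     <PartitionList>
    #        <Partition name='PART1'>
    #           <NodeCard id='R00-M0-N00' />
    #           <NodeCard id='R00-M0-N01' />
    #           <Switch id='R00-M0-S0' />
    #        </Partition>
    #     </PartitionList>
    #     </BG>
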
    def add_process_groups(self, specs):
        """Create a process group.

        Arguments:
        specs -- list of dictionaries, each specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode', False) == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)

        # start up script jobs
        script_pgroups = []
        if script_specs:
            for spec in script_specs:
                try:
                    self._set_kernel(spec.get('location')[0], spec.get('kernel', "default"))
                except Exception, e:
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    pgroup.exit_status = 1
                    self.logger.info("process group %s: job %s/%s failed to set the kernel; %s",
                        pgroup.id, pgroup.jobid, pgroup.user, e)
                else:
                    try:
                        script_pgroup = ComponentProxy("script-manager").add_jobs([spec])
                    except (ComponentLookupError, xmlrpclib.Fault):
                        self._clear_kernel(spec.get('location')[0])
                        # FIXME: jobs that were already started are not reported
                        raise ProcessGroupCreationError("system::add_process_groups failed to communicate with script-manager")
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.script_id = script_pgroup[0]['id']
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    self.logger.info("job %s/%s: process group %s created to track script",
                        pgroup.jobid, pgroup.user, pgroup.id)
                    self.reserve_resources_until(spec['location'],
                        time.time() + 60 * float(spec['walltime']), pgroup.jobid)
                    if pgroup.kernel != "default":
                        self.logger.info("process group %s: job %s/%s using kernel %s",
                            pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel)
                    script_pgroups.append(pgroup)

        # start up non-script mode jobs
        process_groups = self.process_groups.q_add(other_specs)
        for pgroup in process_groups:
            pgroup.nodect = self._partitions[pgroup.location[0]].size
            self.logger.info("job %s/%s: process group %s created to track mpirun status",
                pgroup.jobid, pgroup.user, pgroup.id)
            try:
                if not pgroup.true_mpi_args:
                    self._set_kernel(pgroup.location[0], pgroup.kernel)
            except Exception, e:
                # FIXME: setting exit_status to signal the job has failed isn't really the right thing
                # to do.  another flag should be added to the process group that wait_process_group
                # uses to determine when a process group is no longer active.  an error message should
                # also be attached to the process group so that cqm can report the problem to the user.
                pgroup.exit_status = 1
                self.logger.info("process group %s: job %s/%s failed to set the kernel; %s",
                    pgroup.id, pgroup.jobid, pgroup.user, e)
            else:
                if pgroup.kernel != "default" and not pgroup.true_mpi_args:
                    self.logger.info("process group %s: job %s/%s using kernel %s",
                        pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel)
                pgroup.start()
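
    # Each spec handed to add_process_groups() is a dictionary; a script-mode
    # entry might look like this (all values hypothetical):
    #
    #     {'mode': 'script',
    #      'location': ['PART1'],
    #      'kernel': 'default',
    #      'walltime': 30,
    #      'jobid': 1234,
    #      'user': 'someuser'}
    #
    # Specs whose 'mode' is anything else are queued with q_add() and started
    # directly as mpirun process groups.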