class BGSystem (BGBaseSystem): """Blue Gene system component. Methods: configure -- load partitions from the bridge API add_process_groups -- add (start) an mpirun process on the system (exposed, ~query) get_process_groups -- retrieve mpirun processes (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- update partition state from the bridge API (runs as a thread) """ name = "system" implementation = "bgsystem" logger = logger _configfields = ['diag_script_location', 'diag_log_file', 'kernel'] _config = ConfigParser.ConfigParser() _config.read(Cobalt.CONFIG_FILES) if not _config._sections.has_key('bgsystem'): print '''"bgsystem" section missing from cobalt config file''' sys.exit(1) config = _config._sections['bgsystem'] mfields = [field for field in _configfields if not config.has_key(field)] if mfields: print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (" ".join(mfields)) sys.exit(1) if config.get('kernel') == "true": _kernel_configfields = ['bootprofiles', 'partitionboot'] mfields = [field for field in _kernel_configfields if not config.has_key(field)] if mfields: print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (" ".join(mfields)) sys.exit(1) def __init__ (self, *args, **kwargs): BGBaseSystem.__init__(self, *args, **kwargs) sys.setrecursionlimit(5000) self.process_groups.item_cls = BGProcessGroup self.diag_pids = dict() self.configure() # initiate the process before starting any threads thread.start_new_thread(self.update_partition_state, tuple()) def __getstate__(self): flags = {} for part in self._partitions.values(): sched = None func = None queue = None if hasattr(part, 'scheduled'): sched = part.scheduled if hasattr(part, 'functional'): func = part.functional if hasattr(part, 'queue'): queue = part.queue 
flags[part.name] = (sched, func, queue) return {'managed_partitions':self._managed_partitions, 'version':1, 'partition_flags': flags} def __setstate__(self, state): sys.setrecursionlimit(5000) self._managed_partitions = state['managed_partitions'] self._partitions = PartitionDict() self.process_groups = BGProcessGroupDict() self.process_groups.item_cls = BGProcessGroup self.node_card_cache = dict() self._partitions_lock = thread.allocate_lock() self.pending_diags = dict() self.failed_diags = list() self.diag_pids = dict() self.pending_script_waits = sets.Set() self.bridge_in_error = False self.cached_partitions = None self.offline_partitions = [] self.configure() if 'partition_flags' in state: for pname, flags in state['partition_flags'].items(): if pname in self._partitions: self._partitions[pname].scheduled = flags[0] self._partitions[pname].functional = flags[1] self._partitions[pname].queue = flags[2] else: logger.info("Partition %s is no longer defined" % pname) self.update_relatives() # initiate the process before starting any threads thread.start_new_thread(self.update_partition_state, tuple()) self.lock = threading.Lock() self.statistics = Statistics() def save_me(self): Component.save(self) save_me = automatic(save_me) def _get_node_card(self, name, state): if not self.node_card_cache.has_key(name): self.node_card_cache[name] = NodeCard(name, state) return self.node_card_cache[name] def _new_partition_dict(self, partition_def, bp_cache={}): # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API NODES_PER_NODECARD = 32 node_list = [] if partition_def.small: bp_name = partition_def.base_partitions[0].id for nc in partition_def._node_cards: node_list.append(self._get_node_card(bp_name + "-" + nc.id, nc.state)) else: try: for bp in partition_def.base_partitions: if bp.id not in bp_cache: bp_cache[bp.id] = [] for nc in Cobalt.bridge.NodeCardList.by_base_partition(bp): 
bp_cache[bp.id].append(self._get_node_card(bp.id + "-" + nc.id, nc.state)) node_list += bp_cache[bp.id] except BridgeException: print "Error communicating with the bridge during initial config. Terminating." sys.exit(1) d = dict( name = partition_def.id, queue = "default", size = NODES_PER_NODECARD * len(node_list), bridge_partition = partition_def, node_cards = node_list, switches = [ s.id for s in partition_def.switches ], state = _get_state(partition_def), ) return d def _detect_wiring_deps(self, partition, wiring_cache={}): def _kernel(): s2 = sets.Set(p.switches) if s1.intersection(s2): p._wiring_conflicts.add(partition.name) partition._wiring_conflicts.add(p.name) self.logger.debug("%s and %s havening problems" % (partition.name, p.name)) s1 = sets.Set(partition.switches) if wiring_cache.has_key(partition.size): for p in wiring_cache[partition.size]: if partition.name!=p.name: _kernel() else: wiring_cache[partition.size] = [partition] for p in self._partitions.values(): if p.size==partition.size and partition.name!=p.name: wiring_cache[partition.size].append(p) _kernel() def configure (self): """Read partition data from the bridge.""" self.logger.info("configure()") try: system_def = Cobalt.bridge.PartitionList.by_filter() except BridgeException: print "Error communicating with the bridge during initial config. Terminating." 
sys.exit(1) # initialize a new partition dict with all partitions # partitions = PartitionDict() tmp_list = [] wiring_cache = {} bp_cache = {} for partition_def in system_def: tmp_list.append(self._new_partition_dict(partition_def, bp_cache)) partitions.q_add(tmp_list) # update object state self._partitions.clear() self._partitions.update(partitions) # find the wiring deps start = time.time() for p in self._partitions.values(): self._detect_wiring_deps(p, wiring_cache) end = time.time() self.logger.info("took %f seconds to find wiring deps" % (end - start)) # update state information for p in self._partitions.values(): if p.state != "busy": for nc in p.node_cards: if nc.used_by: p.state = "blocked (%s)" % nc.used_by break for dep_name in p._wiring_conflicts: if self._partitions[dep_name].state == "busy": p.state = "blocked-wiring (%s)" % dep_name break def update_partition_state(self): """Use the quicker bridge method that doesn't return nodecard information to update the states of the partitions""" def _start_partition_cleanup(p): self.logger.info("partition %s: marking partition for cleaning", p.name) p.cleanup_pending = True partitions_cleanup.append(p) _set_partition_cleanup_state(p) p.reserved_until = False p.reserved_by = None p.used_by = None def _set_partition_cleanup_state(p): p.state = "cleanup" for part in p._children: if part.bridge_partition.state == "RM_PARTITION_FREE": part.state = "blocked (%s)" % (p.name,) else: part.state = "cleanup" for part in p._parents: if part.state == "idle": part.state = "blocked (%s)" % (p.name,) while True: try: system_def = Cobalt.bridge.PartitionList.info_by_filter() except BridgeException: self.logger.error("Error communicating with the bridge to update partition state information.") self.bridge_in_error = True time.sleep(5) # wait a little bit... 
continue # then try again try: bg_object = Cobalt.bridge.BlueGene.by_serial() for bp in bg_object.base_partitions: for nc in Cobalt.bridge.NodeCardList.by_base_partition(bp): self.node_card_cache[bp.id + "-" + nc.id].state = nc.state except: self.logger.error("Error communicating with the bridge to update nodecard state information.") self.bridge_in_error = True time.sleep(5) # wait a little bit... continue # then try again self.bridge_in_error = False busted_switches = [] for s in bg_object.switches: if s.state != "RM_SWITCH_UP": busted_switches.append(s.id) # set all of the nodecards to not busy for nc in self.node_card_cache.values(): nc.used_by = '' # update the state of each partition self._partitions_lock.acquire() now = time.time() partitions_cleanup = [] self.offline_partitions = [] missing_partitions = set(self._partitions.keys()) new_partitions = [] try: for partition in system_def: missing_partitions.discard(partition.id) if self._partitions.has_key(partition.id): p = self._partitions[partition.id] p.state = _get_state(partition) p.bridge_partition = partition p._update_node_cards() if p.reserved_until and now > p.reserved_until: p.reserved_until = False p.reserved_by = None else: new_partitions.append(partition) # remove the missing partitions and their wiring relations for pname in missing_partitions: self.logger.info("missing partition removed: %s", pname) p = self._partitions[pname] for dep_name in p._wiring_conflicts: self.logger.debug("removing wiring dependency from: %s", dep_name) self._partitions[dep_name]._wiring_conflicts.discard(p.name) if p.name in self._managed_partitions: self._managed_partitions.discard(p.name) del self._partitions[p.name] bp_cache = {} wiring_cache = {} # throttle the adding of new partitions so updating of # machine state doesn't get bogged down for partition in new_partitions[:8]: self.logger.info("new partition found: %s", partition.id) bridge_p = Cobalt.bridge.Partition.by_id(partition.id) 
self._partitions.q_add([self._new_partition_dict(bridge_p, bp_cache)]) p = self._partitions[bridge_p.id] p.bridge_partition = partition self._detect_wiring_deps(p, wiring_cache) # if partitions were added or removed, then update the relationships between partitions if len(missing_partitions) > 0 or len(new_partitions) > 0: self.update_relatives() for p in self._partitions.values(): if p.cleanup_pending: if p.used_by: # if the partition has a pending cleanup request, then set the state so that cleanup will be # performed _start_partition_cleanup(p) else: # if the cleanup has already been initiated, then see how it's going busy = [] parts = list(p._all_children) parts.append(p) for part in parts: if part.bridge_partition.state != "RM_PARTITION_FREE": busy.append(part.name) if len(busy) > 0: _set_partition_cleanup_state(p) self.logger.info("partition %s: still cleaning; busy partition(s): %s", p.name, ", ".join(busy)) else: p.cleanup_pending = False self.logger.info("partition %s: cleaning complete", p.name) if p.state == "busy": # when the partition becomes busy, if a script job isn't reserving it, then release the reservation if not p.reserved_by: p.reserved_until = False elif p.state != "cleanup": if p.reserved_until: p.state = "allocated" for part in p._parents: if part.state == "idle": part.state = "blocked (%s)" % (p.name,) for part in p._children: if part.state == "idle": part.state = "blocked (%s)" % (p.name,) elif p.bridge_partition.state == "RM_PARTITION_FREE" and p.used_by: # if the job assigned to the partition has completed, then set the state so that cleanup will be # performed _start_partition_cleanup(p) continue for diag_part in self.pending_diags: if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children: p.state = "blocked by pending diags" for nc in p.node_cards: if nc.used_by: p.state = "blocked (%s)" % nc.used_by if nc.state != "RM_NODECARD_UP": p.state = "hardware offline: nodecard %s" % nc.id 
self.offline_partitions.append(p.name) for s in p.switches: if s in busted_switches: p.state = "hardware offline: switch %s" % s self.offline_partitions.append(p.name) for dep_name in p._wiring_conflicts: if self._partitions[dep_name].state in ["busy", "allocated", "cleanup"]: p.state = "blocked-wiring (%s)" % dep_name break for part_name in self.failed_diags: part = self._partitions[part_name] if p.name == part.name: p.state = "failed diags" elif p.name in part.parents or p.name in part.children: p.state = "blocked by failed diags" except: self.logger.error("error in update_partition_state", exc_info=True) self._partitions_lock.release() # cleanup partitions and set their kernels back to the default (while _not_ holding the lock) pnames_cleaned = [] for p in partitions_cleanup: self.logger.info("partition %s: starting partition destruction", p.name) pnames_destroyed = [] parts = list(p._all_children) parts.append(p) for part in parts: pnames_cleaned.append(part.name) try: bpart = part.bridge_partition if bpart.state != "RM_PARTITION_FREE": bpart.destroy() pnames_destroyed.append(part.name) except Cobalt.bridge.IncompatibleState: pass except: self.logger.info("partition %s: an exception occurred while attempting to destroy partition %s", p.name, part.name) if len(pnames_destroyed) > 0: self.logger.info("partition %s: partition destruction initiated for %s", p.name, ", ".join(pnames_destroyed)) else: self.logger.info("partition %s: no partition destruction was required", p.name) try: self._clear_kernel(p.name) self.logger.info("partition %s: kernel settings cleared", p.name) except: self.logger.error("partition %s: failed to clear kernel settings", p.name) job_filter = Cobalt.bridge.JobFilter() job_filter.job_type = Cobalt.bridge.JOB_TYPE_ALL_FLAG jobs = Cobalt.bridge.JobList.by_filter(job_filter) for job in jobs: if job.partition_id in pnames_cleaned: try: job.cancel() self.logger.info("partition %s: task %d canceled", job.partition_id, job.db_id) except 
(Cobalt.bridge.IncompatibleState, Cobalt.bridge.JobNotFound): pass time.sleep(10) def _mark_partition_for_cleaning(self, pname, jobid): self._partitions_lock.acquire() try: p = self._partitions[pname] if p.used_by == jobid: p.cleanup_pending = True self.logger.info("partition %s: partition marked for cleanup", pname) elif p.used_by != None: self.logger.info("partition %s: job %s was not the current partition user (%s); partition not marked " + \ "for cleanup", pname, jobid, p.used_by) except: self.logger.exception("partition %s: unexpected exception while marking the partition for cleanup", pname) self._partitions_lock.release() def _validate_kernel(self, kernel): if self.config.get('kernel') != 'true': return True kernel_dir = "%s/%s" % (os.path.expandvars(self.config.get('bootprofiles')), kernel) return os.path.exists(kernel_dir) def _set_kernel(self, partition, kernel): '''Set the kernel to be used by jobs run on the specified partition''' if self.config.get('kernel') != 'true': if kernel != "default": raise Exception("custom kernel capabilities disabled") return partition_link = "%s/%s" % (os.path.expandvars(self.config.get('partitionboot')), partition) kernel_dir = "%s/%s" % (os.path.expandvars(self.config.get('bootprofiles')), kernel) try: current = os.readlink(partition_link) except OSError: self.logger.error("partition %s: failed to read partitionboot location %s" % (partition, partition_link)) raise Exception("failed to read partitionboot location %s" % (partition_link,)) if current != kernel_dir: if not self._validate_kernel(kernel): self.logger.error("partition %s: kernel directory \"%s\" does not exist" % (partition, kernel_dir)) raise Exception("kernel directory \"%s\" does not exist" % (kernel_dir,)) self.logger.info("partition %s: updating boot image; currently set to \"%s\"" % (partition, current.split('/')[-1])) try: os.unlink(partition_link) os.symlink(kernel_dir, partition_link) except OSError: self.logger.error("partition %s: failed to reset 
boot location" % (partition,)) raise Exception("failed to reset boot location for partition" % (partition,)) self.logger.info("partition %s: boot image updated; now set to \"%s\"" % (partition, kernel)) def _clear_kernel(self, partition): '''Set the kernel to be used by a partition to the default value''' if self.config.get('kernel') == 'true': try: self._set_kernel(partition, "default") except: logger.error("partition %s: failed to reset boot location" % (partition,)) def generate_xml(self): """This method produces an XML file describing the managed partitions, suitable for use with the simulator.""" ret = "<BG>\n" ret += "<PartitionList>\n" for p_name in self._managed_partitions: p = self._partitions[p_name] ret += " <Partition name='%s'>\n" % p.name for nc in p.node_cards: ret += " <NodeCard id='%s' />\n" % nc.id for s in p.switches: ret += " <Switch id='%s' />\n" % s ret += " </Partition>\n" ret += "</PartitionList>\n" ret += "</BG>\n" return ret generate_xml = exposed(generate_xml) def add_process_groups (self, specs): """Create a process group. 
Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)" % (specs)) script_specs = [] other_specs = [] for spec in specs: if spec.get('mode', False) == "script": script_specs.append(spec) else: other_specs.append(spec) # start up script jobs script_pgroups = [] if script_specs: for spec in script_specs: try: self._set_kernel(spec.get('location')[0], spec.get('kernel', "default")) except Exception, e: new_pgroup = self.process_groups.q_add([spec]) pgroup = new_pgroup[0] pgroup.nodect = self._partitions[pgroup.location[0]].size pgroup.exit_status = 1 self.logger.info("process group %s: job %s/%s failed to set the kernel; %s", pgroup.id, pgroup.jobid, pgroup.user, e) else: try: script_pgroup = ComponentProxy("script-manager").add_jobs([spec]) except (ComponentLookupError, xmlrpclib.Fault): self._clear_kernel(spec.get('location')[0]) # FIXME: jobs that were already started are not reported raise ProcessGroupCreationError("system::add_process_groups failed to communicate with script-manager") new_pgroup = self.process_groups.q_add([spec]) pgroup = new_pgroup[0] pgroup.script_id = script_pgroup[0]['id'] pgroup.nodect = self._partitions[pgroup.location[0]].size self.logger.info("job %s/%s: process group %s created to track script", pgroup.jobid, pgroup.user, pgroup.id) self.reserve_resources_until(spec['location'], time.time() + 60*float(spec['walltime']), pgroup.jobid) if pgroup.kernel != "default": self.logger.info("process group %s: job %s/%s using kernel %s", pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel) script_pgroups.append(pgroup) # start up non-script mode jobs process_groups = self.process_groups.q_add(other_specs) for pgroup in process_groups: pgroup.nodect = self._partitions[pgroup.location[0]].size self.logger.info("job %s/%s: process group %s created to track mpirun status", pgroup.jobid, pgroup.user, pgroup.id) try: if not pgroup.true_mpi_args: self._set_kernel(pgroup.location[0], 
pgroup.kernel) except Exception, e: # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do. another flag # should be added to the process group that wait_process_group uses to determine when a process group is no # longer active. an error message should also be attached to the process group so that cqm can report the # problem to the user. pgroup.exit_status = 1 self.logger.info("process group %s: job %s/%s failed to set the kernel; %s", pgroup.id, pgroup.jobid, pgroup.user, e) else: if pgroup.kernel != "default" and not pgroup.true_mpi_args: self.logger.info("process group %s: job %s/%s using kernel %s", pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel) pgroup.start()
class Simulator (BGBaseSystem): """Generic system simulator. Methods: configure -- load partitions from an xml file reserve_partition -- lock a partition for use by a process_group (exposed) release_partition -- release a locked (busy) partition (exposed) add_process_groups -- add (start) a process group on the system (exposed, query) get_process_groups -- retrieve process groups (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- simulates updating partition state from the bridge API (automatic) """ name = "system" implementation = "simulator" logger = logger MIN_RUN_TIME = 60 MAX_RUN_TIME = 180 def __init__ (self, *args, **kwargs): BGBaseSystem.__init__(self, *args, **kwargs) self.process_groups.item_cls = BGSimProcessGroup self.config_file = kwargs.get("config_file", None) self.failed_components = sets.Set() if self.config_file is not None: self.configure(self.config_file) def __getstate__(self): flags = {} for part in self._partitions.values(): sched = None func = None queue = None if hasattr(part, 'scheduled'): sched = part.scheduled if hasattr(part, 'functional'): func = part.functional if hasattr(part, 'queue'): queue = part.queue flags[part.name] = (sched, func, queue) return {'managed_partitions':self._managed_partitions, 'version':2, 'config_file':self.config_file, 'partition_flags': flags} def __setstate__(self, state): self._managed_partitions = state['managed_partitions'] self.config_file = state['config_file'] self._partitions = PartitionDict() self.process_groups = BGProcessGroupDict() self.process_groups.item_cls = BGSimProcessGroup self.node_card_cache = dict() self._partitions_lock = thread.allocate_lock() self.failed_components = sets.Set() self.pending_diags = dict() self.failed_diags = list() self.bridge_in_error = False 
self.cached_partitions = None self.offline_partitions = [] if self.config_file is not None: self.configure(self.config_file) if 'partition_flags' in state: for pname, flags in state['partition_flags'].items(): if pname in self._partitions: self._partitions[pname].scheduled = flags[0] self._partitions[pname].functional = flags[1] self._partitions[pname].queue = flags[2] else: logger.info("Partition %s is no longer defined" % pname) self.update_relatives() self.lock = threading.Lock() self.statistics = Statistics() def save_me(self): Component.save(self) save_me = automatic(save_me) # check whether this is a mesh partition def check_mesh(self, location_name): location = location_name.strip() specs = location.split("-") mesh_mask = specs[len(specs)-2] if len(mesh_mask) == 1: return True #print location, " Torus" else: return False #print location, " Mesh" def configure (self, config_file): """Configure simulated partitions. Arguments: config_file -- xml configuration file """ def _get_node_card(name): if not self.node_card_cache.has_key(name): self.node_card_cache[name] = NodeCard(name) return self.node_card_cache[name] self.logger.info("configure()") try: system_doc = ElementTree.parse(config_file) except IOError: self.logger.error("unable to open file: %r" % config_file) self.logger.error("exiting...") sys.exit(1) except: self.logger.error("problem loading data from file: %r" % config_file) self.logger.error("exiting...") traceback.print_exc(file=sys.stdout) sys.exit(1) system_def = system_doc.getroot() if system_def.tag != "BG": self.logger.error("unexpected root element in %r: %r" % (config_file, system_def.tag)) self.logger.error("exiting...") sys.exit(1) # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API NODES_PER_NODECARD = 32 # initialize a new partition dict with all partitions # partitions = PartitionDict() tmp_list = [] part_key = {} # this is going to hold partition objects from the bridge (not our own 
Partition) wiring_cache = {} bp_cache = {} for partition_def in system_def.getiterator("Block"): # skip duplicated partition if part_key.has_key(partition_def.get("name")): continue else: part_key[partition_def.get("name")]="" node_list = [] switch_list = [] for nc in partition_def.getiterator("NodeBoard"): node_list.append(_get_node_card(nc.get("id"))) nc_count = len(node_list) # skip 1K mesh partition if its a default partition file if config_file == "partition_xml/2013-10-10-mira_pure.xml": if NODES_PER_NODECARD * nc_count == 1024 and self.check_mesh(partition_def.get("name")): continue if not wiring_cache.has_key(nc_count): wiring_cache[nc_count] = [] wiring_cache[nc_count].append(partition_def.get("name")) for s in partition_def.getiterator("Switch"): switch_list.append(s.get("id")) tmp_list.append( dict( name = partition_def.get("name"), queue = partition_def.get("queue", "default"), size = NODES_PER_NODECARD * nc_count, node_cards = node_list, switches = switch_list, state = "idle", )) partitions.q_add(tmp_list) # find the wiring deps for size in wiring_cache: for p in wiring_cache[size]: p = partitions[p] s1 = sets.Set( p.switches ) for other in wiring_cache[size]: other = partitions[other] if (p.name == other.name): continue s2 = sets.Set( other.switches ) if s1.intersection(s2): self.logger.info("found a wiring dep between %s and %s", p.name, other.name) partitions[p.name]._wiring_conflicts.add(other.name) # update object state self._partitions.clear() self._partitions.update(partitions) print "Total partitions: ", len(self._partitions) def reserve_partition (self, name, size=None): """Reserve a partition and block all related partitions. 
Arguments: name -- name of the partition to reserve size -- size of the process group reserving the partition (optional) """ try: partition = self.partitions[name] except KeyError: self.logger.error("reserve_partition(%r, %r) [does not exist]" % (name, size)) return False if partition.state != "allocated": self.logger.error("reserve_partition(%r, %r) [%s]" % (name, size, partition.state)) return False if not partition.functional: self.logger.error("reserve_partition(%r, %r) [not functional]" % (name, size)) if size is not None and size > partition.size: self.logger.error("reserve_partition(%r, %r) [size mismatch]" % (name, size)) return False self._partitions_lock.acquire() try: partition.state = "busy" partition.reserved_until = False except: self.logger.error("error in reserve_partition", exc_info=True) self._partitions_lock.release() # explicitly call this, since the above "busy" is instantaneously available self.update_partition_state() self.logger.info("reserve_partition(%r, %r)" % (name, size)) return True reserve_partition = exposed(reserve_partition) def release_partition (self, name): """Release a reserved partition. Arguments: name -- name of the partition to release """ try: partition = self.partitions[name] except KeyError: self.logger.error("release_partition(%r) [already free]" % (name)) return False if not partition.state == "busy": self.logger.info("release_partition(%r) [not busy]" % (name)) return False self._partitions_lock.acquire() try: partition.state = "idle" except: self.logger.error("error in release_partition", exc_info=True) self._partitions_lock.release() # explicitly unblock the blocked partitions self.update_partition_state() self.logger.info("release_partition(%r)" % (name)) return True release_partition = exposed(release_partition) def add_process_groups (self, specs): """Create a simulated process group. 
Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)" % (specs)) script_specs = [] other_specs = [] for spec in specs: if spec.get('mode') == "script": script_specs.append(spec) else: other_specs.append(spec) # start up script jobs new_pgroups = [] if script_specs: try: for spec in script_specs: script_pgroup = ComponentProxy("script-manager").add_jobs([spec]) new_pgroup = self.process_groups.q_add([spec]) new_pgroup[0].script_id = script_pgroup[0]['id'] self.reserve_resources_until(spec['location'], time.time() + 60*float(spec['walltime']), new_pgroup[0].jobid) new_pgroups.append(new_pgroup[0]) except (ComponentLookupError, xmlrpclib.Fault): raise ProcessGroupCreationError("system::add_process_groups failed to communicate with script-manager") process_groups = self.process_groups.q_add(other_specs) for process_group in process_groups: self.start(process_group) return new_pgroups + process_groups add_process_groups = exposed(query(all_fields=True)(add_process_groups)) def get_process_groups (self, specs): """Query process_groups from the simulator.""" return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def wait_process_groups (self, specs): """get process groups that have finished running.""" self.logger.info("wait_process_groups(%r)" % (specs)) process_groups = [pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None] for process_group in process_groups: # jobs that were launched on behalf of the script manager shouldn't release the partition if not process_group.true_mpi_args: self.reserve_resources_until(process_group.location, None, process_group.jobid) del self.process_groups[process_group.id] return process_groups wait_process_groups = exposed(query(wait_process_groups)) def signal_process_groups (self, specs, signame="SIGINT"): """Simulate the signaling of a process_group.""" self.logger.info("signal_process_groups(%r, %r)" % (specs, 
signame)) process_groups = self.process_groups.q_get(specs) for process_group in process_groups: if process_group.mode == "script": try: pgroup = ComponentProxy("script-manager").signal_jobs([{'id':process_group.script_id}], "SIGTERM") except (ComponentLookupError, xmlrpclib.Fault): logger.error("Failed to communicate with script manager when killing job") else: process_group.signals.append(signame) return process_groups signal_process_groups = exposed(query(signal_process_groups)) def start (self, process_group): thread.start_new_thread(self._mpirun, (process_group, )) def _mpirun (self, process_group): argv = process_group._get_argv() try: stdout = open(process_group.stdout or "/dev/null", "a") except: stdout = open("/dev/null", "a") try: stderr = open(process_group.stderr or "/dev/null", "a") except: stderr = open("/dev/null", "a") try: clfn = process_group.cobalt_log_file or "/dev/null" cobalt_log_file = open(clfn, "a") print >> cobalt_log_file, "%s\n" % " ".join(argv[1:]) cobalt_log_file.close() except: logger.error("Job %s/%s: unable to open cobaltlog file %s", process_group.id, process_group.user, clfn, exc_info = True) try: partition = argv[argv.index("-partition") + 1] except ValueError: print >> stderr, "ERROR: '-partition' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-partition' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: mode = argv[argv.index("-mode") + 1] except ValueError: print >> stderr, "ERROR: '-mode' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-mode' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: size = argv[argv.index("-np") + 1] except ValueError: print >> stderr, "ERROR: '-np' is a required flag" print >> 
stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-np' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: size = int(size) except ValueError: print >> stderr, "ERROR: '-np' got invalid value %r" % (size) print >> stderr, "FE_MPI (Info) : Exit status: 1" print >> stdout, "ENVIRONMENT" print >> stdout, "-----------" for key, value in process_group.env.iteritems(): print >> stdout, "%s=%s" % (key, value) print >> stdout print >> stderr, "FE_MPI (Info) : Initializing MPIRUN" reserved = self.reserve_partition(partition, size) if not reserved: print >> stderr, "BE_MPI (ERROR): Failed to run process on partition" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (ERROR): Failure list:" print >> stderr, "FE_MPI (ERROR): - 1. ProcessGroup execution failed - unable to reserve partition", partition print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return hardware_failure = False for nc in self.partitions[partition].node_cards: if nc.id in self.failed_components: hardware_failure = True break for switch in self.partitions[partition].switches: if switch in self.failed_components: hardware_failure = True break if hardware_failure: excuses = ["incorrectly polarized packet accelerator", "the Internet is full", "side fumbling detected", "unilateral phase detractors offline", ] print >> stderr, "BE_MPI (ERROR): Booting aborted - partition is in DEALLOCATING ('D') state" print >> stderr, "BE_MPI (ERROR): Partition has not reached the READY ('I') state" print >> stderr, "BE_MPI (Info) : Checking for block error text:" print >> stderr, "BE_MPI (ERROR): block error text '%s.'" % random.choice(excuses) print >> stderr, "BE_MPI (Info) : Starting cleanup sequence" time.sleep(20) self.release_partition(partition) print >> stderr, "BE_MPI (Info) : Partition", 
partition, "switched to state FREE ('F')" print >> stderr, "FE_MPI (ERROR): Failure list:" print >> stderr, "FE_MPI (ERROR): - 1.", partition, "couldn't boot." print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return print >> stderr, "FE_MPI (Info) : process group with id", process_group.id print >> stderr, "FE_MPI (Info) : Waiting for process_group to terminate" print >> stdout, "Running process_group: %s" % " ".join(argv) start_time = time.time() run_time = random.randint(self.MIN_RUN_TIME, self.MAX_RUN_TIME) my_exit_status = 0 self.logger.info("process group %d running for about %f seconds", process_group.id, run_time) while time.time() < (start_time + run_time): if "SIGKILL" in process_group.signals: process_group.exit_status = 1 return elif "SIGTERM" in process_group.signals: print >> stderr, "FE_MPI (Info) : ProcessGroup got signal SIGTERM" my_exit_status = 1 break else: time.sleep(1) # tumblers better than pumpers print >> stderr, "FE_MPI (Info) : ProcessGroup", process_group.id, "switched to state TERMINATED ('T')" print >> stderr, "FE_MPI (Info) : ProcessGroup sucessfully terminated" print >> stderr, "BE_MPI (Info) : Releasing partition", partition released = self.release_partition(partition) if not released: print >> stderr, "BE_MPI (ERROR): Partition", partition, "could not switch to state FREE ('F')" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status:", my_exit_status process_group.exit_status = my_exit_status def update_partition_state(self): # first, set all of the nodecards to not busy for nc in 
self.node_card_cache.values(): nc.used_by = '' self._partitions_lock.acquire() try: for p in self._partitions.values(): p._update_node_cards() now = time.time() # since we don't have the bridge, a partition which isn't busy # should be set to idle and then blocked states can be derived for p in self._partitions.values(): if p.state != "busy": p.state = "idle" if p.reserved_until and now > p.reserved_until: p.reserved_until = None p.reserved_by = None for p in self._partitions.values(): if p.state == "busy": # when the partition becomes busy, if a script job isn't reserving it, then release the reservation if not p.reserved_by: p.reserved_until = False else: if p.reserved_until: p.state = "allocated" for part in p._parents: if part.state == "idle": part.state = "blocked (%s)" % (p.name,) for part in p._children: if part.state == "idle": part.state = "blocked (%s)" % (p.name,) for diag_part in self.pending_diags: if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children: p.state = "blocked by pending diags" for nc in p.node_cards: if nc.used_by: p.state = "blocked (%s)" % nc.used_by break for dep_name in p._wiring_conflicts: if self._partitions[dep_name].state in ["allocated", "busy"]: p.state = "blocked-wiring (%s)" % dep_name break for part_name in self.failed_diags: part = self._partitions[part_name] if p.name == part.name: p.state = "failed diags" elif p.name in part.parents or p.name in part.children: p.state = "blocked by failed diags" except: self.logger.error("error in update_partition_state", exc_info=True) self._partitions_lock.release() update_partition_state = automatic(update_partition_state) def add_failed_components(self, component_names): success = [] for name in component_names: if self.node_card_cache.has_key(name): self.failed_components.add(name) success.append(name) else: for p in self._partitions.values(): if name in p.switches: self.failed_components.add(name) success.append(name) break return success 
# Bind the exposed wrapper back to the method's own name, as every other
# method in this file does.  The singular spelling used previously is kept
# as an alias so existing callers of ``add_failed_component`` still work.
add_failed_components = exposed(add_failed_components)
add_failed_component = add_failed_components

def del_failed_components(self, component_names):
    """Remove the named components from the set of failed components.

    Arguments:
    component_names -- iterable of component names to clear

    Returns the list of names that were actually present and removed;
    names that were not marked as failed are silently ignored.
    """
    success = []
    for name in component_names:
        try:
            self.failed_components.remove(name)
        except KeyError:
            # the component was not marked as failed; nothing to remove
            continue
        success.append(name)
    return success
del_failed_components = exposed(del_failed_components)

def list_failed_components(self, component_names):
    """Return the names of all failed components as a list.

    Arguments:
    component_names -- accepted for interface compatibility; not used
    """
    return list(self.failed_components)
list_failed_components = exposed(list_failed_components)

def launch_diags(self, partition, test_name):
    """Simulate a diagnostic run on a partition and report its result.

    Arguments:
    partition -- partition object providing node_cards and switches
    test_name -- name of the diagnostic, passed through to finish_diags

    The exit value handed to finish_diags is 0 when no hardware of the
    partition is in failed_components, 1 when a node card is, and 2 when a
    switch is (a failed switch takes precedence over a failed node card).
    """
    exit_value = 0
    for nc in partition.node_cards:
        if nc.id in self.failed_components:
            exit_value = 1
    for switch in partition.switches:
        if switch in self.failed_components:
            exit_value = 2

    self.finish_diags(partition, test_name, exit_value)
class BGSystem(BGBaseSystem): """Blue Gene system component. Methods: configure -- load partitions from the bridge API add_process_groups -- add (start) an mpirun process on the system (exposed, ~query) get_process_groups -- retrieve mpirun processes (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- update partition state from the bridge API (runs as a thread) """ name = "system" implementation = "bgsystem" logger = logger _configfields = ['diag_script_location', 'diag_log_file', 'kernel'] _config = ConfigParser.ConfigParser() _config.read(Cobalt.CONFIG_FILES) if not _config._sections.has_key('bgsystem'): print '''"bgsystem" section missing from cobalt config file''' sys.exit(1) config = _config._sections['bgsystem'] mfields = [field for field in _configfields if not config.has_key(field)] if mfields: print "Missing option(s) in cobalt config file [bgsystem] section: %s" % ( " ".join(mfields)) sys.exit(1) if config.get('kernel') == "true": _kernel_configfields = ['bootprofiles', 'partitionboot'] mfields = [ field for field in _kernel_configfields if not config.has_key(field) ] if mfields: print "Missing option(s) in cobalt config file [bgsystem] section: %s" % ( " ".join(mfields)) sys.exit(1) def __init__(self, *args, **kwargs): BGBaseSystem.__init__(self, *args, **kwargs) sys.setrecursionlimit(5000) self.process_groups.item_cls = BGProcessGroup self.diag_pids = dict() self.configure() # initiate the process before starting any threads thread.start_new_thread(self.update_partition_state, tuple()) def __getstate__(self): flags = {} for part in self._partitions.values(): sched = None func = None queue = None if hasattr(part, 'scheduled'): sched = part.scheduled if hasattr(part, 'functional'): func = part.functional if hasattr(part, 'queue'): queue = part.queue 
flags[part.name] = (sched, func, queue) return { 'managed_partitions': self._managed_partitions, 'version': 1, 'partition_flags': flags } def __setstate__(self, state): sys.setrecursionlimit(5000) self._managed_partitions = state['managed_partitions'] self._partitions = PartitionDict() self.process_groups = BGProcessGroupDict() self.process_groups.item_cls = BGProcessGroup self.node_card_cache = dict() self._partitions_lock = thread.allocate_lock() self.pending_diags = dict() self.failed_diags = list() self.diag_pids = dict() self.pending_script_waits = sets.Set() self.bridge_in_error = False self.cached_partitions = None self.offline_partitions = [] self.configure() if 'partition_flags' in state: for pname, flags in state['partition_flags'].items(): if pname in self._partitions: self._partitions[pname].scheduled = flags[0] self._partitions[pname].functional = flags[1] self._partitions[pname].queue = flags[2] else: logger.info("Partition %s is no longer defined" % pname) self.update_relatives() # initiate the process before starting any threads thread.start_new_thread(self.update_partition_state, tuple()) self.lock = threading.Lock() self.statistics = Statistics() def save_me(self): Component.save(self) save_me = automatic(save_me) def _get_node_card(self, name, state): if not self.node_card_cache.has_key(name): self.node_card_cache[name] = NodeCard(name, state) return self.node_card_cache[name] def _new_partition_dict(self, partition_def, bp_cache={}): # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API NODES_PER_NODECARD = 32 node_list = [] if partition_def.small: bp_name = partition_def.base_partitions[0].id for nc in partition_def._node_cards: node_list.append( self._get_node_card(bp_name + "-" + nc.id, nc.state)) else: try: for bp in partition_def.base_partitions: if bp.id not in bp_cache: bp_cache[bp.id] = [] for nc in Cobalt.bridge.NodeCardList.by_base_partition( bp): bp_cache[bp.id].append( 
self._get_node_card(bp.id + "-" + nc.id, nc.state)) node_list += bp_cache[bp.id] except BridgeException: print "Error communicating with the bridge during initial config. Terminating." sys.exit(1) d = dict( name=partition_def.id, queue="default", size=NODES_PER_NODECARD * len(node_list), bridge_partition=partition_def, node_cards=node_list, switches=[s.id for s in partition_def.switches], state=_get_state(partition_def), ) return d def _detect_wiring_deps(self, partition, wiring_cache={}): def _kernel(): s2 = sets.Set(p.switches) if s1.intersection(s2): p._wiring_conflicts.add(partition.name) partition._wiring_conflicts.add(p.name) self.logger.debug("%s and %s havening problems" % (partition.name, p.name)) s1 = sets.Set(partition.switches) if wiring_cache.has_key(partition.size): for p in wiring_cache[partition.size]: if partition.name != p.name: _kernel() else: wiring_cache[partition.size] = [partition] for p in self._partitions.values(): if p.size == partition.size and partition.name != p.name: wiring_cache[partition.size].append(p) _kernel() def configure(self): """Read partition data from the bridge.""" self.logger.info("configure()") try: system_def = Cobalt.bridge.PartitionList.by_filter() except BridgeException: print "Error communicating with the bridge during initial config. Terminating." 
sys.exit(1) # initialize a new partition dict with all partitions # partitions = PartitionDict() tmp_list = [] wiring_cache = {} bp_cache = {} for partition_def in system_def: tmp_list.append(self._new_partition_dict(partition_def, bp_cache)) partitions.q_add(tmp_list) # update object state self._partitions.clear() self._partitions.update(partitions) # find the wiring deps start = time.time() for p in self._partitions.values(): self._detect_wiring_deps(p, wiring_cache) end = time.time() self.logger.info("took %f seconds to find wiring deps" % (end - start)) # update state information for p in self._partitions.values(): if p.state != "busy": for nc in p.node_cards: if nc.used_by: p.state = "blocked (%s)" % nc.used_by break for dep_name in p._wiring_conflicts: if self._partitions[dep_name].state == "busy": p.state = "blocked-wiring (%s)" % dep_name break def update_partition_state(self): """Use the quicker bridge method that doesn't return nodecard information to update the states of the partitions""" def _start_partition_cleanup(p): self.logger.info("partition %s: marking partition for cleaning", p.name) p.cleanup_pending = True partitions_cleanup.append(p) _set_partition_cleanup_state(p) p.reserved_until = False p.reserved_by = None p.used_by = None def _set_partition_cleanup_state(p): p.state = "cleanup" for part in p._children: if part.bridge_partition.state == "RM_PARTITION_FREE": part.state = "blocked (%s)" % (p.name, ) else: part.state = "cleanup" for part in p._parents: if part.state == "idle": part.state = "blocked (%s)" % (p.name, ) while True: try: system_def = Cobalt.bridge.PartitionList.info_by_filter() except BridgeException: self.logger.error( "Error communicating with the bridge to update partition state information." ) self.bridge_in_error = True time.sleep(5) # wait a little bit... 
continue # then try again try: bg_object = Cobalt.bridge.BlueGene.by_serial() for bp in bg_object.base_partitions: for nc in Cobalt.bridge.NodeCardList.by_base_partition(bp): self.node_card_cache[bp.id + "-" + nc.id].state = nc.state except: self.logger.error( "Error communicating with the bridge to update nodecard state information." ) self.bridge_in_error = True time.sleep(5) # wait a little bit... continue # then try again self.bridge_in_error = False busted_switches = [] for s in bg_object.switches: if s.state != "RM_SWITCH_UP": busted_switches.append(s.id) # set all of the nodecards to not busy for nc in self.node_card_cache.values(): nc.used_by = '' # update the state of each partition self._partitions_lock.acquire() now = time.time() partitions_cleanup = [] self.offline_partitions = [] missing_partitions = set(self._partitions.keys()) new_partitions = [] try: for partition in system_def: missing_partitions.discard(partition.id) if self._partitions.has_key(partition.id): p = self._partitions[partition.id] p.state = _get_state(partition) p.bridge_partition = partition p._update_node_cards() if p.reserved_until and now > p.reserved_until: p.reserved_until = False p.reserved_by = None else: new_partitions.append(partition) # remove the missing partitions and their wiring relations for pname in missing_partitions: self.logger.info("missing partition removed: %s", pname) p = self._partitions[pname] for dep_name in p._wiring_conflicts: self.logger.debug( "removing wiring dependency from: %s", dep_name) self._partitions[dep_name]._wiring_conflicts.discard( p.name) if p.name in self._managed_partitions: self._managed_partitions.discard(p.name) del self._partitions[p.name] bp_cache = {} wiring_cache = {} # throttle the adding of new partitions so updating of # machine state doesn't get bogged down for partition in new_partitions[:8]: self.logger.info("new partition found: %s", partition.id) bridge_p = Cobalt.bridge.Partition.by_id(partition.id) self._partitions.q_add( 
[self._new_partition_dict(bridge_p, bp_cache)]) p = self._partitions[bridge_p.id] p.bridge_partition = partition self._detect_wiring_deps(p, wiring_cache) # if partitions were added or removed, then update the relationships between partitions if len(missing_partitions) > 0 or len(new_partitions) > 0: self.update_relatives() for p in self._partitions.values(): if p.cleanup_pending: if p.used_by: # if the partition has a pending cleanup request, then set the state so that cleanup will be # performed _start_partition_cleanup(p) else: # if the cleanup has already been initiated, then see how it's going busy = [] parts = list(p._all_children) parts.append(p) for part in parts: if part.bridge_partition.state != "RM_PARTITION_FREE": busy.append(part.name) if len(busy) > 0: _set_partition_cleanup_state(p) self.logger.info( "partition %s: still cleaning; busy partition(s): %s", p.name, ", ".join(busy)) else: p.cleanup_pending = False self.logger.info( "partition %s: cleaning complete", p.name) if p.state == "busy": # when the partition becomes busy, if a script job isn't reserving it, then release the reservation if not p.reserved_by: p.reserved_until = False elif p.state != "cleanup": if p.reserved_until: p.state = "allocated" for part in p._parents: if part.state == "idle": part.state = "blocked (%s)" % (p.name, ) for part in p._children: if part.state == "idle": part.state = "blocked (%s)" % (p.name, ) elif p.bridge_partition.state == "RM_PARTITION_FREE" and p.used_by: # if the job assigned to the partition has completed, then set the state so that cleanup will be # performed _start_partition_cleanup(p) continue for diag_part in self.pending_diags: if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children: p.state = "blocked by pending diags" for nc in p.node_cards: if nc.used_by: p.state = "blocked (%s)" % nc.used_by if nc.state != "RM_NODECARD_UP": p.state = "hardware offline: nodecard %s" % nc.id self.offline_partitions.append(p.name) 
for s in p.switches: if s in busted_switches: p.state = "hardware offline: switch %s" % s self.offline_partitions.append(p.name) for dep_name in p._wiring_conflicts: if self._partitions[dep_name].state in [ "busy", "allocated", "cleanup" ]: p.state = "blocked-wiring (%s)" % dep_name break for part_name in self.failed_diags: part = self._partitions[part_name] if p.name == part.name: p.state = "failed diags" elif p.name in part.parents or p.name in part.children: p.state = "blocked by failed diags" except: self.logger.error("error in update_partition_state", exc_info=True) self._partitions_lock.release() # cleanup partitions and set their kernels back to the default (while _not_ holding the lock) pnames_cleaned = [] for p in partitions_cleanup: self.logger.info( "partition %s: starting partition destruction", p.name) pnames_destroyed = [] parts = list(p._all_children) parts.append(p) for part in parts: pnames_cleaned.append(part.name) try: bpart = part.bridge_partition if bpart.state != "RM_PARTITION_FREE": bpart.destroy() pnames_destroyed.append(part.name) except Cobalt.bridge.IncompatibleState: pass except: self.logger.info( "partition %s: an exception occurred while attempting to destroy partition %s", p.name, part.name) if len(pnames_destroyed) > 0: self.logger.info( "partition %s: partition destruction initiated for %s", p.name, ", ".join(pnames_destroyed)) else: self.logger.info( "partition %s: no partition destruction was required", p.name) try: self._clear_kernel(p.name) self.logger.info("partition %s: kernel settings cleared", p.name) except: self.logger.error( "partition %s: failed to clear kernel settings", p.name) job_filter = Cobalt.bridge.JobFilter() job_filter.job_type = Cobalt.bridge.JOB_TYPE_ALL_FLAG jobs = Cobalt.bridge.JobList.by_filter(job_filter) for job in jobs: if job.partition_id in pnames_cleaned: try: job.cancel() self.logger.info("partition %s: task %d canceled", job.partition_id, job.db_id) except (Cobalt.bridge.IncompatibleState, 
Cobalt.bridge.JobNotFound): pass time.sleep(10) def _mark_partition_for_cleaning(self, pname, jobid): self._partitions_lock.acquire() try: p = self._partitions[pname] if p.used_by == jobid: p.cleanup_pending = True self.logger.info("partition %s: partition marked for cleanup", pname) elif p.used_by != None: self.logger.info("partition %s: job %s was not the current partition user (%s); partition not marked " + \ "for cleanup", pname, jobid, p.used_by) except: self.logger.exception( "partition %s: unexpected exception while marking the partition for cleanup", pname) self._partitions_lock.release() def _validate_kernel(self, kernel): if self.config.get('kernel') != 'true': return True kernel_dir = "%s/%s" % (os.path.expandvars( self.config.get('bootprofiles')), kernel) return os.path.exists(kernel_dir) def _set_kernel(self, partition, kernel): '''Set the kernel to be used by jobs run on the specified partition''' if self.config.get('kernel') != 'true': if kernel != "default": raise Exception("custom kernel capabilities disabled") return partition_link = "%s/%s" % (os.path.expandvars( self.config.get('partitionboot')), partition) kernel_dir = "%s/%s" % (os.path.expandvars( self.config.get('bootprofiles')), kernel) try: current = os.readlink(partition_link) except OSError: self.logger.error( "partition %s: failed to read partitionboot location %s" % (partition, partition_link)) raise Exception("failed to read partitionboot location %s" % (partition_link, )) if current != kernel_dir: if not self._validate_kernel(kernel): self.logger.error( "partition %s: kernel directory \"%s\" does not exist" % (partition, kernel_dir)) raise Exception("kernel directory \"%s\" does not exist" % (kernel_dir, )) self.logger.info( "partition %s: updating boot image; currently set to \"%s\"" % (partition, current.split('/')[-1])) try: os.unlink(partition_link) os.symlink(kernel_dir, partition_link) except OSError: self.logger.error( "partition %s: failed to reset boot location" % 
(partition, )) raise Exception("failed to reset boot location for partition" % (partition, )) self.logger.info( "partition %s: boot image updated; now set to \"%s\"" % (partition, kernel)) def _clear_kernel(self, partition): '''Set the kernel to be used by a partition to the default value''' if self.config.get('kernel') == 'true': try: self._set_kernel(partition, "default") except: logger.error("partition %s: failed to reset boot location" % (partition, )) def generate_xml(self): """This method produces an XML file describing the managed partitions, suitable for use with the simulator.""" ret = "<BG>\n" ret += "<PartitionList>\n" for p_name in self._managed_partitions: p = self._partitions[p_name] ret += " <Partition name='%s'>\n" % p.name for nc in p.node_cards: ret += " <NodeCard id='%s' />\n" % nc.id for s in p.switches: ret += " <Switch id='%s' />\n" % s ret += " </Partition>\n" ret += "</PartitionList>\n" ret += "</BG>\n" return ret generate_xml = exposed(generate_xml) def add_process_groups(self, specs): """Create a process group. 
Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)" % (specs)) script_specs = [] other_specs = [] for spec in specs: if spec.get('mode', False) == "script": script_specs.append(spec) else: other_specs.append(spec) # start up script jobs script_pgroups = [] if script_specs: for spec in script_specs: try: self._set_kernel( spec.get('location')[0], spec.get('kernel', "default")) except Exception, e: new_pgroup = self.process_groups.q_add([spec]) pgroup = new_pgroup[0] pgroup.nodect = self._partitions[pgroup.location[0]].size pgroup.exit_status = 1 self.logger.info( "process group %s: job %s/%s failed to set the kernel; %s", pgroup.id, pgroup.jobid, pgroup.user, e) else: try: script_pgroup = ComponentProxy( "script-manager").add_jobs([spec]) except (ComponentLookupError, xmlrpclib.Fault): self._clear_kernel(spec.get('location')[0]) # FIXME: jobs that were already started are not reported raise ProcessGroupCreationError( "system::add_process_groups failed to communicate with script-manager" ) new_pgroup = self.process_groups.q_add([spec]) pgroup = new_pgroup[0] pgroup.script_id = script_pgroup[0]['id'] pgroup.nodect = self._partitions[pgroup.location[0]].size self.logger.info( "job %s/%s: process group %s created to track script", pgroup.jobid, pgroup.user, pgroup.id) self.reserve_resources_until( spec['location'], time.time() + 60 * float(spec['walltime']), pgroup.jobid) if pgroup.kernel != "default": self.logger.info( "process group %s: job %s/%s using kernel %s", pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel) script_pgroups.append(pgroup) # start up non-script mode jobs process_groups = self.process_groups.q_add(other_specs) for pgroup in process_groups: pgroup.nodect = self._partitions[pgroup.location[0]].size self.logger.info( "job %s/%s: process group %s created to track mpirun status", pgroup.jobid, pgroup.user, pgroup.id) try: if not pgroup.true_mpi_args: self._set_kernel(pgroup.location[0], 
pgroup.kernel) except Exception, e: # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do. another flag # should be added to the process group that wait_process_group uses to determine when a process group is no # longer active. an error message should also be attached to the process group so that cqm can report the # problem to the user. pgroup.exit_status = 1 self.logger.info( "process group %s: job %s/%s failed to set the kernel; %s", pgroup.id, pgroup.jobid, pgroup.user, e) else: if pgroup.kernel != "default" and not pgroup.true_mpi_args: self.logger.info( "process group %s: job %s/%s using kernel %s", pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel) pgroup.start()
class Simulator(BGBaseSystem): """Generic system simulator. Methods: configure -- load partitions from an xml file reserve_partition -- lock a partition for use by a process_group (exposed) release_partition -- release a locked (busy) partition (exposed) add_process_groups -- add (start) a process group on the system (exposed, query) get_process_groups -- retrieve process groups (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- simulates updating partition state from the bridge API (automatic) """ name = "system" implementation = "simulator" logger = logger MIN_RUN_TIME = 60 MAX_RUN_TIME = 180 def __init__(self, *args, **kwargs): BGBaseSystem.__init__(self, *args, **kwargs) self.process_groups.item_cls = BGSimProcessGroup self.config_file = kwargs.get("config_file", None) self.failed_components = sets.Set() if self.config_file is not None: self.configure(self.config_file) def __getstate__(self): flags = {} for part in self._partitions.values(): sched = None func = None queue = None if hasattr(part, 'scheduled'): sched = part.scheduled if hasattr(part, 'functional'): func = part.functional if hasattr(part, 'queue'): queue = part.queue flags[part.name] = (sched, func, queue) return { 'managed_partitions': self._managed_partitions, 'version': 2, 'config_file': self.config_file, 'partition_flags': flags } def __setstate__(self, state): self._managed_partitions = state['managed_partitions'] self.config_file = state['config_file'] self._partitions = PartitionDict() self.process_groups = BGProcessGroupDict() self.process_groups.item_cls = BGSimProcessGroup self.node_card_cache = dict() self._partitions_lock = thread.allocate_lock() self.failed_components = sets.Set() self.pending_diags = dict() self.failed_diags = list() self.bridge_in_error = False 
self.cached_partitions = None self.offline_partitions = [] if self.config_file is not None: self.configure(self.config_file) if 'partition_flags' in state: for pname, flags in state['partition_flags'].items(): if pname in self._partitions: self._partitions[pname].scheduled = flags[0] self._partitions[pname].functional = flags[1] self._partitions[pname].queue = flags[2] else: logger.info("Partition %s is no longer defined" % pname) self.update_relatives() self.lock = threading.Lock() self.statistics = Statistics() def save_me(self): Component.save(self) save_me = automatic(save_me) def configure(self, config_file): """Configure simulated partitions. Arguments: config_file -- xml configuration file """ def _get_node_card(name): if not self.node_card_cache.has_key(name): self.node_card_cache[name] = NodeCard(name) return self.node_card_cache[name] self.logger.info("configure()") try: system_doc = ElementTree.parse(config_file) except IOError: self.logger.error("unable to open file: %r" % config_file) self.logger.error("exiting...") sys.exit(1) except: self.logger.error("problem loading data from file: %r" % config_file) self.logger.error("exiting...") sys.exit(1) system_def = system_doc.getroot() if system_def.tag != "BG": self.logger.error("unexpected root element in %r: %r" % (config_file, system_def.tag)) self.logger.error("exiting...") sys.exit(1) # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API NODES_PER_NODECARD = 32 # initialize a new partition dict with all partitions # partitions = PartitionDict() tmp_list = [] # this is going to hold partition objects from the bridge (not our own Partition) wiring_cache = {} bp_cache = {} for partition_def in system_def.getiterator("Partition"): if not partition_def.get("name").startswith("ANL"): continue node_list = [] switch_list = [] for nc in partition_def.getiterator("NodeCard"): node_list.append(_get_node_card(nc.get("id"))) nc_count = len(node_list) # remove 
partitions which have less than 512 nodes if (NODES_PER_NODECARD * nc_count) < 512: continue if not wiring_cache.has_key(nc_count): wiring_cache[nc_count] = [] wiring_cache[nc_count].append(partition_def.get("name")) for s in partition_def.getiterator("Switch"): switch_list.append(s.get("id")) tmp_list.append( dict( name=partition_def.get("name"), queue=partition_def.get("queue", "default"), size=NODES_PER_NODECARD * nc_count, node_cards=node_list, switches=switch_list, state="idle", )) partitions.q_add(tmp_list) # find the wiring deps for size in wiring_cache: for p in wiring_cache[size]: p = partitions[p] s1 = sets.Set(p.switches) for other in wiring_cache[size]: other = partitions[other] if (p.name == other.name): continue s2 = sets.Set(other.switches) if s1.intersection(s2): self.logger.info( "found a wiring dep between %s and %s", p.name, other.name) partitions[p.name]._wiring_conflicts.add(other.name) # update object state self._partitions.clear() self._partitions.update(partitions) print "Total partitions: ", len(self._partitions) def reserve_partition(self, name, size=None): """Reserve a partition and block all related partitions. 
Arguments: name -- name of the partition to reserve size -- size of the process group reserving the partition (optional) """ try: partition = self.partitions[name] except KeyError: self.logger.error("reserve_partition(%r, %r) [does not exist]" % (name, size)) return False if partition.state != "allocated": self.logger.error("reserve_partition(%r, %r) [%s]" % (name, size, partition.state)) return False if not partition.functional: self.logger.error("reserve_partition(%r, %r) [not functional]" % (name, size)) if size is not None and size > partition.size: self.logger.error("reserve_partition(%r, %r) [size mismatch]" % (name, size)) return False self._partitions_lock.acquire() try: partition.state = "busy" partition.reserved_until = False except: self.logger.error("error in reserve_partition", exc_info=True) self._partitions_lock.release() # explicitly call this, since the above "busy" is instantaneously available self.update_partition_state() self.logger.info("reserve_partition(%r, %r)" % (name, size)) return True reserve_partition = exposed(reserve_partition) def release_partition(self, name): """Release a reserved partition. Arguments: name -- name of the partition to release """ try: partition = self.partitions[name] except KeyError: self.logger.error("release_partition(%r) [already free]" % (name)) return False if not partition.state == "busy": self.logger.info("release_partition(%r) [not busy]" % (name)) return False self._partitions_lock.acquire() try: partition.state = "idle" except: self.logger.error("error in release_partition", exc_info=True) self._partitions_lock.release() # explicitly unblock the blocked partitions self.update_partition_state() self.logger.info("release_partition(%r)" % (name)) return True release_partition = exposed(release_partition) def add_process_groups(self, specs): """Create a simulated process group. 
Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)" % (specs)) script_specs = [] other_specs = [] for spec in specs: if spec.get('mode') == "script": script_specs.append(spec) else: other_specs.append(spec) # start up script jobs new_pgroups = [] if script_specs: try: for spec in script_specs: script_pgroup = ComponentProxy("script-manager").add_jobs( [spec]) new_pgroup = self.process_groups.q_add([spec]) new_pgroup[0].script_id = script_pgroup[0]['id'] self.reserve_resources_until( spec['location'], time.time() + 60 * float(spec['walltime']), new_pgroup[0].jobid) new_pgroups.append(new_pgroup[0]) except (ComponentLookupError, xmlrpclib.Fault): raise ProcessGroupCreationError( "system::add_process_groups failed to communicate with script-manager" ) process_groups = self.process_groups.q_add(other_specs) for process_group in process_groups: self.start(process_group) return new_pgroups + process_groups add_process_groups = exposed(query(all_fields=True)(add_process_groups)) def get_process_groups(self, specs): """Query process_groups from the simulator.""" return self.process_groups.q_get(specs) get_process_groups = exposed(query(get_process_groups)) def wait_process_groups(self, specs): """get process groups that have finished running.""" self.logger.info("wait_process_groups(%r)" % (specs)) process_groups = [ pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None ] for process_group in process_groups: # jobs that were launched on behalf of the script manager shouldn't release the partition if not process_group.true_mpi_args: self.reserve_resources_until(process_group.location, None, process_group.jobid) del self.process_groups[process_group.id] return process_groups wait_process_groups = exposed(query(wait_process_groups)) def signal_process_groups(self, specs, signame="SIGINT"): """Simulate the signaling of a process_group.""" self.logger.info("signal_process_groups(%r, %r)" % 
(specs, signame)) process_groups = self.process_groups.q_get(specs) for process_group in process_groups: if process_group.mode == "script": try: pgroup = ComponentProxy("script-manager").signal_jobs( [{ 'id': process_group.script_id }], "SIGTERM") except (ComponentLookupError, xmlrpclib.Fault): logger.error( "Failed to communicate with script manager when killing job" ) else: process_group.signals.append(signame) return process_groups signal_process_groups = exposed(query(signal_process_groups)) def start(self, process_group): thread.start_new_thread(self._mpirun, (process_group, )) def _mpirun(self, process_group): argv = process_group._get_argv() try: stdout = open(process_group.stdout or "/dev/null", "a") except: stdout = open("/dev/null", "a") try: stderr = open(process_group.stderr or "/dev/null", "a") except: stderr = open("/dev/null", "a") try: clfn = process_group.cobalt_log_file or "/dev/null" cobalt_log_file = open(clfn, "a") print >> cobalt_log_file, "%s\n" % " ".join(argv[1:]) cobalt_log_file.close() except: logger.error("Job %s/%s: unable to open cobaltlog file %s", process_group.id, process_group.user, clfn, exc_info=True) try: partition = argv[argv.index("-partition") + 1] except ValueError: print >> stderr, "ERROR: '-partition' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-partition' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: mode = argv[argv.index("-mode") + 1] except ValueError: print >> stderr, "ERROR: '-mode' is a required flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-mode' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: size = argv[argv.index("-np") + 1] except ValueError: print >> stderr, "ERROR: '-np' is a required 
flag" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return except IndexError: print >> stderr, "ERROR: '-np' requires a value" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return try: size = int(size) except ValueError: print >> stderr, "ERROR: '-np' got invalid value %r" % (size) print >> stderr, "FE_MPI (Info) : Exit status: 1" print >> stdout, "ENVIRONMENT" print >> stdout, "-----------" for key, value in process_group.env.iteritems(): print >> stdout, "%s=%s" % (key, value) print >> stdout print >> stderr, "FE_MPI (Info) : Initializing MPIRUN" reserved = self.reserve_partition(partition, size) if not reserved: print >> stderr, "BE_MPI (ERROR): Failed to run process on partition" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (ERROR): Failure list:" print >> stderr, "FE_MPI (ERROR): - 1. ProcessGroup execution failed - unable to reserve partition", partition print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return hardware_failure = False for nc in self.partitions[partition].node_cards: if nc.id in self.failed_components: hardware_failure = True break for switch in self.partitions[partition].switches: if switch in self.failed_components: hardware_failure = True break if hardware_failure: excuses = [ "incorrectly polarized packet accelerator", "the Internet is full", "side fumbling detected", "unilateral phase detractors offline", ] print >> stderr, "BE_MPI (ERROR): Booting aborted - partition is in DEALLOCATING ('D') state" print >> stderr, "BE_MPI (ERROR): Partition has not reached the READY ('I') state" print >> stderr, "BE_MPI (Info) : Checking for block error text:" print >> stderr, "BE_MPI (ERROR): block error text '%s.'" % random.choice( excuses) print >> stderr, "BE_MPI (Info) : Starting cleanup sequence" time.sleep(20) self.release_partition(partition) print >> stderr, "BE_MPI (Info) 
: Partition", partition, "switched to state FREE ('F')" print >> stderr, "FE_MPI (ERROR): Failure list:" print >> stderr, "FE_MPI (ERROR): - 1.", partition, "couldn't boot." print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return print >> stderr, "FE_MPI (Info) : process group with id", process_group.id print >> stderr, "FE_MPI (Info) : Waiting for process_group to terminate" print >> stdout, "Running process_group: %s" % " ".join(argv) start_time = time.time() run_time = random.randint(self.MIN_RUN_TIME, self.MAX_RUN_TIME) my_exit_status = 0 self.logger.info("process group %d running for about %f seconds", process_group.id, run_time) while time.time() < (start_time + run_time): if "SIGKILL" in process_group.signals: process_group.exit_status = 1 return elif "SIGTERM" in process_group.signals: print >> stderr, "FE_MPI (Info) : ProcessGroup got signal SIGTERM" my_exit_status = 1 break else: time.sleep(1) # tumblers better than pumpers print >> stderr, "FE_MPI (Info) : ProcessGroup", process_group.id, "switched to state TERMINATED ('T')" print >> stderr, "FE_MPI (Info) : ProcessGroup sucessfully terminated" print >> stderr, "BE_MPI (Info) : Releasing partition", partition released = self.release_partition(partition) if not released: print >> stderr, "BE_MPI (ERROR): Partition", partition, "could not switch to state FREE ('F')" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status: 1" process_group.exit_status = 1 return print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')" print >> stderr, "BE_MPI (Info) : BE completed" print >> stderr, "FE_MPI (Info) : FE completed" print >> stderr, "FE_MPI (Info) : Exit status:", my_exit_status process_group.exit_status = my_exit_status def update_partition_state(self): # first, set all of the nodecards to not busy for nc in 
self.node_card_cache.values(): nc.used_by = '' self._partitions_lock.acquire() try: for p in self._partitions.values(): p._update_node_cards() now = time.time() # since we don't have the bridge, a partition which isn't busy # should be set to idle and then blocked states can be derived for p in self._partitions.values(): if p.state != "busy": p.state = "idle" if p.reserved_until and now > p.reserved_until: p.reserved_until = None p.reserved_by = None for p in self._partitions.values(): if p.state == "busy": # when the partition becomes busy, if a script job isn't reserving it, then release the reservation if not p.reserved_by: p.reserved_until = False else: if p.reserved_until: p.state = "allocated" for part in p._parents: if part.state == "idle": part.state = "blocked (%s)" % (p.name, ) for part in p._children: if part.state == "idle": part.state = "blocked (%s)" % (p.name, ) for diag_part in self.pending_diags: if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children: p.state = "blocked by pending diags" for nc in p.node_cards: if nc.used_by: p.state = "blocked (%s)" % nc.used_by break for dep_name in p._wiring_conflicts: if self._partitions[dep_name].state in [ "allocated", "busy" ]: p.state = "blocked-wiring (%s)" % dep_name break for part_name in self.failed_diags: part = self._partitions[part_name] if p.name == part.name: p.state = "failed diags" elif p.name in part.parents or p.name in part.children: p.state = "blocked by failed diags" except: self.logger.error("error in update_partition_state", exc_info=True) self._partitions_lock.release() update_partition_state = automatic(update_partition_state) def add_failed_components(self, component_names): success = [] for name in component_names: if self.node_card_cache.has_key(name): self.failed_components.add(name) success.append(name) else: for p in self._partitions.values(): if name in p.switches: self.failed_components.add(name) success.append(name) break return success 
# NOTE(review): this binds the singular name ``add_failed_component``, whereas
# the sibling assignments below rebind the method's own name -- confirm whether
# the alias is intentional before renaming it.
add_failed_component = exposed(add_failed_components)


def del_failed_components(self, component_names):
    """Remove the named components from the failed-component collection.

    Arguments:
    component_names -- names of the components to clear

    Returns the list of names that were actually removed; a name whose
    removal raises KeyError is silently left out of the result.
    """
    cleared = []
    for component in component_names:
        try:
            self.failed_components.remove(component)
        except KeyError:
            # not currently recorded as failed, so there is nothing to report
            continue
        cleared.append(component)
    return cleared
del_failed_components = exposed(del_failed_components)


def list_failed_components(self, component_names):
    """Return the currently failed components as a new list.

    Arguments:
    component_names -- accepted but not used by this method
    """
    return [component for component in self.failed_components]
list_failed_components = exposed(list_failed_components)


def launch_diags(self, partition, test_name):
    """Report a diagnostics result derived from the failed-component collection.

    Arguments:
    partition -- partition object providing node_cards and switches
    test_name -- passed through unchanged to finish_diags

    The exit value handed to finish_diags is 2 when any of the partition's
    switches is marked failed, otherwise 1 when any of its node cards is
    marked failed, otherwise 0.
    """
    # Both collections are scanned in full, node cards first, exactly as the
    # two consecutive loops they replace did.
    failed_cards = [card for card in partition.node_cards
                    if card.id in self.failed_components]
    failed_switches = [switch for switch in partition.switches
                       if switch in self.failed_components]

    # A failed switch takes precedence over a failed node card.
    if failed_switches:
        exit_value = 2
    elif failed_cards:
        exit_value = 1
    else:
        exit_value = 0

    self.finish_diags(partition, test_name, exit_value)
class Simulator(BGBaseSystem): """Generic system simulator. Methods: configure -- load partitions from an xml file reserve_partition -- lock a partition for use by a process_group (exposed) release_partition -- release a locked (busy) partition (exposed) add_process_groups -- add (start) a process group on the system (exposed, query) get_process_groups -- retrieve process groups (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- simulates updating partition state from the bridge API (automatic) """ name = "system" implementation = "simulator" logger = logger MIN_RUN_TIME = 60 MAX_RUN_TIME = 180 def __init__(self, *args, **kwargs): BGBaseSystem.__init__(self, *args, **kwargs) sys.setrecursionlimit(5000) #why this magic number? self.process_groups.item_cls = BGSimProcessGroup self.config_file = kwargs.get("config_file", None) self.failed_components = set() if self.config_file is not None: self.configure(self.config_file) def __getstate__(self): flags = {} for part in self._partitions.values(): sched = None func = None queue = None if hasattr(part, 'scheduled'): sched = part.scheduled if hasattr(part, 'functional'): func = part.functional if hasattr(part, 'queue'): queue = part.queue flags[part.name] = (sched, func, queue) return { 'managed_partitions': self._managed_partitions, 'version': 2, 'config_file': self.config_file, 'partition_flags': flags } def __setstate__(self, state): Cobalt.Util.fix_set(state) sys.setrecursionlimit(5000) self._managed_partitions = state['managed_partitions'] self.config_file = state['config_file'] self._partitions = PartitionDict() self.process_groups = BGProcessGroupDict() self.process_groups.item_cls = BGSimProcessGroup self.node_card_cache = dict() self._partitions_lock = thread.allocate_lock() self.failed_components = set() 
self.pending_diags = dict() self.failed_diags = list() self.bridge_in_error = False self.cached_partitions = None self.offline_partitions = [] if self.config_file is not None: self.configure(self.config_file) if 'partition_flags' in state: for pname, flags in state['partition_flags'].items(): if pname in self._partitions: self._partitions[pname].scheduled = flags[0] self._partitions[pname].functional = flags[1] self._partitions[pname].queue = flags[2] else: logger.info("Partition %s is no longer defined" % pname) self.update_relatives() self.lock = threading.Lock() self.statistics = Statistics() def save_me(self): Component.save(self) save_me = automatic(save_me) def configure(self, config_file): """Configure simulated partitions. Arguments: config_file -- xml configuration file """ def _get_node_card(name): if not self.node_card_cache.has_key(name): self.node_card_cache[name] = NodeCard(name) return self.node_card_cache[name] self.logger.info("configure()") try: system_doc = ElementTree.parse(config_file) except IOError: self.logger.error("unable to open file: %r" % config_file) self.logger.error("exiting...") sys.exit(1) except: self.logger.error("problem loading data from file: %r" % config_file) self.logger.error("exiting...") sys.exit(1) system_def = system_doc.getroot() if system_def.tag != "BG": self.logger.error("unexpected root element in %r: %r" % (config_file, system_def.tag)) self.logger.error("exiting...") sys.exit(1) # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API NODES_PER_NODECARD = 32 # initialize a new partition dict with all partitions # partitions = PartitionDict() tmp_list = [] # this is going to hold partition objects from the bridge (not our own Partition) wiring_cache = {} bp_cache = {} for partition_def in system_def.getiterator("Partition"): node_list = [] switch_list = [] for nc in partition_def.getiterator("NodeCard"): node_list.append(_get_node_card(nc.get("id"))) nc_count = 
len(node_list) if not wiring_cache.has_key(nc_count): wiring_cache[nc_count] = [] wiring_cache[nc_count].append(partition_def.get("name")) for s in partition_def.getiterator("Switch"): switch_list.append(s.get("id")) tmp_list.append( dict( name=partition_def.get("name"), queue=partition_def.get("queue", "default"), size=NODES_PER_NODECARD * nc_count, node_cards=node_list, switches=switch_list, state="idle", )) partitions.q_add(tmp_list) # find the wiring deps for size in wiring_cache: for p in wiring_cache[size]: p = partitions[p] s1 = set(p.switches) for other in wiring_cache[size]: other = partitions[other] if (p.name == other.name): continue s2 = set(other.switches) if s1.intersection(s2): self.logger.info( "found a wiring dep between %s and %s", p.name, other.name) partitions[p.name]._wiring_conflicts.add(other.name) # update object state self._partitions.clear() self._partitions.update(partitions) def reserve_partition(self, name, size=None): """Reserve a partition and block all related partitions. 
Arguments: name -- name of the partition to reserve size -- size of the process group reserving the partition (optional) """ try: partition = self.partitions[name] except KeyError: self.logger.error("reserve_partition(%r, %r) [does not exist]" % (name, size)) return False if partition.state != "allocated": self.logger.error("reserve_partition(%r, %r) [%s]" % (name, size, partition.state)) return False if not partition.functional: self.logger.error("reserve_partition(%r, %r) [not functional]" % (name, size)) if size is not None and size > partition.size: self.logger.error("reserve_partition(%r, %r) [size mismatch]" % (name, size)) return False self._partitions_lock.acquire() try: partition.state = "busy" partition.reserved_until = False except: self.logger.error("error in reserve_partition", exc_info=True) self._partitions_lock.release() # explicitly call this, since the above "busy" is instantaneously available self.update_partition_state() self.logger.info("reserve_partition(%r, %r)" % (name, size)) return True reserve_partition = exposed(reserve_partition) def release_partition(self, name): """Release a reserved partition. Arguments: name -- name of the partition to release """ try: partition = self.partitions[name] except KeyError: self.logger.error("release_partition(%r) [already free]" % (name)) return False if not partition.state == "busy": self.logger.info("release_partition(%r) [not busy]" % (name)) return False self._partitions_lock.acquire() try: partition.state = "idle" except: self.logger.error("error in release_partition", exc_info=True) self._partitions_lock.release() # explicitly unblock the blocked partitions self.update_partition_state() self.logger.info("release_partition(%r)" % (name)) return True release_partition = exposed(release_partition) def add_process_groups(self, specs): """Create a simulated process group. 
Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)" % (specs)) # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do. another flag should be # added to the process group that wait_process_group uses to determine when a process group is no longer active. an # error message should also be attached to the process group so that cqm can report the problem to the user. process_groups = self.process_groups.q_add(specs) for pgroup in process_groups: pgroup.label = "Job %s/%s/%s" % (pgroup.jobid, pgroup.user, pgroup.id) pgroup.nodect = self._partitions[pgroup.location[0]].size self.logger.info( "%s: process group %s created to track job status", pgroup.label, pgroup.id) try: #TODO: allow the kernel set step to work in the simulator. For now this doesn't fly. pass #self._set_kernel(pgroup.location[0], pgroup.kernel) except Exception, e: self.logger.error("%s: failed to set the kernel; %s", pgroup.label, e) pgroup.exit_status = 255 else: if pgroup.kernel != "default": self.logger.info("%s: now using kernel %s", pgroup.label, pgroup.kernel) if pgroup.mode == "script": pgroup.forker = 'user_script_forker' else: pgroup.forker = 'bg_mpirun_forker' if self.reserve_resources_until( pgroup.location, float(pgroup.starttime) + 60 * float(pgroup.walltime), pgroup.jobid): try: pgroup.start() if pgroup.head_pid == None: self.logger.error( "%s: process group failed to start using the %s component; releasing resources", pgroup.label, pgroup.forker) self.reserve_resources_until( pgroup.location, None, pgroup.jobid) pgroup.exit_status = 255 except (ComponentLookupError, xmlrpclib.Fault), e: self.logger.error( "%s: failed to contact the %s component", pgroup.label, pgroup.forker) # do not release the resources; instead re-raise the exception and allow cqm to the opportunity to retry # until the job has exhausted its maximum alloted time del self.process_groups[pgroup.id] raise except 
(ComponentLookupError, xmlrpclib.Fault), e: self.logger.error( "%s: a fault occurred while attempting to start the process group using the %s " "component", pgroup.label, pgroup.forker) # do not release the resources; instead re-raise the exception and allow cqm to the opportunity to retry # until the job has exhausted its maximum alloted time del self.process_groups[process_group.id] raise except:
class Simulator (BGBaseSystem): """Generic system simulator. Methods: configure -- load partitions from an xml file reserve_partition -- lock a partition for use by a process_group (exposed) release_partition -- release a locked (busy) partition (exposed) add_process_groups -- add (start) a process group on the system (exposed, query) get_process_groups -- retrieve process groups (exposed, query) wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query) signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query) update_partition_state -- simulates updating partition state from the bridge API (automatic) """ name = "system" implementation = "simulator" logger = logger MIN_RUN_TIME = 60 MAX_RUN_TIME = 180 def __init__ (self, *args, **kwargs): BGBaseSystem.__init__(self, *args, **kwargs) sys.setrecursionlimit(5000) #why this magic number? self.process_groups.item_cls = BGSimProcessGroup self.config_file = kwargs.get("config_file", None) self.failed_components = set() if self.config_file is not None: self.configure(self.config_file) def __getstate__(self): flags = {} for part in self._partitions.values(): sched = None func = None queue = None if hasattr(part, 'scheduled'): sched = part.scheduled if hasattr(part, 'functional'): func = part.functional if hasattr(part, 'queue'): queue = part.queue flags[part.name] = (sched, func, queue) return {'managed_partitions':self._managed_partitions, 'version':2, 'config_file':self.config_file, 'partition_flags': flags} def __setstate__(self, state): Cobalt.Util.fix_set(state) sys.setrecursionlimit(5000) self._managed_partitions = state['managed_partitions'] self.config_file = state['config_file'] self._partitions = PartitionDict() self.process_groups = BGProcessGroupDict() self.process_groups.item_cls = BGSimProcessGroup self.node_card_cache = dict() self._partitions_lock = thread.allocate_lock() self.failed_components = set() 
self.pending_diags = dict() self.failed_diags = list() self.bridge_in_error = False self.cached_partitions = None self.offline_partitions = [] if self.config_file is not None: self.configure(self.config_file) if 'partition_flags' in state: for pname, flags in state['partition_flags'].items(): if pname in self._partitions: self._partitions[pname].scheduled = flags[0] self._partitions[pname].functional = flags[1] self._partitions[pname].queue = flags[2] else: logger.info("Partition %s is no longer defined" % pname) self.update_relatives() self.lock = threading.Lock() self.statistics = Statistics() def save_me(self): Component.save(self) save_me = automatic(save_me) def configure (self, config_file): """Configure simulated partitions. Arguments: config_file -- xml configuration file """ def _get_node_card(name): if not self.node_card_cache.has_key(name): self.node_card_cache[name] = NodeCard(name) return self.node_card_cache[name] self.logger.info("configure()") try: system_doc = ElementTree.parse(config_file) except IOError: self.logger.error("unable to open file: %r" % config_file) self.logger.error("exiting...") sys.exit(1) except: self.logger.error("problem loading data from file: %r" % config_file) self.logger.error("exiting...") sys.exit(1) system_def = system_doc.getroot() if system_def.tag != "BG": self.logger.error("unexpected root element in %r: %r" % (config_file, system_def.tag)) self.logger.error("exiting...") sys.exit(1) # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API NODES_PER_NODECARD = 32 # initialize a new partition dict with all partitions # partitions = PartitionDict() tmp_list = [] # this is going to hold partition objects from the bridge (not our own Partition) wiring_cache = {} bp_cache = {} for partition_def in system_def.getiterator("Partition"): node_list = [] switch_list = [] for nc in partition_def.getiterator("NodeCard"): node_list.append(_get_node_card(nc.get("id"))) nc_count = 
len(node_list) if not wiring_cache.has_key(nc_count): wiring_cache[nc_count] = [] wiring_cache[nc_count].append(partition_def.get("name")) for s in partition_def.getiterator("Switch"): switch_list.append(s.get("id")) tmp_list.append( dict( name = partition_def.get("name"), queue = partition_def.get("queue", "default"), size = NODES_PER_NODECARD * nc_count, node_cards = node_list, switches = switch_list, state = "idle", )) partitions.q_add(tmp_list) # find the wiring deps for size in wiring_cache: for p in wiring_cache[size]: p = partitions[p] s1 = set( p.switches ) for other in wiring_cache[size]: other = partitions[other] if (p.name == other.name): continue s2 = set( other.switches ) if s1.intersection(s2): self.logger.info("found a wiring dep between %s and %s", p.name, other.name) partitions[p.name]._wiring_conflicts.add(other.name) # update object state self._partitions.clear() self._partitions.update(partitions) def reserve_partition (self, name, size=None): """Reserve a partition and block all related partitions. 
Arguments: name -- name of the partition to reserve size -- size of the process group reserving the partition (optional) """ try: partition = self.partitions[name] except KeyError: self.logger.error("reserve_partition(%r, %r) [does not exist]" % (name, size)) return False if partition.state != "allocated": self.logger.error("reserve_partition(%r, %r) [%s]" % (name, size, partition.state)) return False if not partition.functional: self.logger.error("reserve_partition(%r, %r) [not functional]" % (name, size)) if size is not None and size > partition.size: self.logger.error("reserve_partition(%r, %r) [size mismatch]" % (name, size)) return False self._partitions_lock.acquire() try: partition.state = "busy" partition.reserved_until = False except: self.logger.error("error in reserve_partition", exc_info=True) self._partitions_lock.release() # explicitly call this, since the above "busy" is instantaneously available self.update_partition_state() self.logger.info("reserve_partition(%r, %r)" % (name, size)) return True reserve_partition = exposed(reserve_partition) def release_partition (self, name): """Release a reserved partition. Arguments: name -- name of the partition to release """ try: partition = self.partitions[name] except KeyError: self.logger.error("release_partition(%r) [already free]" % (name)) return False if not partition.state == "busy": self.logger.info("release_partition(%r) [not busy]" % (name)) return False self._partitions_lock.acquire() try: partition.state = "idle" except: self.logger.error("error in release_partition", exc_info=True) self._partitions_lock.release() # explicitly unblock the blocked partitions self.update_partition_state() self.logger.info("release_partition(%r)" % (name)) return True release_partition = exposed(release_partition) def add_process_groups (self, specs): """Create a simulated process group. 
Arguments: spec -- dictionary hash specifying a process group to start """ self.logger.info("add_process_groups(%r)" % (specs)) # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do. another flag should be # added to the process group that wait_process_group uses to determine when a process group is no longer active. an # error message should also be attached to the process group so that cqm can report the problem to the user. process_groups = self.process_groups.q_add(specs) for pgroup in process_groups: pgroup.label = "Job %s/%s/%s" % (pgroup.jobid, pgroup.user, pgroup.id) pgroup.nodect = self._partitions[pgroup.location[0]].size self.logger.info("%s: process group %s created to track job status", pgroup.label, pgroup.id) try: #TODO: allow the kernel set step to work in the simulator. For now this doesn't fly. pass #self._set_kernel(pgroup.location[0], pgroup.kernel) except Exception, e: self.logger.error("%s: failed to set the kernel; %s", pgroup.label, e) pgroup.exit_status = 255 else: if pgroup.kernel != "default": self.logger.info("%s: now using kernel %s", pgroup.label, pgroup.kernel) if pgroup.mode == "script": pgroup.forker = 'user_script_forker' else: pgroup.forker = 'bg_mpirun_forker' if self.reserve_resources_until(pgroup.location, float(pgroup.starttime) + 60*float(pgroup.walltime), pgroup.jobid): try: pgroup.start() if pgroup.head_pid == None: self.logger.error("%s: process group failed to start using the %s component; releasing resources", pgroup.label, pgroup.forker) self.reserve_resources_until(pgroup.location, None, pgroup.jobid) pgroup.exit_status = 255 except (ComponentLookupError, xmlrpclib.Fault), e: self.logger.error("%s: failed to contact the %s component", pgroup.label, pgroup.forker) # do not release the resources; instead re-raise the exception and allow cqm to the opportunity to retry # until the job has exhausted its maximum alloted time del self.process_groups[pgroup.id] raise except 
(ComponentLookupError, xmlrpclib.Fault), e: self.logger.error("%s: a fault occurred while attempting to start the process group using the %s " "component", pgroup.label, pgroup.forker) # do not release the resources; instead re-raise the exception and allow cqm to the opportunity to retry # until the job has exhausted its maximum alloted time del self.process_groups[process_group.id] raise except: