class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()
    on = SlurmdCharmEvents()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            nhc_conf=str(),
            slurm_installed=False,
            slurmctld_available=False,
            slurmctld_started=False,
            cluster_name=str(),
        )

        self._slurm_manager = SlurmManager(self, "slurmd")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        # interface to slurmctld, should only have one slurmctld per slurmd app
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._on_config_changed,
            self.on.slurmctld_started: self._on_slurmctld_started,
            self.on.slurmd_start: self._on_slurmd_start,
            self.on.check_etcd: self._on_check_etcd,
            self._slurmd.on.slurmctld_available: self._on_slurmctld_available,
            self._slurmd.on.slurmctld_unavailable: self._on_slurmctld_unavailable,
            # fluentbit
            self.on["fluentbit"].relation_created: self._on_configure_fluentbit,
            # actions
            self.on.version_action: self._on_version_action,
            self.on.node_configured_action: self._on_node_configured_action,
            self.on.get_node_inventory_action: self._on_get_node_inventory_action,
            self.on.show_nhc_config_action: self._on_show_nhc_config,
            # infiniband actions
            self.on.get_infiniband_repo_action: self.get_infiniband_repo,
            self.on.set_infiniband_repo_action: self.set_infiniband_repo,
            self.on.install_infiniband_action: self.install_infiniband,
            self.on.uninstall_infiniband_action: self.uninstall_infiniband,
            self.on.start_infiniband_action: self.start_infiniband,
            self.on.enable_infiniband_action: self.enable_infiniband,
            self.on.stop_infiniband_action: self.stop_infiniband,
            self.on.is_active_infiniband_action: self.is_active_infiniband,
            # nvidia actions
            self.on.nvidia_repo_action: self.nvidia_repo,
            self.on.nvidia_package_action: self.nvidia_package,
            self.on.nvidia_install_action: self.nvidia_install,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmd."""
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)
        logger.debug(f"### slurmd installed: {successful_installation}")

        if successful_installation:
            self._stored.slurm_installed = True
        else:
            self.unit.status = BlockedStatus("Error installing slurmd")
            event.defer()

        self._check_status()

    def _on_configure_fluentbit(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _check_status(self) -> bool:
        """Check if we have all needed components.
        - partition name
        - slurm installed
        - slurmctld available and working
        - munge key configured and working
        """
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self.get_partition_name():
            self.unit.status = WaitingStatus("Waiting on charm configuration")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmd")
            return False

        if not self._slurmd.is_joined:
            self.unit.status = BlockedStatus("Need relations: slurmctld")
            return False

        if not self._stored.slurmctld_available:
            self.unit.status = WaitingStatus("Waiting on: slurmctld")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = BlockedStatus("Error configuring munge key")
            return False

        if not self._stored.slurmctld_started:
            self.unit.status = WaitingStatus("Waiting for slurmctld to start")
            return False

        self.unit.status = ActiveStatus("slurmd available")
        return True

    def ensure_slurmd_starts(self, max_attempts=10) -> bool:
        """Ensure slurmd is up and running."""
        logger.debug("## Stopping slurmd")
        self._slurm_manager.slurm_systemctl('stop')

        for i in range(max_attempts):
            if self._slurm_manager.slurm_is_active():
                logger.debug("## Slurmd running")
                break
            else:
                logger.warning("## Slurmd not running, trying to start it")
                self.unit.status = WaitingStatus("Starting slurmd")
                self._slurm_manager.restart_slurm_component()
                sleep(2 + i)

        if self._slurm_manager.slurm_is_active():
            return True
        else:
            self.unit.status = BlockedStatus("Cannot start slurmd")
            return False

    def _set_slurmctld_available(self, flag: bool):
        """Change stored value for slurmctld availability."""
        self._stored.slurmctld_available = flag

    def _set_slurmctld_started(self, flag: bool):
        """Change stored value for slurmctld started."""
        self._stored.slurmctld_started = flag

    def _on_slurmctld_available(self, event):
        """Get data from slurmctld and send inventory."""
        if not self._stored.slurm_installed:
            event.defer()
            return

        logger.debug('#### Slurmctld available - setting overrides for configless')
        # get slurmctld host:port from relation and override systemd services
        host = self._slurmd.slurmctld_hostname
        port = self._slurmd.slurmctld_port
        self._slurm_manager.create_configless_systemd_override(host, port)
        self._slurm_manager.daemon_reload()

        self._write_munge_key_and_restart_munge()

        self._set_slurmctld_available(True)
        self._on_set_partition_info_on_app_relation_data(event)
        self._check_status()

        # check etcd for hostnames
        self.on.check_etcd.emit()

    def _on_check_etcd(self, event):
        """Check if node is accounted for.

        Check if slurmctld accounted for this node's inventory for the first
        time; if so, emit the slurmctld_started event so the node can start
        the daemon.
        """
        host = self._slurmd.slurmctld_address
        port = self._slurmd.etcd_port
        logger.debug(f"## Connecting to etcd3 at {host}:{port}")
        client = Etcd3Client(host=host, port=port, api_path="/v3/")

        logger.debug("## Querying etcd3 for node list")
        try:
            v = client.get(key="all_nodes")
            logger.debug(f"## Got: {v}")
        except Exception as e:
            logger.error(
                f"## Unable to connect to {host} to get list of nodes: {e}")
            event.defer()
            return

        node_accounted = False
        if v:
            hostnames = json.loads(v[0])
            logger.debug(f"### etcd3 node list: {hostnames}")
            if self.hostname in hostnames:
                self.on.slurmctld_started.emit()
                node_accounted = True

        if not node_accounted:
            logger.debug("## Node not accounted for. Deferring.")
            event.defer()

    def _on_slurmctld_unavailable(self, event):
        logger.debug("## Slurmctld unavailable")
        self._set_slurmctld_available(False)
        self._set_slurmctld_started(False)
        self._slurm_manager.slurm_systemctl('stop')
        self._check_status()

    def _on_slurmctld_started(self, event):
        """Set flag to True and emit slurmd_start event."""
        self._set_slurmctld_started(True)
        self.on.slurmd_start.emit()

    def _on_slurmd_start(self, event):
        if not self._check_status():
            event.defer()
            return

        # only set up fluentbit if we have a relation to it
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

        # at this point, we have slurm installed, munge configured, and we know
        # slurmctld accounted for this node. It should be safe to start slurmd
        if self.ensure_slurmd_starts():
            logger.debug("## slurmctld started and slurmd is running")
        else:
            event.defer()

        self._check_status()

    def _on_config_changed(self, event):
        """Handle charm configuration changes."""
        if self.model.unit.is_leader():
            logger.debug("## slurmd config changed - leader")
            self._on_set_partition_info_on_app_relation_data(event)

        nhc_conf = self.model.config.get('nhc-conf')
        if nhc_conf:
            if nhc_conf != self._stored.nhc_conf:
                self._stored.nhc_conf = nhc_conf
                self._slurm_manager.render_nhc_config(nhc_conf)

    def get_partition_name(self) -> str:
        """Return the partition_name in the slurmd relation."""
        # Determine if a user-supplied partition-name config exists; if so,
        # ensure the partition_name is consistent with the supplied config.
        # If no partition name has been specified then generate one.
        partition_name = self._slurmd_peer.partition_name
        partition_name_from_config = self.config.get("partition-name")

        if partition_name:
            if partition_name_from_config:
                partition_name_from_config = partition_name_from_config.replace(' ', '-')
                if partition_name != partition_name_from_config:
                    self._set_partition_name(partition_name_from_config)
                    partition_name = partition_name_from_config
                else:
                    logger.debug("Partition name unchanged.")
            else:
                logger.debug("Partition name unchanged.")
        else:
            partition_name = f"osd-{self.app.name}"
            logger.debug(f"Partition name: {partition_name}")
            self._set_partition_name(partition_name)

        return partition_name

    def _set_partition_name(self, name: str):
        """Set the partition_name in the slurmd relation."""
        if self.model.unit.is_leader():
            self._slurmd_peer.partition_name = name

    def _write_munge_key_and_restart_munge(self):
        logger.debug('#### slurmd charm - writing munge key')

        self._slurm_manager.configure_munge_key(
            self._slurmd.get_stored_munge_key())

        if self._slurm_manager.restart_munged():
            logger.debug("## Munge restarted successfully")
        else:
            logger.error("## Unable to restart munge")

    def _on_version_action(self, event):
        """Return version of installed components.
        - Slurm
        - munge
        - NHC
        - infiniband
        """
        version = {}
        version['slurm'] = self._slurm_manager.slurm_version()
        version['munge'] = self._slurm_manager.munge_version()
        version['nhc'] = self._slurm_manager.nhc_version()
        version['infiniband'] = self._slurm_manager.infiniband_version()

        event.set_results(version)

    def _on_node_configured_action(self, event):
        """Remove node from DownNodes."""
        # trigger reconfig
        self._slurmd.configure_new_node()
        logger.debug('### This node is not new anymore')

    def _on_get_node_inventory_action(self, event):
        """Return node inventory."""
        inventory = self._slurmd.node_inventory
        event.set_results({'inventory': inventory})

    def get_infiniband_repo(self, event):
        """Return the currently used infiniband repository."""
        repo = self._slurm_manager.infiniband.repository
        event.set_results({'infiniband-repo': repo})

    def set_infiniband_repo(self, event):
        """Set the infiniband repository."""
        repo = event.params["repo"]
        logger.debug(f"#### setting custom infiniband repo: {repo}")
        repo = base64.b64decode(repo).decode()
        self._slurm_manager.infiniband.repository = repo

    def install_infiniband(self, event):
        """Install infiniband."""
        logger.debug("#### Installing Infiniband")
        self._slurm_manager.infiniband.install()
        event.set_results({'installation': 'Successful. Please reboot node.'})
        self.unit.status = BlockedStatus("Need reboot for Infiniband")

    def uninstall_infiniband(self, event):
        """Uninstall infiniband."""
        logger.debug("#### Uninstalling Infiniband")
        self._slurm_manager.infiniband.uninstall()

    def start_infiniband(self, event):
        """Start Infiniband systemd service."""
        logger.debug("#### Starting Infiniband service")
        self._slurm_manager.infiniband.start()

    def enable_infiniband(self, event):
        """Enable Infiniband systemd service."""
        logger.debug("#### Enabling Infiniband service")
        self._slurm_manager.infiniband.enable()

    def stop_infiniband(self, event):
        """Stop Infiniband systemd service."""
        logger.debug("#### Stopping Infiniband service")
        self._slurm_manager.infiniband.stop()

    def is_active_infiniband(self, event):
        """Check if Infiniband systemd service is active."""
        status = self._slurm_manager.infiniband.is_active()
        logger.debug(f"#### Infiniband service is-active: {status}")
        event.set_results({'infiniband-is-active': status})

    def nvidia_repo(self, event):
        """Set or get the used nvidia repository."""
        repo = event.params.get("repo", None)
        if repo:
            self._slurm_manager.nvidia.repository = base64.b64decode(repo).decode()

        event.set_results({'nvidia-repo': self._slurm_manager.nvidia.repository})

    def nvidia_package(self, event):
        """Set or get the used nvidia package."""
        package = event.params.get("package", None)
        if package or package == "":
            # user supplied a package name -> store it
            self._slurm_manager.nvidia.package = package

        event.set_results({'nvidia-package': self._slurm_manager.nvidia.package})

    def nvidia_install(self, event):
        """Install nvidia drivers."""
        logger.debug("#### Installing nvidia drivers: %s",
                     self._slurm_manager.nvidia.package)
        self._slurm_manager.nvidia.install()
        event.set_results({'installation': 'Successful. Please reboot node.'})
        self.unit.status = BlockedStatus("Need reboot for nvidia")

    def _on_show_nhc_config(self, event):
        """Show current nhc.conf."""
        nhc_conf = self._slurm_manager.get_nhc_config()
        event.set_results({"nhc.conf": nhc_conf})

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if self.model.unit.is_leader():
            # If the relation with slurmctld exists then set our
            # partition info on the application relation data.
            # This handler shouldn't fire if the relation isn't made,
            # but add this extra check here just in case.
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_partition_info_on_app_relation_data(partition)
                else:
                    event.defer()
            else:
                event.defer()

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self.get_partition_name()
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")
        logger.debug(f"## partition_name: {partition_name}")

        return {
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    @property
    def hostname(self) -> str:
        """Return the hostname."""
        return self._slurm_manager.hostname

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name

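# Note: the custom events emitted above (slurmctld_started, slurmd_start,
# check_etcd) are assumed to be declared on SlurmdCharmEvents elsewhere in
# this charm, following the usual ops pattern. A minimal sketch of what that
# declaration typically looks like, for reference only:
#
#     class SlurmctldStartedEvent(EventBase):
#         """Emitted once slurmctld has accounted for this node."""
#
#     class SlurmdStartEvent(EventBase):
#         """Emitted when it is safe to start slurmd."""
#
#     class CheckEtcdEvent(EventBase):
#         """Emitted to poll etcd for this node's hostname."""
#
#     class SlurmdCharmEvents(CharmEvents):
#         slurmctld_started = EventSource(SlurmctldStartedEvent)
#         slurmd_start = EventSource(SlurmdStartEvent)
#         check_etcd = EventSource(CheckEtcdEvent)
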
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            jwt_key=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmd_available=False,
            slurmrestd_available=False,
            slurmdbd_available=False,
            down_nodes=list(),
        )

        self._slurm_manager = SlurmManager(self, "slurmctld")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        self._user_group = UserGroupProvides(self, "user-group")
        self._etcd = EtcdOps()

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._on_write_slurm_config,
            self.on.leader_elected: self._on_leader_elected,
            # slurm component lifecycle events
            self._slurmdbd.on.slurmdbd_available: self._on_slurmdbd_available,
            self._slurmdbd.on.slurmdbd_unavailable: self._on_slurmdbd_unavailable,
            self._slurmd.on.slurmd_available: self._on_write_slurm_config,
            self._slurmd.on.slurmd_unavailable: self._on_write_slurm_config,
            self._slurmd.on.slurmd_departed: self._on_write_slurm_config,
            self._slurmrestd.on.slurmrestd_available: self._on_slurmrestd_available,
            self._slurmrestd.on.slurmrestd_unavailable: self._on_write_slurm_config,
            self._slurmctld_peer.on.slurmctld_peer_available:
                self._on_write_slurm_config,
            # NOTE: a second slurmctld should get the jwt/munge keys and configure them
            # fluentbit
            self.on["fluentbit"].relation_created: self._on_fluentbit_relation_created,
            # Addons lifecycle events
            self._prolog_epilog.on.prolog_epilog_available: self._on_write_slurm_config,
            self._prolog_epilog.on.prolog_epilog_unavailable: self._on_write_slurm_config,
            self._grafana.on.grafana_available: self._on_grafana_available,
            self._influxdb.on.influxdb_available: self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable: self._on_write_slurm_config,
            self._elasticsearch.on.elasticsearch_available: self._on_elasticsearch_available,
            self._elasticsearch.on.elasticsearch_unavailable: self._on_write_slurm_config,
            self._user_group.on.create_user_group: self._on_create_user_group,
            self._user_group.on.remove_user_group: self._on_remove_user_group,
            # actions
            self.on.show_current_config_action: self._on_show_current_config,
            self.on.drain_action: self._drain_nodes_action,
            self.on.resume_action: self._resume_nodes_action,
            self.on.influxdb_info_action: self._influxdb_info_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    @property
    def hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    @property
    def port(self):
        """Return the port."""
        return self._slurm_manager.port

    @property
    def cluster_name(self) -> str:
        """Return the cluster name."""
        return self.config.get("cluster-name")

    @property
    def _slurmctld_info(self):
        return self._slurmctld_peer.get_slurmctld_info()

    @property
    def slurmdbd_info(self):
        """Return slurmdbd_info from relation."""
        return self._slurmdbd.get_slurmdbd_info()

    @property
    def _slurmd_info(self) -> list:
        return self._slurmd.get_slurmd_info()

    @property
    def _cluster_info(self):
        """Assemble information about the cluster."""
        cluster_info = {}
        cluster_info['cluster_name'] = self.config.get('cluster-name')
        cluster_info['custom_config'] = self.config.get('custom-config')
        cluster_info['proctrack_type'] = self.config.get('proctrack-type')
        cluster_info['cgroup_config'] = self.config.get('cgroup-config')

        interval = self.config.get('health-check-interval')
        state = self.config.get('health-check-state')
        nhc = self._slurm_manager.slurm_config_nhc_values(interval, state)
        cluster_info.update(nhc)

        return cluster_info

    @property
    def _addons_info(self):
        """Assemble addons for slurm.conf."""
        return {
            **self._assemble_prolog_epilog(),
            **self._assemble_acct_gather_addon(),
            **self._assemble_elastic_search_addon(),
        }

    def _assemble_prolog_epilog(self) -> dict:
        """Generate the prolog_epilog section of the addons."""
        logger.debug("## Generating prolog epilog configuration")

        prolog_epilog = self._prolog_epilog.get_prolog_epilog()
        if prolog_epilog:
            return {"prolog_epilog": prolog_epilog}
        else:
            return {}

    def _assemble_acct_gather_addon(self):
        """Generate the acct gather section of the addons."""
        logger.debug("## Generating acct gather configuration")

        addons = dict()

        influxdb_info = self._get_influxdb_info()
        if influxdb_info:
            addons["acct_gather"] = influxdb_info
            addons["acct_gather"]["default"] = "all"
            addons["acct_gather_profile"] = "acct_gather_profile/influxdb"

        # it is possible to set up influxdb or hdf5 profiles without the
        # relation, using the custom-config section of slurm.conf.
        # We need to support setting up the acct_gather configuration for
        # this scenario.
        acct_gather_custom = self.config.get("acct-gather-custom")
        if acct_gather_custom:
            if not addons.get("acct_gather"):
                addons["acct_gather"] = dict()

            addons["acct_gather"]["custom"] = acct_gather_custom

        addons["acct_gather_frequency"] = self.config.get("acct-gather-frequency")

        return addons

    def _assemble_elastic_search_addon(self):
        """Generate the elasticsearch section of the addons."""
        logger.debug("## Generating elastic search addon configuration")

        addon = dict()

        elasticsearch_ingress = self._elasticsearch.elasticsearch_ingress
        if elasticsearch_ingress:
            suffix = f"/{self.cluster_name}/jobcomp"
            addon = {"elasticsearch_address": f"{elasticsearch_ingress}{suffix}"}

        return addon

    def set_slurmd_available(self, flag: bool):
        """Set stored value of slurmd available."""
        self._stored.slurmd_available = flag

    def _set_slurmdbd_available(self, flag: bool):
        """Set stored value of slurmdbd available."""
        self._stored.slurmdbd_available = flag

    def set_slurmrestd_available(self, flag: bool):
        """Set stored value of slurmrestd available."""
        self._stored.slurmrestd_available = flag

    def _is_leader(self):
        return self.model.unit.is_leader()

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def _on_show_current_config(self, event):
        """Show current slurm.conf."""
        slurm_conf = self._slurm_manager.get_slurm_conf()
        event.set_results({"slurm.conf": slurm_conf})

    def _on_install(self, event):
        """Perform installation operations for slurmctld."""
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmctld")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if successful_installation:
            self._stored.slurm_installed = True

            # Store the munge_key and jwt_rsa key in the stored state.
            # NOTE: Use leadership settings instead of stored state when
            # leadership settings support becomes available in the framework.
            if self._is_leader():
                # NOTE: the backup controller should also have the jwt and
                # munge keys configured. We should move this information to
                # the peer relation.
                self._stored.jwt_rsa = self._slurm_manager.generate_jwt_rsa()
                self._stored.munge_key = self._slurm_manager.get_munge_key()
                self._slurm_manager.configure_jwt_rsa(self.get_jwt_rsa())
            else:
                # NOTE: the secondary slurmctld should get the jwt and munge
                # keys from the peer relation here
                logger.debug("secondary slurmctld")

            # all slurmctld should restart munged here, as it would assure
            # munge is working
            self._slurm_manager.restart_munged()
        else:
            self.unit.status = BlockedStatus("Error installing slurmctld")
            event.defer()

        logger.debug("## Retrieving etcd resource to install it")
        try:
            etcd_path = self.model.resources.fetch("etcd")
            logger.debug(f"## Found etcd resource: {etcd_path}")
        except ModelError:
            logger.error("## Missing etcd resource")
            self.unit.status = BlockedStatus("Missing etcd resource")
            event.defer()
            return
        self._etcd.install(etcd_path)

        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
        logger.debug("## slurmctld - leader elected")
        self._etcd.start()

        # populate etcd with the nodelist
        slurm_config = self._assemble_slurm_config()
        accounted_nodes = self._assemble_all_nodes(
            slurm_config.get("partitions", []))
        logger.debug(
            f"## Sending to etcd list of accounted nodes: {accounted_nodes}")
        self._etcd.set_list_of_accounted_nodes(accounted_nodes)

    def _check_status(self):
        """Check for all relations and set appropriate status.

        This charm needs these conditions to be satisfied in order to be ready:
        - Slurm components installed.
        - Munge running.
        - slurmdbd node running.
        - slurmd inventory.
        """
        # NOTE: slurmd and slurmrestd are not needed for slurmctld to work,
        # only for the cluster to operate.
        # But we need slurmd inventory to assemble slurm.conf.
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmctld")
            return False

        if self._is_leader() and not self._etcd.is_active():
            self.unit.status = WaitingStatus("Initializing charm")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = BlockedStatus("Error configuring munge key")
            return False

        # statuses of mandatory components:
        # - joined: someone executed juju relate slurmctld foo
        # - available: the units exchanged data through the relation
        # NOTE: slurmrestd is not mandatory for the cluster to work, that's
        # why it is not accounted for in here
        statuses = {
            "slurmd": {
                "available": self._stored.slurmd_available,
                "joined": self._slurmd.is_joined,
            },
            "slurmdbd": {
                "available": self._stored.slurmdbd_available,
                "joined": self._slurmdbd.is_joined,
            },
        }

        relations_needed = list()
        waiting_on = list()
        for component in statuses.keys():
            if not statuses[component]["joined"]:
                relations_needed.append(component)
            if not statuses[component]["available"]:
                waiting_on.append(component)

        if len(relations_needed):
            msg = f"Need relations: {','.join(relations_needed)}"
            self.unit.status = BlockedStatus(msg)
            return False

        if len(waiting_on):
            msg = f"Waiting on: {','.join(waiting_on)}"
            self.unit.status = WaitingStatus(msg)
            return False

        self.unit.status = ActiveStatus("slurmctld available")
        return True

    def get_munge_key(self):
        """Get the stored munge key."""
        return self._stored.munge_key

    def get_jwt_rsa(self):
        """Get the stored jwt_rsa key."""
        return self._stored.jwt_rsa

    def _assemble_partitions(self, slurmd_info):
        """Make any needed modifications to partition data."""
        slurmd_info_tmp = copy.deepcopy(slurmd_info)
        default_partition_from_config = self.config.get("default-partition")

        for partition in slurmd_info:
            # Deep copy the partition to a tmp var so we can modify it as
            # needed whilst not modifying the object we are iterating over.
            partition_tmp = copy.deepcopy(partition)
            # Extract the partition_name from the partition.
            partition_name = partition["partition_name"]

            # Check that the default_partition isn't defined in the charm
            # config. If the user hasn't provided a default partition, then
            # we infer the partition_default by defaulting to the
            # "configurator" partition.
            if default_partition_from_config:
                if default_partition_from_config == partition_name:
                    partition_tmp["partition_default"] = "YES"

                    slurmd_info_tmp.remove(partition)
                    slurmd_info_tmp.append(partition_tmp)

        return slurmd_info_tmp

    def _assemble_slurm_config(self):
        """Assemble and return the slurm config."""
        logger.debug('## Assembling new slurm.conf')

        slurmctld_info = self._slurmctld_info
        slurmdbd_info = self.slurmdbd_info
        slurmd_info = self._slurmd_info
        cluster_info = self._cluster_info

        logger.debug("######## INFO")
        logger.debug(f'## slurmd: {slurmd_info}')
        logger.debug(f'## slurmctld_info: {slurmctld_info}')
        logger.debug(f'## slurmdbd_info: {slurmdbd_info}')
        logger.debug(f'## cluster_info: {cluster_info}')
        logger.debug("######## INFO - end")

        if not (slurmctld_info and slurmd_info and slurmdbd_info):
            return {}

        addons_info = self._addons_info
        partitions_info = self._assemble_partitions(slurmd_info)
        down_nodes = self._assemble_down_nodes(slurmd_info)

        logger.debug(f'#### addons: {addons_info}')
        logger.debug(f'#### partitions_info: {partitions_info}')
        logger.debug(f"#### Down nodes: {down_nodes}")

        return {
            "partitions": partitions_info,
            "down_nodes": down_nodes,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **cluster_info,
        }

    def _on_slurmrestd_available(self, event):
        """Set slurm_config on the relation when slurmrestd available."""
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - deferring event.")
            event.defer()
            return

        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(slurm_config)
            self._slurmrestd.restart_slurmrestd()

    def _on_slurmdbd_available(self, event):
        self._set_slurmdbd_available(True)
        self._on_write_slurm_config(event)

    def _on_slurmdbd_unavailable(self, event):
        self._set_slurmdbd_available(False)
        self._check_status()

    def _on_write_slurm_config(self, event):
        """Check that we have what we need before we proceed."""
        logger.debug("### Slurmctld - _on_write_slurm_config()")

        # only the leader should write the config, restart, and scontrol reconf
        if not self._is_leader():
            return

        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()
        if slurm_config:
            self._slurm_manager.render_slurm_configs(slurm_config)

            # restart is needed if nodes are added/removed from the cluster
            self._slurm_manager.slurm_systemctl('restart')
            self._slurm_manager.slurm_cmd('scontrol', 'reconfigure')

            # send the list of hostnames to slurmd via etcd
            accounted_nodes = self._assemble_all_nodes(slurm_config["partitions"])
            self._etcd.set_list_of_accounted_nodes(accounted_nodes)

            # send the custom NHC parameters to all slurmd
            self._slurmd.set_nhc_params(self.config.get('health-check-params'))

            # check for "not new anymore" nodes, i.e., nodes that ran the
            # node-configured action.
            # Those nodes are no longer in the DownNodes section of
            # slurm.conf, but we need to resume them manually and update
            # the internal cache.
            down_nodes = slurm_config['down_nodes']
            configured_nodes = self._assemble_configured_nodes(down_nodes)
            logger.debug(f"### configured nodes: {configured_nodes}")
            self._resume_nodes(configured_nodes)
            self._stored.down_nodes = down_nodes.copy()

            # slurmrestd needs the slurm.conf file, so send it every time it changes
            if self._stored.slurmrestd_available:
                self._slurmrestd.set_slurm_config_on_app_relation_data(slurm_config)
                # NOTE: scontrol reconfigure does not restart slurmrestd
                self._slurmrestd.restart_slurmrestd()
        else:
            logger.debug("## Should rewrite slurm.conf, but we don't have it. "
                         "Deferring.")
            event.defer()

    @staticmethod
    def _assemble_all_nodes(slurmd_info: list) -> List[str]:
        """Parse slurmd_info and return a list with all hostnames."""
        nodes = list()
        for partition in slurmd_info:
            for node in partition["inventory"]:
                nodes.append(node["node_name"])
        return nodes

    @staticmethod
    def _assemble_down_nodes(slurmd_info):
        """Parse partitions' nodes and assemble a list of DownNodes."""
        down_nodes = []
        for partition in slurmd_info:
            for node in partition["inventory"]:
                if node["new_node"]:
                    down_nodes.append(node["node_name"])
        return down_nodes

    def _assemble_configured_nodes(self, down_nodes):
        """Assemble list of nodes that are not new anymore.

        new_node status is removed with an action; this method returns a list
        of nodes that were previously new but are not anymore.
        """
        configured_nodes = []
        for node in self._stored.down_nodes:
            if node not in down_nodes:
                configured_nodes.append(node)
        return configured_nodes

    def _resume_nodes(self, nodelist):
        """Run scontrol to resume the specified node list."""
        nodes = ",".join(nodelist)
        update_cmd = f"update nodename={nodes} state=resume"
        self._slurm_manager.slurm_cmd('scontrol', update_cmd)

    def _on_grafana_available(self, event):
        """Create the grafana-source if we are the leader and have influxdb."""
        if not self._is_leader():
            return

        influxdb_info = self._get_influxdb_info()
        if influxdb_info:
            self._grafana.set_grafana_source_info(influxdb_info)
        else:
            logger.error("## Cannot set Grafana source: missing influxdb relation")

    def _on_influxdb_available(self, event):
        """Assemble addons to forward slurm data to influxdb."""
        self._on_write_slurm_config(event)

    def _on_elasticsearch_available(self, event):
        """Assemble addons to forward Slurm data to elasticsearch."""
        self._on_write_slurm_config(event)

    def _get_influxdb_info(self) -> dict:
        """Return influxdb info."""
        return self._influxdb.get_influxdb_info()

    def _drain_nodes_action(self, event):
        """Drain specified nodes."""
        nodes = event.params['nodename']
        reason = event.params['reason']

        logger.debug(f'#### Draining {nodes} because {reason}.')
        event.log(f'Draining {nodes} because {reason}.')

        try:
            cmd = f'scontrol update nodename={nodes} state=drain reason="{reason}"'
            subprocess.check_output(shlex.split(cmd))
            event.set_results({'status': 'draining', 'nodes': nodes})
        except subprocess.CalledProcessError as e:
            event.fail(message=f'Error draining {nodes}: {e.output}')

    def _resume_nodes_action(self, event):
        """Resume specified nodes."""
        nodes = event.params['nodename']

        logger.debug(f'#### Resuming {nodes}.')
        event.log(f'Resuming {nodes}.')

        try:
            cmd = f'scontrol update nodename={nodes} state=resume'
            subprocess.check_output(shlex.split(cmd))
            event.set_results({'status': 'resuming', 'nodes': nodes})
        except subprocess.CalledProcessError as e:
            event.fail(message=f'Error resuming {nodes}: {e.output}')

    def _influxdb_info_action(self, event):
        influxdb_info = self._get_influxdb_info()

        if not influxdb_info:
            influxdb_info = "not related"

        logger.debug(f"## InfluxDB-info action: {influxdb_info}")
        event.set_results({"influxdb": influxdb_info})

    def _on_create_user_group(self, event):
        """Create the user and group provided."""
        user = self._user_group.user_name
        user_uid = self._user_group.user_uid
        group = self._user_group.group_name

        # Create the group.
        try:
            # use the UID as the GID
            subprocess.check_output(["groupadd", "--gid", user_uid, group])
        except subprocess.CalledProcessError as e:
            if e.returncode == 9:
                logger.warning("## Group already exists.")
            elif e.returncode == 4:
                logger.warning("## GID already exists.")
                self._user_group._relation.data[self._user_group.model.app][
                    "status"] = "failure: GID already exists"
                return
            else:
                logger.error(f"## Error creating group: {e}")

        # Create the user.
        try:
            subprocess.check_output([
                "useradd", "--system", "--no-create-home",
                "--gid", group, "--shell", "/usr/sbin/nologin",
                "-u", user_uid, user,
            ])
        except subprocess.CalledProcessError as e:
            if e.returncode == 9:
                logger.warning("## User already exists.")
            elif e.returncode == 4:
                logger.warning("## UID already exists.")
                self._user_group._relation.data[self._user_group.model.app][
                    "status"] = "failure: UID already exists"
                return
            else:
                logger.error(f"## Error creating user: {e}")

        self._user_group._relation.data[
            self._user_group.model.app]["status"] = "success: User created"

    def _on_remove_user_group(self, event):
        """Remove the user and group provided."""
        user = self._user_group.user_name
        group = self._user_group.group_name

        # Remove the user.
        try:
            subprocess.check_output(["userdel", user])
        except subprocess.CalledProcessError as e:
            logger.error(f"## Error deleting user: {e}")

        # Remove the group.
        try:
            subprocess.check_output(["groupdel", group])
        except subprocess.CalledProcessError as e:
            logger.error(f"## Error deleting group: {e}")

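# The drain/resume action handlers above wrap `scontrol update nodename=...
# state=...`. As a usage sketch (assuming Juju 2.9-style action invocation;
# the exact CLI syntax depends on the Juju version in use):
#
#     juju run-action slurmctld/0 drain nodename=compute-0 reason="bad disk"
#     juju run-action slurmctld/0 resume nodename=compute-0
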
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()
    on = SlurmdbdCharmEvents()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(
            db_info=dict(),
            jwt_available=False,
            munge_available=False,
            slurm_installed=False,
            cluster_name=str(),
        )

        self._db = MySQLClient(self, "db")
        # self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self.on.jwt_available: self._on_jwt_available,
            self.on.munge_available: self._on_munge_available,
            self.on.write_config: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available: self._write_config_and_restart_slurmdbd,
            self._db.on.database_unavailable: self._on_db_unavailable,
            self._slurmdbd_peer.on.slurmdbd_peer_available: self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmctld_available: self._on_slurmctld_available,
            self._slurmdbd.on.slurmctld_unavailable: self._on_slurmctld_unavailable,
            # fluentbit
            self.on["fluentbit"].relation_created: self._on_fluentbit_relation_created,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmdbd."""
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmdbd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if successful_installation:
            self._stored.slurm_installed = True
            self.unit.status = ActiveStatus("slurmdbd successfully installed")
        else:
            self.unit.status = BlockedStatus("Error installing slurmdbd")
            event.defer()
            return

        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_jwt_available(self, event):
        """Retrieve and configure the jwt_rsa key."""
        # jwt rsa lives in slurm spool dir, it is created when slurm is installed
        if not self._stored.slurm_installed:
            event.defer()
            return

        jwt_rsa = self._slurmdbd.get_jwt_rsa()
        self._slurm_manager.configure_jwt_rsa(jwt_rsa)
        self._stored.jwt_available = True

    def _on_munge_available(self, event):
        """Retrieve munge key and start munged."""
        # munge is installed together with slurm
        if not self._stored.slurm_installed:
            event.defer()
            return

        munge_key = self._slurmdbd.get_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)

        if self._slurm_manager.restart_munged():
            logger.debug("## Munge restarted successfully")
            self._stored.munge_available = True
        else:
            logger.error("## Unable to restart munge")
            self.unit.status = BlockedStatus("Error restarting munge")
            event.defer()

    def _on_db_unavailable(self, event):
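        """Clear the stored db_info when the database relation goes away."""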
        self._stored.db_info = dict()
        # TODO tell slurmctld that slurmdbd left?
        self._check_status()

    def _on_slurmctld_available(self, event):
        self.on.jwt_available.emit()
        self.on.munge_available.emit()
        self.on.write_config.emit()
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

    def _on_slurmctld_unavailable(self, event):
        """Reset state and charm status when slurmctld broken."""
        self._stored.jwt_available = False
        self._stored.munge_available = False
        self._check_status()

    def _is_leader(self):
        return self.model.unit.is_leader()

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        # Ensure all pre-conditions are met with _check_status(); if not,
        # defer the event.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        # settings from the config.yaml
        config = {"slurmdbd_debug": self.config.get("slurmdbd-debug")}

        slurmdbd_config = {
            **config,
            **slurmdbd_info,
            **db_info,
        }

        self._slurm_manager.slurm_systemctl("stop")
        self._slurm_manager.render_slurm_configs(slurmdbd_config)

        # At this point, we must guarantee that slurmdbd is correctly
        # initialized. Its startup might take a while, so we have to wait
        # for it.
        self._check_slurmdbd()

        # Only the leader can set relation data on the application.
        # Enforce that no one other than the leader tries to set
        # application relation data.
        if self.model.unit.is_leader():
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                slurmdbd_config,
            )

        self._check_status()

    def _check_slurmdbd(self, max_attempts=3) -> None:
        """Ensure slurmdbd is up and running."""
        logger.debug("## Checking if slurmdbd is active")

        for i in range(max_attempts):
            if self._slurm_manager.slurm_is_active():
                logger.debug("## Slurmdbd running")
                break
            else:
                logger.warning("## Slurmdbd not running, trying to start it")
                self.unit.status = WaitingStatus("Starting slurmdbd")
                self._slurm_manager.restart_slurm_component()
                sleep(1 + i)

        if self._slurm_manager.slurm_is_active():
            self._check_status()
        else:
            self.unit.status = BlockedStatus("Cannot start slurmdbd")

    def _check_status(self) -> bool:
        """Check that we have the things we need."""
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        slurm_installed = self._stored.slurm_installed
        if not slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurm")
            return False

        # we must be sure to initialize the charms correctly.
        # Slurmdbd must first connect to the db to be able to connect to
        # slurmctld correctly.
        slurmctld_available = (self._stored.jwt_available
                               and self._stored.munge_available)

        statuses = {
            "MySQL": {
                "available": self._stored.db_info != dict(),
                "joined": self._db.is_joined,
            },
            "slurmctld": {
                "available": slurmctld_available,
                "joined": self._slurmdbd.is_joined,
            },
        }

        relations_needed = list()
        waiting_on = list()
        for component in statuses.keys():
            if not statuses[component]["joined"]:
                relations_needed.append(component)
            if not statuses[component]["available"]:
                waiting_on.append(component)

        if len(relations_needed):
            msg = f"Need relations: {','.join(relations_needed)}"
            self.unit.status = BlockedStatus(msg)
            return False

        if len(waiting_on):
            msg = f"Waiting on: {','.join(waiting_on)}"
            self.unit.status = WaitingStatus(msg)
            return False

        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()
        if not slurmdbd_info:
            self.unit.status = WaitingStatus("slurmdbd starting")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = WaitingStatus("munged starting")
            return False

        self.unit.status = ActiveStatus("slurmdbd available")
        return True

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name
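
# Each charm class above is assumed to be wired up through the standard ops
# entrypoint in its own charm.py (a sketch; the actual files may differ):
#
#     if __name__ == "__main__":
#         main(SlurmdbdCharm)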