def __init__(self, *args):
    """Seed stored-state defaults, build the interfaces and observe events."""
    super().__init__(*args)

    self._stored.set_default(
        db_info={},
        slurmdbd_config={},
        munge_key_available=False,
        slurm_installed=False,
    )

    # Relation/interface helpers and the slurm manager for "slurmdbd".
    self._db = MySQLClient(self, "db")
    self._nrpe = Nrpe(self, "nrpe-external-master")
    self._slurm_manager = SlurmManager(self, "slurmdbd")
    self._slurmdbd = Slurmdbd(self, "slurmdbd")
    self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")

    # (event, handler) pairs; they are observed in the order listed.
    observed = (
        (self.on.install, self._on_install),
        (self.on.config_changed, self._write_config_and_restart_slurmdbd),
        (
            self._db.on.database_available,
            self._write_config_and_restart_slurmdbd,
        ),
        (
            self._slurmdbd_peer.on.slurmdbd_peer_available,
            self._write_config_and_restart_slurmdbd,
        ),
        (
            self._slurmdbd.on.slurmdbd_available,
            self._write_config_and_restart_slurmdbd,
        ),
        (
            self._slurmdbd.on.slurmdbd_unavailable,
            self._on_slurm_configurator_unavailable,
        ),
        (
            self._slurmdbd.on.munge_key_available,
            self._on_munge_key_available,
        ),
    )
    for source, callback in observed:
        self.framework.observe(source, callback)
def __init__(self, *args):
    """Seed stored-state defaults, build the interfaces and observe events."""
    super().__init__(*args)

    # One set_default call per key, as in the original.
    self._stored.set_default(db_info={})
    self._stored.set_default(munge_key="")
    self._stored.set_default(slurm_installed=False)

    self._slurm_manager = SlurmManager(self, "slurmdbd")
    self._slurmdbd = SlurmdbdProvidesRelation(self, "slurmdbd")
    self._db = MySQLClient(self, "db")

    # (event, handler) pairs; they are observed in the order listed.
    observed = (
        (self.on.install, self._on_install),
        (self.on.config_changed, self._write_config_and_restart_slurmdbd),
        (
            self._db.on.database_available,
            self._write_config_and_restart_slurmdbd,
        ),
        (
            self._slurmdbd.on.munge_key_available,
            self._write_config_and_restart_slurmdbd,
        ),
        (
            self._slurmdbd.on.slurmctld_unavailable,
            self._on_slurmctld_unavailable,
        ),
        (self.on.upgrade_charm, self._on_upgrade),
    )
    for source, callback in observed:
        self.framework.observe(source, callback)
def __init__(self, *args):
    """Set up stored state, interfaces and the observed events."""
    super().__init__(*args)

    self._stored.set_default(
        slurm_installed=False,
        slurmrestd_restarted=False,
        cluster_name="",
    )

    self._slurm_manager = SlurmManager(self, "slurmrestd")
    self._slurmrestd = SlurmrestdRequires(self, "slurmrestd")
    self._fluentbit = FluentbitClient(self, "fluentbit")

    # (event, handler) pairs; they are observed in the order listed.
    observed = (
        (self.on.install, self._on_install),
        (self.on.upgrade_charm, self._on_upgrade),
        (self.on.update_status, self._on_update_status),
        (
            self._slurmrestd.on.config_available,
            self._on_check_status_and_write_config,
        ),
        (
            self._slurmrestd.on.config_unavailable,
            self._on_config_unavailable,
        ),
        (
            self._slurmrestd.on.munge_key_available,
            self._on_configure_munge_key,
        ),
        (
            self._slurmrestd.on.jwt_rsa_available,
            self._on_configure_jwt_rsa,
        ),
        (
            self._slurmrestd.on.restart_slurmrestd,
            self._on_restart_slurmrestd,
        ),
        # fluentbit
        (
            self.on["fluentbit"].relation_created,
            self._on_fluentbit_relation_created,
        ),
    )
    for source, callback in observed:
        self.framework.observe(source, callback)
def __init__(self, *args):
    """Set up stored state, interfaces and the observed events."""
    super().__init__(*args)

    self._stored.set_default(
        slurm_config={},
        slurm_installed=False,
        slurmctld_available=False,
    )

    self.slurm_manager = SlurmManager(self, "slurmrestd")
    self._slurmrestd = SlurmrestdRequires(self, "slurmrestd")

    # (event, handler) pairs; they are observed in the order listed.
    observed = (
        (self.on.install, self._on_install),
        (self.on.start, self._on_check_status_and_write_config),
        (self.on.upgrade_charm, self._on_upgrade),
        (
            self._slurmrestd.on.slurmctld_available,
            self._on_check_status_and_write_config,
        ),
        (
            self._slurmrestd.on.slurmctld_unavailable,
            self._on_check_status_and_write_config,
        ),
    )
    for source, callback in observed:
        self.framework.observe(source, callback)
def __init__(self, *args):
    """Seed stored-state defaults, build the interfaces and observe events."""
    super().__init__(*args)

    self._stored.set_default(
        user_node_state="",
        partition_name="",
        config_available=False,
    )

    self._nrpe = Nrpe(self, "nrpe-external-master")
    self._slurm_manager = SlurmManager(self, "slurmd")
    self._slurmd = Slurmd(self, "slurmd")
    self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

    # (event, handler) pairs; they are observed in the order listed.
    observed = (
        (self.on.install, self._on_install),
        (self.on.upgrade_charm, self._on_upgrade),
        (self.on.config_changed, self._on_send_slurmd_info),
        (
            self._slurmd_peer.on.slurmd_peer_available,
            self._on_send_slurmd_info,
        ),
        (
            self._slurmd.on.slurm_config_available,
            self._on_check_status_and_write_config,
        ),
        (self.on.set_node_state_action, self._on_set_node_state_action),
    )
    for source, callback in observed:
        self.framework.observe(source, callback)
def __init__(self, *args):
    """Seed stored-state defaults, build the interfaces and observe events."""
    super().__init__(*args)

    self._stored.set_default(munge_key="", slurmctld_controller_type="")

    self._nrpe = Nrpe(self, "nrpe-external-master")
    self._slurm_manager = SlurmManager(self, "slurmctld")
    self._slurmctld = Slurmctld(self, "slurmctld")
    self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")

    # (event, handler) pairs; they are observed in the order listed.
    observed = (
        (self.on.install, self._on_install),
        (
            self._slurmctld.on.slurm_config_available,
            self._on_check_status_and_write_config,
        ),
        (
            self._slurmctld_peer.on.slurmctld_peer_available,
            self._on_slurmctld_peer_available,
        ),
    )
    for source, callback in observed:
        self.framework.observe(source, callback)
def __init__(self, *args):
    """Build the interfaces, seed stored state and observe lifecycle events."""
    super().__init__(*args)

    self.config = self.model.config
    self.slurm_manager = SlurmManager(self, "slurmd")
    self.slurmd = SlurmdProvides(self, "slurmd")

    self._stored.set_default(
        slurm_installed=False,
        slurm_config_available=False,
        slurm_config={},
    )

    # (event, handler) pairs; they are observed in the order listed.
    observed = (
        (self.on.install, self._on_install),
        (self.on.config_changed, self._on_config_changed),
        (self.on.upgrade_charm, self._on_upgrade),
        (
            self.slurmd.on.slurmctld_available,
            self._on_render_config_and_restart,
        ),
        (
            self.slurmd.on.slurmctld_unavailable,
            self._on_render_config_and_restart,
        ),
    )
    for source, callback in observed:
        self.framework.observe(source, callback)
def __init__(self, *args):
    """Seed stored-state defaults, build the interfaces and observe events."""
    super().__init__(*args)

    self._stored.set_default(
        nhc_conf="",
        slurm_installed=False,
        slurmctld_available=False,
        slurmctld_started=False,
        cluster_name="",
    )

    self._slurm_manager = SlurmManager(self, "slurmd")
    self._fluentbit = FluentbitClient(self, "fluentbit")
    # interface to slurmctld, should only have one slurmctld per slurmd app
    self._slurmd = Slurmd(self, "slurmd")
    self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

    # (event, handler) pairs; they are observed in the order listed.
    observed = (
        (self.on.install, self._on_install),
        (self.on.upgrade_charm, self._on_upgrade),
        (self.on.update_status, self._on_update_status),
        (self.on.config_changed, self._on_config_changed),
        (self.on.slurmctld_started, self._on_slurmctld_started),
        (self.on.slurmd_start, self._on_slurmd_start),
        (self.on.check_etcd, self._on_check_etcd),
        (self._slurmd.on.slurmctld_available, self._on_slurmctld_available),
        (
            self._slurmd.on.slurmctld_unavailable,
            self._on_slurmctld_unavailable,
        ),
        # fluentbit
        (self.on["fluentbit"].relation_created, self._on_configure_fluentbit),
        # actions
        (self.on.version_action, self._on_version_action),
        (self.on.node_configured_action, self._on_node_configured_action),
        (
            self.on.get_node_inventory_action,
            self._on_get_node_inventory_action,
        ),
        (self.on.show_nhc_config_action, self._on_show_nhc_config),
        # infiniband actions
        (self.on.get_infiniband_repo_action, self.get_infiniband_repo),
        (self.on.set_infiniband_repo_action, self.set_infiniband_repo),
        (self.on.install_infiniband_action, self.install_infiniband),
        (self.on.uninstall_infiniband_action, self.uninstall_infiniband),
        (self.on.start_infiniband_action, self.start_infiniband),
        (self.on.enable_infiniband_action, self.enable_infiniband),
        (self.on.stop_infiniband_action, self.stop_infiniband),
        (self.on.is_active_infiniband_action, self.is_active_infiniband),
        # nvidia actions
        (self.on.nvidia_repo_action, self.nvidia_repo),
        (self.on.nvidia_package_action, self.nvidia_package),
        (self.on.nvidia_install_action, self.nvidia_install),
    )
    for source, callback in observed:
        self.framework.observe(source, callback)
def __init__(self, *args):
    """Seed stored-state defaults, build the interfaces and observe events."""
    super().__init__(*args)

    self._stored.set_default(
        db_info={},
        jwt_available=False,
        munge_available=False,
        slurm_installed=False,
        cluster_name="",
    )

    self._db = MySQLClient(self, "db")
    # self._nrpe = Nrpe(self, "nrpe-external-master")
    self._slurm_manager = SlurmManager(self, "slurmdbd")
    self._slurmdbd = Slurmdbd(self, "slurmdbd")
    self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")
    self._fluentbit = FluentbitClient(self, "fluentbit")

    # (event, handler) pairs; they are observed in the order listed.
    observed = (
        (self.on.install, self._on_install),
        (self.on.upgrade_charm, self._on_upgrade),
        (self.on.update_status, self._on_update_status),
        (self.on.config_changed, self._write_config_and_restart_slurmdbd),
        (self.on.jwt_available, self._on_jwt_available),
        (self.on.munge_available, self._on_munge_available),
        (self.on.write_config, self._write_config_and_restart_slurmdbd),
        (
            self._db.on.database_available,
            self._write_config_and_restart_slurmdbd,
        ),
        (self._db.on.database_unavailable, self._on_db_unavailable),
        (
            self._slurmdbd_peer.on.slurmdbd_peer_available,
            self._write_config_and_restart_slurmdbd,
        ),
        (
            self._slurmdbd.on.slurmctld_available,
            self._on_slurmctld_available,
        ),
        (
            self._slurmdbd.on.slurmctld_unavailable,
            self._on_slurmctld_unavailable,
        ),
        # fluentbit
        (
            self.on["fluentbit"].relation_created,
            self._on_fluentbit_relation_created,
        ),
    )
    for source, callback in observed:
        self.framework.observe(source, callback)
class SlurmLoginCharm(CharmBase):
    """Operator charm that installs slurm and renders its configuration.

    NOTE(review): the SlurmManager and the relation used here are both
    named "slurmrestd" -- confirm that is intended for a login charm.
    """

    _stored = StoredState()

    def __init__(self, *args):
        """Set up stored state, interfaces and the observed events."""
        super().__init__(*args)

        self._stored.set_default(slurm_installed=False, config_available=False)

        self.slurm_manager = SlurmManager(self, "slurmrestd")
        self._slurmrestd = SlurmrestdRequires(self, "slurmrestd")

        # (event, handler) pairs; they are observed in the order listed.
        observed = (
            (self.on.install, self._on_install),
            (self.on.upgrade_charm, self._on_upgrade),
            (
                self._slurmrestd.on.config_available,
                self._on_check_status_and_write_config,
            ),
            (
                self._slurmrestd.on.config_unavailable,
                self._on_check_status_and_write_config,
            ),
        )
        for source, callback in observed:
            self.framework.observe(source, callback)

    def _on_install(self, event):
        """Install slurm and record the installation in stored state."""
        self.slurm_manager.install()
        self.unit.status = ActiveStatus("slurm installed")
        self._stored.slurm_installed = True

    def _on_upgrade(self, event):
        """Delegate the charm upgrade to the slurm manager."""
        self.slurm_manager.upgrade()

    def _on_check_status_and_write_config(self, event):
        """Render the slurm config and restart once prerequisites are met.

        The event is deferred, with a blocked status, while either the
        configuration is unavailable or slurm is not installed.
        """
        installed = self._stored.slurm_installed
        have_config = self._stored.config_available
        logger.debug("##### inside check status and write config ######")

        # A missing configuration takes precedence over a missing install.
        if not have_config:
            self.unit.status = BlockedStatus(
                "NEED RELATION TO SLURM-CONFIGURATOR")
            event.defer()
            return
        if not installed:
            self.unit.status = BlockedStatus("SLURM NOT INSTALLED")
            event.defer()
            return

        logger.debug("##### STATUS CONFIRMED ######")
        rendered = dict(self._slurmrestd.get_slurm_config())
        logger.debug(rendered)
        self.slurm_manager.render_config_and_restart(rendered)
        self.unit.status = ActiveStatus("Slurmrestd Available")

    def set_config_available(self, boolean):
        """Set self._stored.config_available."""
        self._stored.config_available = boolean
class SlurmrestdCharm(CharmBase):
    """Operator charm responsible for lifecycle operations for slurmrestd."""

    _stored = StoredState()

    def __init__(self, *args):
        """Initialize charm and configure states and events to observe."""
        super().__init__(*args)

        # munge_key_available needs a default: _check_status() reads it and
        # can run (config_available / upgrade_charm) before
        # _on_configure_munge_key() has ever assigned it. Reading a stored
        # attribute that was never set raises AttributeError.
        self._stored.set_default(
            munge_key_available=False,
            slurm_installed=False,
            slurmrestd_restarted=False,
        )

        self._slurm_manager = SlurmManager(self, "slurmrestd")
        self._slurmrestd = SlurmrestdRequires(self, 'slurmrestd')

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self._slurmrestd.on.config_available:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.munge_key_available:
            self._on_configure_munge_key,
            self._slurmrestd.on.restart_slurmrestd:
            self._on_restart_slurmrestd,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm from the configured channel and record it."""
        self._slurm_manager.install(self.config["snapstore-channel"])
        self.unit.status = ActiveStatus("slurm installed")
        self._stored.slurm_installed = True

    def _on_upgrade(self, event):
        """Upgrade charm event handler."""
        # _check_status() returns None when prerequisites are missing; that
        # value is forwarded to the manager unchanged.
        slurm_config = self._check_status()
        snapstore_channel = self.config["snapstore-channel"]
        self._slurm_manager.upgrade(slurm_config, snapstore_channel)

    def _on_restart_slurmrestd(self, event):
        """Restart the slurmrestd component."""
        self._slurm_manager.restart_slurm_component()

    def _on_configure_munge_key(self, event):
        """Configure the munge key.

        1) Get the munge key from the stored state of the slurmrestd relation
        2) Write the munge key to the munge key path and chmod
        3) Restart munged
        4) Set munge_key_available in charm stored state

        The event is deferred until slurm has been installed.
        """
        if not self._stored.slurm_installed:
            event.defer()
            return

        munge_key = self._slurmrestd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _check_status(self):
        """Return the slurm config as a dict, or None if not ready.

        Sets a blocked status when the relation is not joined and a waiting
        status when the munge key or the slurm config is still missing.
        """
        slurm_config = self._slurmrestd.get_stored_slurm_config()
        munge_key_available = self._stored.munge_key_available
        slurm_configurator_joined = self._slurmrestd.is_joined

        # Check and see if we have what we need for operation.
        if not slurm_configurator_joined:
            self.unit.status = BlockedStatus(
                "Needed relations: slurm-configurator")
            return None
        if not (munge_key_available and slurm_config):
            self.unit.status = WaitingStatus("Waiting on: configuration")
            return None

        return dict(slurm_config)

    def _on_check_status_and_write_config(self, event):
        """Render the slurm configs, deferring until prerequisites are met."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.render_slurm_configs(slurm_config)

        # Only restart slurmrestd the first time the node is brought up.
        if not self._stored.slurmrestd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmrestd_restarted = True

        self.unit.status = ActiveStatus("slurmrestd available")
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        # slurm_installed needs a default: _check_status() reads it, and
        # reading a stored attribute that was never set raises
        # AttributeError.
        self._stored.set_default(
            munge_key=str(),
            slurm_installed=False,
            slurmctld_controller_type=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmctld")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")

        # NOTE(review): _on_upgrade is defined below but upgrade_charm is
        # not observed here -- confirm whether that is intentional.
        event_handler_bindings = {
            self.on.install: self._on_install,
            self._slurmctld.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self._slurmctld_peer.on.slurmctld_peer_available:
            self._on_slurmctld_peer_available,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm and record the installation in stored state."""
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Delegate the upgrade to the slurm manager."""
        self._slurm_manager.upgrade()

    def _on_slurmctld_peer_available(self, event):
        """Publish the slurmctld info on the app relation data (leader only).

        The leader defers the event until the slurmctld relation is joined
        and the peer relation yields slurmctld info; other units do nothing.
        """
        if not self.framework.model.unit.is_leader():
            return

        if self._slurmctld.is_joined:
            slurmctld_info = self._slurmctld_peer.get_slurmctld_info()
            if slurmctld_info:
                self._slurmctld.set_slurmctld_info_on_app_relation_data(
                    slurmctld_info)
                return

        # Not joined yet, or no info available: try again later.
        event.defer()

    def _on_check_status_and_write_config(self, event):
        """Render the config and restart once every prerequisite is present."""
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._slurmctld.get_slurm_config_from_relation()
        if not slurm_config:
            event.defer()
            return

        munge_key = self._stored.munge_key
        if not munge_key:
            event.defer()
            return

        self._slurm_manager.render_config_and_restart({
            **slurm_config,
            'munge_key': munge_key,
        })
        self.unit.status = ActiveStatus("Slurmctld Available")

    def _check_status(self):
        """Return True when munge key, slurm install and config are present.

        Otherwise set a blocked status naming the first missing item and
        return False.
        """
        munge_key = self._stored.munge_key
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmctld.get_slurm_config_from_relation()

        if munge_key and slurm_installed and slurm_config:
            return True

        if not munge_key:
            self.unit.status = BlockedStatus(
                "NEED RELATION TO SLURM CONFIGURATOR")
        elif not slurm_config:
            self.unit.status = BlockedStatus("WAITING ON SLURM CONFIG")
        else:
            self.unit.status = BlockedStatus("SLURM NOT INSTALLED")
        return False

    def set_munge_key(self, munge_key):
        """Set the munge_key in _stored state."""
        self._stored.munge_key = munge_key

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()
    on = SlurmdbdCharmEvents()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(
            db_info=dict(),
            jwt_available=False,
            munge_available=False,
            slurm_installed=False,
            cluster_name=str(),
        )

        self._db = MySQLClient(self, "db")
        # self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self.on.jwt_available: self._on_jwt_available,
            self.on.munge_available: self._on_munge_available,
            self.on.write_config: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._db.on.database_unavailable: self._on_db_unavailable,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmctld_available:
            self._on_slurmctld_available,
            self._slurmdbd.on.slurmctld_unavailable:
            self._on_slurmctld_unavailable,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_fluentbit_relation_created,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmdbd.

        On failure the unit is blocked and the event deferred.
        """
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmdbd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if not successful_installation:
            self.unit.status = BlockedStatus("Error installing slurmdbd")
            event.defer()
            return

        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurmdbd successfully installed")
        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        """Send the NHC and slurm fluentbit configs to the fluentbit client."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_jwt_available(self, event):
        """Retrieve and configure the jwt_rsa key."""
        # jwt rsa lives in slurm spool dir, it is created when slurm is
        # installed
        if not self._stored.slurm_installed:
            event.defer()
            return

        jwt_rsa = self._slurmdbd.get_jwt_rsa()
        self._slurm_manager.configure_jwt_rsa(jwt_rsa)
        self._stored.jwt_available = True

    def _on_munge_available(self, event):
        """Retrieve munge key and start munged."""
        # munge is installed together with slurm
        if not self._stored.slurm_installed:
            event.defer()
            return

        munge_key = self._slurmdbd.get_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)

        if self._slurm_manager.restart_munged():
            logger.debug("## Munge restarted succesfully")
            self._stored.munge_available = True
        else:
            logger.error("## Unable to restart munge")
            self.unit.status = BlockedStatus("Error restarting munge")
            event.defer()

    def _on_db_unavailable(self, event):
        """Forget the stored database info and refresh the unit status."""
        self._stored.db_info = dict()
        # TODO tell slurmctld that slurmdbd left?
        self._check_status()

    def _on_slurmctld_available(self, event):
        """Emit the jwt/munge/write-config events; reconfigure fluentbit."""
        self.on.jwt_available.emit()
        self.on.munge_available.emit()

        self.on.write_config.emit()
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

    def _on_slurmctld_unavailable(self, event):
        """Reset state and charm status when slurmctld broken."""
        self._stored.jwt_available = False
        self._stored.munge_available = False
        self._check_status()

    def _is_leader(self):
        """Return whether this unit is the leader."""
        return self.model.unit.is_leader()

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        # Ensure all pre-conditions are met with _check_status(), if not
        # defer the event.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()
        # settings from the config.yaml
        config = {"slurmdbd_debug": self.config.get("slurmdbd-debug")}

        slurmdbd_config = {
            **config,
            **slurmdbd_info,
            **db_info,
        }

        self._slurm_manager.slurm_systemctl("stop")
        self._slurm_manager.render_slurm_configs(slurmdbd_config)

        # At this point, we must guarantee that slurmdbd is correctly
        # initialized. Its startup might take a while, so we have to wait
        # for it.
        # NOTE(review): if slurmdbd fails to start, the BlockedStatus set
        # by _check_slurmdbd() is overwritten by the _check_status() call
        # at the end of this method -- confirm this is acceptable.
        self._check_slurmdbd()

        # Only the leader can set relation data on the application.
        # Enforce that no one other then the leader trys to set
        # application relation data.
        if self._is_leader():
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                slurmdbd_config,
            )

        self._check_status()

    def _check_slurmdbd(self, max_attemps=3) -> None:
        """Ensure slurmdbd is up and running.

        Restarts the component up to ``max_attemps`` times, sleeping
        ``1 + attempt`` seconds after each restart, then sets the unit
        status according to the final state.
        """
        logger.debug("## Checking if slurmdbd is active")

        for i in range(max_attemps):
            if self._slurm_manager.slurm_is_active():
                logger.debug("## Slurmdbd running")
                break

            logger.warning("## Slurmdbd not running, trying to start it")
            self.unit.status = WaitingStatus("Starting slurmdbd")
            self._slurm_manager.restart_slurm_component()
            sleep(1 + i)

        if self._slurm_manager.slurm_is_active():
            self._check_status()
        else:
            self.unit.status = BlockedStatus("Cannot start slurmdbd")

    def _check_status(self) -> bool:
        """Check that we have the things we need.

        Sets the unit status to describe the first unmet prerequisite and
        returns False, or sets an active status and returns True.
        """
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurm")
            return False

        # we must be sure to initialize the charms correctly. Slurmdbd must
        # first connect to the db to be able to connect to slurmctld
        # correctly
        slurmctld_available = (self._stored.jwt_available
                               and self._stored.munge_available)
        statuses = {
            "MySQL": {
                "available": self._stored.db_info != dict(),
                "joined": self._db.is_joined,
            },
            "slurmctld": {
                "available": slurmctld_available,
                "joined": self._slurmdbd.is_joined,
            },
        }

        relations_needed = [
            name for name, state in statuses.items() if not state["joined"]
        ]
        waiting_on = [
            name for name, state in statuses.items() if not state["available"]
        ]

        if relations_needed:
            msg = f"Need relations: {','.join(relations_needed)}"
            self.unit.status = BlockedStatus(msg)
            return False

        if waiting_on:
            msg = f"Waiting on: {','.join(waiting_on)}"
            self.unit.status = WaitingStatus(msg)
            return False

        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()
        if not slurmdbd_info:
            self.unit.status = WaitingStatus("slurmdbd starting")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = WaitingStatus("munged starting")
            return False

        self.unit.status = ActiveStatus("slurmdbd available")
        return True

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm Class."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set the defaults for slurmdbd."""
        super().__init__(*args)

        self._stored.set_default(db_info=dict())
        self._stored.set_default(munge_key=str())
        self._stored.set_default(slurm_installed=False)

        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = SlurmdbdProvidesRelation(self, "slurmdbd")
        self._db = MySQLClient(self, "db")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.munge_key_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmctld_unavailable:
            self._on_slurmctld_unavailable,
            self.on.upgrade_charm: self._on_upgrade,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm and record the installation in stored state."""
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event.

        Installs the snap fetched from the 'slurm' resource. A failing
        install is logged, not raised, so the hook itself still completes.
        """
        logger.debug('_on_upgrade(): entering')
        resource_path = str(self.model.resources.fetch('slurm'))
        exit_code = subprocess.call([
            "snap",
            "install",
            resource_path,
            "--dangerous",
            "--classic",
        ])
        if exit_code != 0:
            logger.error(
                "_on_upgrade(): snap install of %s failed with exit code %s",
                resource_path,
                exit_code,
            )

    def _on_slurmctld_unavailable(self, event):
        """Block the unit until slurmctld is related again."""
        self.unit.status = BlockedStatus("Need relation to slurmctld.")

    def _check_status(self) -> bool:
        """Check that we have the things we need.

        Returns True when db_info, munge_key and slurm_installed are all
        set; otherwise sets a blocked status for the first missing item and
        returns False.
        """
        db_info = self._stored.db_info
        munge_key = self._stored.munge_key
        slurm_installed = self._stored.slurm_installed

        if db_info and slurm_installed and munge_key:
            return True

        if not db_info:
            self.unit.status = BlockedStatus("Need relation to MySQL.")
        elif not munge_key:
            self.unit.status = BlockedStatus("Need relation to slurmctld.")
        else:
            # Only the installation is missing; report it so the unit does
            # not keep showing a stale status.
            self.unit.status = BlockedStatus("Slurm not installed.")
        return False

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        if not self._check_status():
            event.defer()
            return

        slurmdbd_host_port_addr = {
            'slurmdbd_hostname': socket.gethostname().split(".")[0],
            'slurmdbd_port': "6819",
        }

        # Later entries win on key collisions: charm config overrides the
        # host/port defaults and db_info overrides both.
        slurmdbd_config = {
            'munge_key': self._stored.munge_key,
            **slurmdbd_host_port_addr,
            **self.model.config,
            **self._stored.db_info,
        }

        self._slurm_manager.render_config_and_restart(slurmdbd_config)
        self._slurmdbd.set_slurmdbd_available_on_unit_relation_data()
        self.unit.status = ActiveStatus("Slurmdbd Available")

    def set_munge_key(self, munge_key):
        """Set the munge key in the stored state."""
        self._stored.munge_key = munge_key

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            jwt_key=str(),
            # get_jwt_rsa() and _on_install() use ``jwt_rsa``; without a
            # default, reading it on a unit that never generated the key
            # raises AttributeError.
            jwt_rsa=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmd_available=False,
            slurmrestd_available=False,
            slurmdbd_available=False,
            down_nodes=list(),
        )

        self._slurm_manager = SlurmManager(self, "slurmctld")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")

        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        self._user_group = UserGroupProvides(self, "user-group")

        self._etcd = EtcdOps()

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._on_write_slurm_config,
            self.on.leader_elected: self._on_leader_elected,
            # slurm component lifecycle events
            self._slurmdbd.on.slurmdbd_available:
                self._on_slurmdbd_available,
            self._slurmdbd.on.slurmdbd_unavailable:
                self._on_slurmdbd_unavailable,
            self._slurmd.on.slurmd_available: self._on_write_slurm_config,
            self._slurmd.on.slurmd_unavailable: self._on_write_slurm_config,
            self._slurmd.on.slurmd_departed: self._on_write_slurm_config,
            self._slurmrestd.on.slurmrestd_available:
                self._on_slurmrestd_available,
            self._slurmrestd.on.slurmrestd_unavailable:
                self._on_write_slurm_config,
            self._slurmctld_peer.on.slurmctld_peer_available:
                self._on_write_slurm_config,
            # NOTE: a second slurmctld should get the jwt/munge keys and
            #       configure them
            # fluentbit
            self.on["fluentbit"].relation_created:
                self._on_fluentbit_relation_created,
            # Addons lifecycle events
            self._prolog_epilog.on.prolog_epilog_available:
                self._on_write_slurm_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
                self._on_write_slurm_config,
            self._grafana.on.grafana_available: self._on_grafana_available,
            self._influxdb.on.influxdb_available:
                self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
                self._on_write_slurm_config,
            self._elasticsearch.on.elasticsearch_available:
                self._on_elasticsearch_available,
            self._elasticsearch.on.elasticsearch_unavailable:
                self._on_write_slurm_config,
            self._user_group.on.create_user_group:
                self._on_create_user_group,
            self._user_group.on.remove_user_group:
                self._on_remove_user_group,
            # actions
            self.on.show_current_config_action: self._on_show_current_config,
            self.on.drain_action: self._drain_nodes_action,
            self.on.resume_action: self._resume_nodes_action,
            self.on.influxdb_info_action: self._infludb_info_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    @property
    def hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    @property
    def port(self):
        """Return the port."""
        return self._slurm_manager.port

    @property
    def cluster_name(self) -> str:
        """Return the cluster name."""
        return self.config.get("cluster-name")

    @property
    def _slurmctld_info(self):
        """Return slurmctld info from the peer relation."""
        return self._slurmctld_peer.get_slurmctld_info()

    @property
    def slurmdbd_info(self):
        """Return slurmdbd_info from relation."""
        return self._slurmdbd.get_slurmdbd_info()

    @property
    def _slurmd_info(self) -> list:
        """Return slurmd info from the slurmd relation."""
        return self._slurmd.get_slurmd_info()

    @property
    def _cluster_info(self):
        """Assemble information about the cluster."""
        cluster_info = {
            'cluster_name': self.config.get('cluster-name'),
            'custom_config': self.config.get('custom-config'),
            'proctrack_type': self.config.get('proctrack-type'),
            'cgroup_config': self.config.get('cgroup-config'),
        }

        interval = self.config.get('health-check-interval')
        state = self.config.get('health-check-state')
        nhc = self._slurm_manager.slurm_config_nhc_values(interval, state)
        cluster_info.update(nhc)

        return cluster_info

    @property
    def _addons_info(self):
        """Assemble addons for slurm.conf."""
        return {
            **self._assemble_prolog_epilog(),
            **self._assemble_acct_gather_addon(),
            **self._assemble_elastic_search_addon(),
        }

    def _assemble_prolog_epilog(self) -> dict:
        """Generate the prolog_epilog section of the addons."""
        logger.debug("## Generating prolog epilog configuration")

        prolog_epilog = self._prolog_epilog.get_prolog_epilog()
        if prolog_epilog:
            return {"prolog_epilog": prolog_epilog}
        return {}

    def _assemble_acct_gather_addon(self):
        """Generate the acct gather section of the addons."""
        logger.debug("## Generating acct gather configuration")

        addons = dict()

        influxdb_info = self._get_influxdb_info()
        if influxdb_info:
            addons["acct_gather"] = influxdb_info
            addons["acct_gather"]["default"] = "all"
            addons["acct_gather_profile"] = "acct_gather_profile/influxdb"

        # it is possible to setup influxdb or hdf5 profiles without the
        # relation, using the custom-config section of slurm.conf. We need to
        # support setting up the acct_gather configuration for this scenario
        acct_gather_custom = self.config.get("acct-gather-custom")
        if acct_gather_custom:
            if not addons.get("acct_gather"):
                addons["acct_gather"] = dict()

            addons["acct_gather"]["custom"] = acct_gather_custom

        addons["acct_gather_frequency"] = self.config.get(
            "acct-gather-frequency")

        return addons

    def _assemble_elastic_search_addon(self):
        """Generate the elasticsearch section of the addons."""
        logger.debug("## Generating elastic search addon configuration")
        addon = dict()

        elasticsearch_ingress = self._elasticsearch.elasticsearch_ingress
        if elasticsearch_ingress:
            suffix = f"/{self.cluster_name}/jobcomp"
            addon = {
                "elasticsearch_address": f"{elasticsearch_ingress}{suffix}"
            }

        return addon

    def set_slurmd_available(self, flag: bool):
        """Set stored value of slurmd available."""
        self._stored.slurmd_available = flag

    def _set_slurmdbd_available(self, flag: bool):
        """Set stored value of slurmdbd available."""
        self._stored.slurmdbd_available = flag

    def set_slurmrestd_available(self, flag: bool):
        """Set stored value of slurmrestd available."""
        self._stored.slurmrestd_available = flag

    def _is_leader(self):
        """Return whether this unit is the leader."""
        return self.model.unit.is_leader()

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def _on_show_current_config(self, event):
        """Show current slurm.conf."""
        slurm_conf = self._slurm_manager.get_slurm_conf()
        event.set_results({"slurm.conf": slurm_conf})

    def _on_install(self, event):
        """Perform installation operations for slurmctld."""
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmctld")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if successful_installation:
            self._stored.slurm_installed = True

            # Store the munge_key and jwt_rsa key in the stored state.
            # NOTE: Use leadership settings instead of stored state when
            # leadership settings support becomes available in the framework.
            if self._is_leader():
                # NOTE the backup controller should also have the jwt and
                #      munge keys configured. We should move these
                #      information to the peer relation.
                self._stored.jwt_rsa = self._slurm_manager.generate_jwt_rsa()
                self._stored.munge_key = self._slurm_manager.get_munge_key()
                self._slurm_manager.configure_jwt_rsa(self.get_jwt_rsa())
            else:
                # NOTE: the secondary slurmctld should get the jwt and munge
                #       keys from the peer relation here
                logger.debug("secondary slurmctld")

            # all slurmctld should restart munged here, as it would assure
            # munge is working
            self._slurm_manager.restart_munged()
        else:
            self.unit.status = BlockedStatus("Error installing slurmctld")
            event.defer()

        logger.debug("## Retrieving etcd resource to install it")
        try:
            etcd_path = self.model.resources.fetch("etcd")
            logger.debug(f"## Found etcd resource: {etcd_path}")
        except ModelError:
            logger.error("## Missing etcd resource")
            self.unit.status = BlockedStatus("Missing etcd resource")
            event.defer()
            return
        self._etcd.install(etcd_path)

        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
        """Start etcd and publish the current node list to it."""
        logger.debug("## slurmctld - leader elected")
        self._etcd.start()

        # populate etcd with the nodelist
        slurm_config = self._assemble_slurm_config()
        accounted_nodes = self._assemble_all_nodes(
            slurm_config.get("partitions", []))
        logger.debug(
            f"## Sending to etcd list of accounted nodes: {accounted_nodes}")
        self._etcd.set_list_of_accounted_nodes(accounted_nodes)

    def _check_status(self):
        """Check for all relations and set appropriate status.

        This charm needs these conditions to be satisfied in order to be
        ready:
        - Slurm components installed.
        - Munge running.
        - slurmdbd node running.
        - slurmd inventory.

        Returns True when the unit is ready, and False otherwise after
        setting a blocked or waiting status.
        """
        # NOTE: slurmd and slurmrestd are not needed for slurmctld to work,
        #       only for the cluster to operate. But we need slurmd inventory
        #       to assemble slurm.conf

        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmctld")
            return False

        if self._is_leader() and not self._etcd.is_active():
            self.unit.status = WaitingStatus("Initializing charm")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = BlockedStatus("Error configuring munge key")
            return False

        # statuses of mandatory components:
        # - joined: someone executed juju relate slurmctld foo
        # - available: the units exchanged data through the relation
        # NOTE: slurmrestd is not mandatory for the cluster to work, that's
        #       why it is not accounted for in here
        statuses = {
            "slurmd": {
                "available": self._stored.slurmd_available,
                "joined": self._slurmd.is_joined,
            },
            "slurmdbd": {
                "available": self._stored.slurmdbd_available,
                "joined": self._slurmdbd.is_joined,
            },
        }

        relations_needed = [
            name for name, state in statuses.items() if not state["joined"]
        ]
        waiting_on = [
            name for name, state in statuses.items() if not state["available"]
        ]

        if relations_needed:
            msg = f"Need relations: {','.join(relations_needed)}"
            self.unit.status = BlockedStatus(msg)
            return False

        if waiting_on:
            msg = f"Waiting on: {','.join(waiting_on)}"
            self.unit.status = WaitingStatus(msg)
            return False

        self.unit.status = ActiveStatus("slurmctld available")
        return True

    def get_munge_key(self):
        """Get the stored munge key."""
        return self._stored.munge_key

    def get_jwt_rsa(self):
        """Get the stored jwt_rsa key."""
        return self._stored.jwt_rsa

    def _assemble_partitions(self, slurmd_info):
        """Make any needed modifications to partition data.

        Returns a new list of deep-copied partitions in the same order as
        slurmd_info. The partition whose name matches the
        ``default-partition`` charm config, if any, gets
        ``partition_default`` set to "YES". The input is not modified.
        """
        default_partition_from_config = self.config.get("default-partition")

        partitions = []
        for partition in slurmd_info:
            # Work on a copy so the caller's data is left untouched.
            partition_tmp = copy.deepcopy(partition)

            if (default_partition_from_config
                    and default_partition_from_config
                    == partition["partition_name"]):
                partition_tmp["partition_default"] = "YES"

            partitions.append(partition_tmp)

        return partitions

    def _assemble_slurm_config(self):
        """Assemble and return the slurm config.

        Returns an empty dict when slurmctld, slurmd or slurmdbd info is
        missing.
        """
        logger.debug('## Assembling new slurm.conf')

        slurmctld_info = self._slurmctld_info
        slurmdbd_info = self.slurmdbd_info
        slurmd_info = self._slurmd_info
        cluster_info = self._cluster_info

        logger.debug("######## INFO")
        logger.debug(f'## slurmd: {slurmd_info}')
        logger.debug(f'## slurmctld_info: {slurmctld_info}')
        logger.debug(f'## slurmdbd_info: {slurmdbd_info}')
        logger.debug(f'## cluster_info: {cluster_info}')
        logger.debug("######## INFO - end")

        if not (slurmctld_info and slurmd_info and slurmdbd_info):
            return {}

        addons_info = self._addons_info
        partitions_info = self._assemble_partitions(slurmd_info)
        down_nodes = self._assemble_down_nodes(slurmd_info)

        logger.debug(f'#### addons: {addons_info}')
        logger.debug(f'#### partitions_info: {partitions_info}')
        logger.debug(f"#### Down nodes: {down_nodes}")

        return {
            "partitions": partitions_info,
            "down_nodes": down_nodes,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **cluster_info,
        }

    def _on_slurmrestd_available(self, event):
        """Set slurm_config on the relation when slurmrestd available."""
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config,
            )
            self._slurmrestd.restart_slurmrestd()

    def _on_slurmdbd_available(self, event):
        """Mark slurmdbd as available and rewrite the slurm config."""
        self._set_slurmdbd_available(True)
        self._on_write_slurm_config(event)

    def _on_slurmdbd_unavailable(self, event):
        """Mark slurmdbd as unavailable and refresh the unit status."""
        self._set_slurmdbd_available(False)
        self._check_status()

    def _on_write_slurm_config(self, event):
        """Check that we have what we need before we proceed."""
        logger.debug("### Slurmctld - _on_write_slurm_config()")

        # only the leader should write the config, restart, and scontrol
        # reconf
        if not self._is_leader():
            return

        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()
        if slurm_config:
            self._slurm_manager.render_slurm_configs(slurm_config)

            # restart is needed if nodes are added/removed from the cluster
            self._slurm_manager.slurm_systemctl('restart')
            self._slurm_manager.slurm_cmd('scontrol', 'reconfigure')

            # send the list of hostnames to slurmd via etcd
            accounted_nodes = self._assemble_all_nodes(
                slurm_config["partitions"])
            self._etcd.set_list_of_accounted_nodes(accounted_nodes)

            # send the custom NHC parameters to all slurmd
            self._slurmd.set_nhc_params(
                self.config.get('health-check-params'))

            # check for "not new anymore" nodes, i.e., nodes that ran the
            # node-configured action. Those nodes are no longer in the
            # DownNodes section of slurm.conf, but we need to resume them
            # manually and update the internal cache
            down_nodes = slurm_config['down_nodes']
            configured_nodes = self._assemble_configured_nodes(down_nodes)
            logger.debug(f"### configured nodes: {configured_nodes}")
            self._resume_nodes(configured_nodes)
            self._stored.down_nodes = down_nodes.copy()

            # slurmrestd needs the slurm.conf file, so send it every time it
            # changes
            if self._stored.slurmrestd_available:
                self._slurmrestd.set_slurm_config_on_app_relation_data(
                    slurm_config)
                # NOTE: scontrol reconfigure does not restart slurmrestd
                self._slurmrestd.restart_slurmrestd()
        else:
            logger.debug("## Should rewrite slurm.conf, but we don't have it. "
                         "Deferring.")
            event.defer()

    @staticmethod
    def _assemble_all_nodes(slurmd_info: list) -> List[str]:
        """Parse slurmd_info and return a list with all hostnames."""
        return [
            node["node_name"]
            for partition in slurmd_info
            for node in partition["inventory"]
        ]

    @staticmethod
    def _assemble_down_nodes(slurmd_info):
        """Parse partitions' nodes and assemble a list of DownNodes."""
        return [
            node["node_name"]
            for partition in slurmd_info
            for node in partition["inventory"]
            if node["new_node"]
        ]

    def _assemble_configured_nodes(self, down_nodes):
        """Assemble list of nodes that are not new anymore.

        new_node status is removed with an action, this method returns a list
        of nodes that were previously new but are not anymore.
        """
        return [
            node for node in self._stored.down_nodes
            if node not in down_nodes
        ]

    def _resume_nodes(self, nodelist):
        """Run scontrol to resume the specified node list.

        Does nothing for an empty list, which would otherwise produce an
        ``scontrol update`` with an empty ``nodename=``.
        """
        if not nodelist:
            return

        nodes = ",".join(nodelist)
        update_cmd = f"update nodename={nodes} state=resume"
        self._slurm_manager.slurm_cmd('scontrol', update_cmd)

    def _on_grafana_available(self, event):
        """Create the grafana-source if we are the leader and have influxdb."""
        if not self._is_leader():
            return

        influxdb_info = self._get_influxdb_info()

        if influxdb_info:
            self._grafana.set_grafana_source_info(influxdb_info)
        else:
            logger.error(
                "## Can not set Grafana source: missing influxdb relation")

    def _on_influxdb_available(self, event):
        """Assemble addons to forward slurm data to influxdb."""
        self._on_write_slurm_config(event)

    def _on_elasticsearch_available(self, event):
        """Assemble addons to forward Slurm data to elasticsearch."""
        self._on_write_slurm_config(event)

    def _get_influxdb_info(self) -> dict:
        """Return influxdb info."""
        return self._influxdb.get_influxdb_info()

    def _drain_nodes_action(self, event):
        """Drain specified nodes."""
        nodes = event.params['nodename']
        reason = event.params['reason']

        logger.debug(f'#### Draining {nodes} because {reason}.')
        event.log(f'Draining {nodes} because {reason}.')

        try:
            # Pass an argument list rather than a shell-style string, so
            # quotes or spaces in the action parameters cannot change how
            # the command line is split.
            subprocess.check_output([
                "scontrol",
                "update",
                f"nodename={nodes}",
                "state=drain",
                f"reason={reason}",
            ])
            event.set_results({'status': 'draining', 'nodes': nodes})
        except subprocess.CalledProcessError as e:
            event.fail(message=f'Error draining {nodes}: {e.output}')

    def _resume_nodes_action(self, event):
        """Resume specified nodes."""
        nodes = event.params['nodename']

        logger.debug(f'#### Resuming {nodes}.')
        event.log(f'Resuming {nodes}.')

        try:
            # Argument list for the same reason as in _drain_nodes_action.
            subprocess.check_output([
                "scontrol",
                "update",
                f"nodename={nodes}",
                "state=resume",
            ])
            event.set_results({'status': 'resuming', 'nodes': nodes})
        except subprocess.CalledProcessError as e:
            event.fail(message=f'Error resuming {nodes}: {e.output}')

    def _infludb_info_action(self, event):
        """Report the influxdb info, or "not related" when there is none."""
        influxdb_info = self._get_influxdb_info()

        if not influxdb_info:
            influxdb_info = "not related"
        logger.debug(f"## InfluxDB-info action: {influxdb_info}")
        event.set_results({"influxdb": influxdb_info})

    def _on_create_user_group(self, event):
        """Create the user and group provided.

        The outcome is written to the ``status`` key of the user-group
        relation application data.
        """
        user = self._user_group.user_name
        user_uid = self._user_group.user_uid
        group = self._user_group.group_name
        app_data = self._user_group._relation.data[self._user_group.model.app]

        # Create the group.
        try:
            # use the UID as the GID
            subprocess.check_output(["groupadd", "--gid", user_uid, group])
        except subprocess.CalledProcessError as e:
            # elif chain: an "already exists" result must not also be
            # reported as an error.
            if e.returncode == 9:
                logger.warning("## Group already exists.")
            elif e.returncode == 4:
                logger.warning("## GID already exists.")
                app_data["status"] = "failure: GID already exists"
                return
            else:
                logger.error(f"## Error creating group: {e}")

        # Create the user.
        try:
            subprocess.check_output([
                "useradd",
                "--system",
                "--no-create-home",
                "--gid",
                group,
                "--shell",
                "/usr/sbin/nologin",
                "-u",
                user_uid,
                user,
            ])
        except subprocess.CalledProcessError as e:
            if e.returncode == 9:
                logger.warning("## User already exists.")
            elif e.returncode == 4:
                logger.warning("## UID already exists.")
                app_data["status"] = "failure: UID already exists"
                return
            else:
                logger.error(f"## Error creating user: {e}")

        app_data["status"] = "success: User created"

    def _on_remove_user_group(self, event):
        """Remove the user and group provided."""
        user = self._user_group.user_name
        group = self._user_group.group_name

        # Remove the user.
        try:
            subprocess.check_output(["userdel", user])
        except subprocess.CalledProcessError as e:
            logger.error(f"## Error deleting user: {e}")

        # Remove the group.
        try:
            subprocess.check_output(["groupdel", group])
        except subprocess.CalledProcessError as e:
            logger.error(f"## Error deleting group: {e}")
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(
            db_info=dict(),
            slurmdbd_config=dict(),
            munge_key_available=False,
            slurm_installed=False,
        )

        self._db = MySQLClient(self, "db")
        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")

        # NOTE(review): _on_upgrade is defined below but is not observed
        # here - confirm whether that is intentional.
        write_config = self._write_config_and_restart_slurmdbd
        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed: write_config,
            self._db.on.database_available: write_config,
            self._slurmdbd_peer.on.slurmdbd_peer_available: write_config,
            self._slurmdbd.on.slurmdbd_available: write_config,
            self._slurmdbd.on.slurmdbd_unavailable:
                self._on_slurm_configurator_unavailable,
            self._slurmdbd.on.munge_key_available:
                self._on_munge_key_available,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm from the configured channel and flag it installed."""
        channel = self.config["snapstore-channel"]
        self._slurm_manager.install(channel)
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm snap successfully installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event."""
        self._slurm_manager.upgrade()

    def _on_munge_key_available(self, event):
        """Configure the munge key, deferring until slurm is installed."""
        if not self._stored.slurm_installed:
            event.defer()
            return

        self._slurm_manager.configure_munge_key(
            self._slurmdbd.get_munge_key()
        )
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_slurm_configurator_unavailable(self, event):
        """Clear the munge key flag and refresh the unit status."""
        self._stored.munge_key_available = False
        self._check_status()

    def _check_status(self) -> bool:
        """Check that we have the things we need.

        Returns True when the peer slurmdbd info, db_info, the installed
        flag and the munge key flag are all truthy. Otherwise returns
        False, setting a blocked status when db_info or the munge key is
        the missing piece.
        """
        have_db = self._stored.db_info
        have_munge = self._stored.munge_key_available
        installed = self._stored.slurm_installed
        peer_info = self._slurmdbd_peer.get_slurmdbd_info()

        if peer_info and have_db and installed and have_munge:
            return True

        if not have_db:
            self.unit.status = BlockedStatus(
                "Need relation to MySQL."
            )
        elif not have_munge:
            self.unit.status = BlockedStatus(
                "Need relation to slurm-configurator."
            )
        return False

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        # Defer the event until every pre-condition is satisfied.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        peer_info = self._slurmdbd_peer.get_slurmdbd_info()
        previous_config = dict(self._stored.slurmdbd_config)

        # Later sources win on key collisions: charm config first, then
        # peer info, then db_info.
        new_config = dict(self.config)
        new_config.update(peer_info)
        new_config.update(db_info)

        if new_config != previous_config:
            self._stored.slurmdbd_config = new_config
            self._slurm_manager.render_slurm_configs(new_config)
            self._slurm_manager.restart_slurm_component()

            # Application relation data may only be written by the leader,
            # so every other unit skips this step.
            if self.model.unit.is_leader():
                self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                    new_config,
                )

        self.unit.status = ActiveStatus("slurmdbd available")

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key_available=False,
            slurmd_restarted=False,
            # Read by _on_write_munge_key() and _check_status(); needs a
            # default so those reads cannot fail before _on_install() has
            # set it.
            slurm_installed=False,
            user_node_state=str(),
            partition_name=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.start: self._on_check_status_and_write_config,
            self.on.config_changed: self._on_config_changed,
            self._slurmd_peer.on.slurmd_peer_available:
                self._on_set_partition_info_on_app_relation_data,
            self._slurmd_peer.on.slurmd_peer_departed:
                self._on_set_partition_info_on_app_relation_data,
            self._slurmd.on.slurm_config_available:
                self._on_check_status_and_write_config,
            self._slurmd.on.slurm_config_unavailable:
                self._on_check_status_and_write_config,
            self._slurmd.on.restart_slurmd: self._on_restart_slurmd,
            self._slurmd.on.munge_key_available: self._on_write_munge_key,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm; the leader also settles the partition name."""
        self._slurm_manager.install(self.config["snapstore-channel"])

        if self.model.unit.is_leader():
            self._get_set_partition_name()
            logger.debug(f"PARTITION_NAME: {self._stored.partition_name}")
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm installed")

    def _on_upgrade(self, event):
        """Upgrade slurm once the slurm config is available."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.upgrade(
            slurm_config,
            self.config["snapstore-channel"]
        )

    def _on_config_changed(self, event):
        """Refresh the partition name and republish the partition info."""
        if self.model.unit.is_leader():
            self._get_set_partition_name()
            if self._check_status():
                self._on_set_partition_info_on_app_relation_data(
                    event
                )

    def _on_write_munge_key(self, event):
        """Configure the munge key, deferring until slurm is installed."""
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_check_status_and_write_config(self, event):
        """Render the slurm config once every prerequisite is met."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        # Ensure we aren't dealing with a StoredDict before trying
        # to render the slurm.conf.
        slurm_config = dict(slurm_config)
        self._slurm_manager.render_slurm_configs(slurm_config)

        # Only restart slurmd the first time the node is brought up.
        if not self._stored.slurmd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmd_restarted = True

        self.unit.status = ActiveStatus("slurmd available")

    def _on_restart_slurmd(self, event):
        """Restart the slurm component."""
        self._slurm_manager.restart_slurm_component()

    def _check_status(self):
        """Return the slurm config as a dict, or None when not ready.

        Sets a blocked status when the slurmd relation is not joined, and
        a waiting status when the munge key, the slurm config or the
        installation is still missing.
        """
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmd.get_stored_slurm_config()

        slurmd_joined = self._slurmd.is_joined

        if not slurmd_joined:
            self.unit.status = BlockedStatus(
                "Needed relations: slurm-configurator"
            )
            return None

        if not (munge_key_available and slurm_config and slurm_installed):
            self.unit.status = WaitingStatus(
                "Waiting on: configuration"
            )
            return None

        return dict(slurm_config)

    def _on_set_node_state_action(self, event):
        """Set the node state.

        The requested state is always stored. Action events cannot be
        deferred, so when the leader cannot publish the partition info
        right away this is reported in the action log instead.
        """
        self._stored.user_node_state = event.params["node-state"]

        if not self.framework.model.unit.is_leader():
            return

        if not self._set_partition_info():
            event.log(
                "Node state stored; partition info not published because "
                "the slurmd relation is not joined."
            )

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if not self.framework.model.unit.is_leader():
            return

        if not self._set_partition_info():
            event.defer()

    def _set_partition_info(self) -> bool:
        """Publish the partition info; return True when it was written."""
        # This shouldn't be reached if the relation isn't made, but check
        # here just in case.
        if not self._slurmd.is_joined:
            return False

        partition = self._assemble_partition()
        if not partition:
            return False

        self._slurmd.set_partition_info_on_app_relation_data(partition)
        return True

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self._stored.partition_name
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")

        slurmd_inventory = self._assemble_slurmd_inventory()

        return {
            "inventory": slurmd_inventory,
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    def _assemble_slurmd_inventory(self):
        """Apply mutations to nodes in the partition, return slurmd nodes.

        Returns None when the peer relation has no inventory.
        """
        slurmd_inventory = self._slurmd_peer.get_slurmd_inventory()
        if not slurmd_inventory:
            return None

        # If the user has set custom state for nodes
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if user_node_state:
            node_states = self._parse_node_states(user_node_state)

            # Iterate over a copy of the inventory while the original is
            # being modified.
            slurmd_inventory_tmp = copy.deepcopy(slurmd_inventory)

            # Iterate over the slurmd nodes in the partition and check
            # for nodes that need their state modified.
            for partition in slurmd_inventory_tmp:
                partition_tmp = copy.deepcopy(partition)
                for slurmd_node in partition["inventory"]:
                    hostname = slurmd_node["hostname"]
                    if hostname in node_states:
                        slurmd_node_tmp = copy.deepcopy(slurmd_node)
                        slurmd_node_tmp["state"] = node_states[hostname]
                        partition_tmp["inventory"].remove(slurmd_node)
                        partition_tmp["inventory"].append(slurmd_node_tmp)
                slurmd_inventory.remove(partition)
                slurmd_inventory.append(partition_tmp)

        return slurmd_inventory

    @staticmethod
    def _parse_node_states(user_node_state):
        """Parse "host=state,host=state" into a dict of hostname to state.

        Entries without an "=" are skipped with a warning.
        """
        node_states = {}
        for item in user_node_state.split(","):
            parts = item.split("=")
            if len(parts) < 2:
                logger.warning(f"Ignoring malformed node-state entry: {item}")
                continue
            node_states[parts[0]] = parts[1]
        return node_states

    def _get_set_partition_name(self):
        """Set the partition name."""
        # Determine if a partition-name config exists, if so
        # ensure the self._stored.partition_name is consistent with the
        # supplied config.
        # If no partition name has been specified then generate one.
        partition_name = self.config.get("partition-name")
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"
        return

    def get_partition_name(self):
        """Return the partition_name."""
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key_available=False,
            # slurm_installed is read by _on_write_munge_key and
            # _check_status, so it needs a default; otherwise the read
            # fails if no install handler has stored it yet.
            slurm_installed=False,
            slurmctld_controller_type=str(),
        )
        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmctld")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")

        # NOTE(review): _on_upgrade is defined below but is not bound to
        # self.on.upgrade_charm here - confirm whether that is intended.
        event_handler_bindings = {
            self.on.install: self._on_install,
            self._slurmctld.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self._slurmctld.on.scontrol_reconfigure:
            self._on_scontrol_reconfigure,
            self._slurmctld.on.restart_slurmctld:
            self._on_restart_slurmctld,
            self._slurmctld.on.munge_key_available:
            self._on_write_munge_key,
            self._slurmctld_peer.on.slurmctld_peer_available:
            self._on_slurmctld_peer_available,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm from the configured channel and record it."""
        self._slurm_manager.install(self.config["snapstore-channel"])
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm snap successfully installed")

    def _on_upgrade(self, event):
        """Upgrade slurm using the current slurm config, if available.

        _check_status() returns None while prerequisites are missing;
        in that case the event is deferred instead of crashing on
        dict(None).
        """
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        snapstore_channel = self.config["snapstore-channel"]
        self._slurm_manager.upgrade(dict(slurm_config), snapstore_channel)

    def _on_write_munge_key(self, event):
        """Write the stored munge key and restart munged.

        Deferred until slurm has been installed.
        """
        if not self._stored.slurm_installed:
            event.defer()
            return

        munge_key = self._slurmctld.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_slurmctld_peer_available(self, event):
        """Publish slurmctld info on the app relation data (leader only).

        The event is deferred when the leader cannot publish yet, i.e.
        the slurmctld relation is not joined or there is no info.
        """
        if self.framework.model.unit.is_leader():
            if self._slurmctld.is_joined:
                slurmctld_info = self._slurmctld_peer.get_slurmctld_info()
                if slurmctld_info:
                    self._slurmctld.set_slurmctld_info_on_app_relation_data(
                        slurmctld_info
                    )
                    return
            event.defer()
            return

    def _on_check_status_and_write_config(self, event):
        """Render the slurm configs once all prerequisites are met."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.render_slurm_configs(dict(slurm_config))
        self.unit.status = ActiveStatus("slurmctld available")

    def _on_restart_slurmctld(self, event):
        """Restart the slurm component managed by this charm."""
        self._slurm_manager.restart_slurm_component()

    def _on_scontrol_reconfigure(self, event):
        """Run `scontrol reconfigure`."""
        self._slurm_manager.slurm_cmd("scontrol", "reconfigure")

    def _check_status(self):
        """Return the stored slurm config, or None if not ready.

        Sets BlockedStatus when the slurmctld relation is not joined and
        WaitingStatus when the munge key, the slurm installation or the
        slurm config is still missing.
        """
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmctld.get_stored_slurm_config()

        slurmctld_joined = self._slurmctld.is_joined

        if not slurmctld_joined:
            self.unit.status = BlockedStatus(
                "Relations needed: slurm-configurator"
            )
            return None

        elif not (munge_key_available and slurm_installed and slurm_config):
            self.unit.status = WaitingStatus(
                "Waiting on: configuration"
            )
            return None

        return slurm_config

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()
    on = SlurmdCharmEvents()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(nhc_conf=str(),
                                 slurm_installed=False,
                                 slurmctld_available=False,
                                 slurmctld_started=False,
                                 cluster_name=str())

        self._slurm_manager = SlurmManager(self, "slurmd")
        self._fluentbit = FluentbitClient(self, "fluentbit")
        # interface to slurmctld, should only have one slurmctld per slurmd app
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._on_config_changed,
            self.on.slurmctld_started: self._on_slurmctld_started,
            self.on.slurmd_start: self._on_slurmd_start,
            self.on.check_etcd: self._on_check_etcd,
            self._slurmd.on.slurmctld_available:
            self._on_slurmctld_available,
            self._slurmd.on.slurmctld_unavailable:
            self._on_slurmctld_unavailable,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_configure_fluentbit,
            # actions
            self.on.version_action: self._on_version_action,
            self.on.node_configured_action: self._on_node_configured_action,
            self.on.get_node_inventory_action:
            self._on_get_node_inventory_action,
            self.on.show_nhc_config_action: self._on_show_nhc_config,
            # infiniband actions
            self.on.get_infiniband_repo_action: self.get_infiniband_repo,
            self.on.set_infiniband_repo_action: self.set_infiniband_repo,
            self.on.install_infiniband_action: self.install_infiniband,
            self.on.uninstall_infiniband_action: self.uninstall_infiniband,
            self.on.start_infiniband_action: self.start_infiniband,
            self.on.enable_infiniband_action: self.enable_infiniband,
            self.on.stop_infiniband_action: self.stop_infiniband,
            self.on.is_active_infiniband_action: self.is_active_infiniband,
            # nvidia actions
            self.on.nvidia_repo_action: self.nvidia_repo,
            self.on.nvidia_package_action: self.nvidia_package,
            self.on.nvidia_install_action: self.nvidia_install,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmd."""
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)
        logger.debug(f"### slurmd installed: {successful_installation}")

        if successful_installation:
            self._stored.slurm_installed = True
        else:
            self.unit.status = BlockedStatus("Error installing slurmd")
            event.defer()

        self._check_status()

    def _on_configure_fluentbit(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        """Send the NHC and slurm fluentbit configs to the relation."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _check_status(self) -> bool:
        """Check if we have all needed components and set the unit status.

        Checked in order; the first failing item decides the status:

        - machine does not need a reboot
        - partition name
        - slurm installed
        - slurmctld relation joined and slurmctld available
        - munge key configured and working
        - slurmctld started

        Returns True (and sets ActiveStatus) only if all checks pass.
        """
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self.get_partition_name():
            self.unit.status = WaitingStatus("Waiting on charm configuration")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmd")
            return False

        if not self._slurmd.is_joined:
            self.unit.status = BlockedStatus("Need relations: slurmctld")
            return False

        if not self._stored.slurmctld_available:
            self.unit.status = WaitingStatus("Waiting on: slurmctld")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = BlockedStatus("Error configuring munge key")
            return False

        if not self._stored.slurmctld_started:
            self.unit.status = WaitingStatus("Waiting slurmctld to start")
            return False

        self.unit.status = ActiveStatus("slurmd available")
        return True

    def ensure_slurmd_starts(self, max_attemps=10) -> bool:
        """Ensure slurmd is up and running.

        Stops slurmd, then restarts it up to `max_attemps` times with a
        growing pause between attempts. Returns True if slurmd ends up
        active; otherwise sets BlockedStatus and returns False.
        """
        logger.debug("## Stoping slurmd")
        self._slurm_manager.slurm_systemctl('stop')

        for i in range(max_attemps):
            if self._slurm_manager.slurm_is_active():
                logger.debug("## Slurmd running")
                break
            else:
                logger.warning("## Slurmd not running, trying to start it")
                self.unit.status = WaitingStatus("Starting slurmd")
                self._slurm_manager.restart_slurm_component()
                sleep(2 + i)

        if self._slurm_manager.slurm_is_active():
            return True
        else:
            self.unit.status = BlockedStatus("Cannot start slurmd")
            return False

    def _set_slurmctld_available(self, flag: bool):
        """Change stored value for slurmctld availability."""
        self._stored.slurmctld_available = flag

    def _set_slurmctld_started(self, flag: bool):
        """Change stored value for slurmctld started."""
        self._stored.slurmctld_started = flag

    def _on_slurmctld_available(self, event):
        """Get data from slurmctld and send inventory."""
        if not self._stored.slurm_installed:
            event.defer()
            return

        logger.debug(
            '#### Slurmctld available - setting overrides for configless')
        # get slurmctld host:port from relation and override systemd services
        host = self._slurmd.slurmctld_hostname
        port = self._slurmd.slurmctld_port
        self._slurm_manager.create_configless_systemd_override(host, port)
        self._slurm_manager.daemon_reload()

        self._write_munge_key_and_restart_munge()

        self._set_slurmctld_available(True)
        self._on_set_partition_info_on_app_relation_data(event)
        self._check_status()

        # check etcd for hostnames
        self.on.check_etcd.emit()

    def _on_check_etcd(self, event):
        """Check if node is accounted for.

        Check if slurmctld accounted for this node's inventory for the first
        time, if so, emit slurmctld_started event, so the node can start the
        daemon. Otherwise (or if etcd cannot be queried) defer the event.
        """
        host = self._slurmd.slurmctld_address
        port = self._slurmd.etcd_port

        logger.debug(f"## Connecting to etcd3 in {host}:{port}")
        client = Etcd3Client(host=host, port=port, api_path="/v3/")

        logger.debug("## Querying etcd3 for node list")
        try:
            v = client.get(key="all_nodes")
            logger.debug(f"## Got: {v}")
        except Exception as e:
            # Best effort: any failure to reach etcd means "try again later".
            logger.error(
                f"## Unable to connect to {host} to get list of nodes: {e}")
            event.defer()
            return

        node_accounted = False
        if v:
            hostnames = json.loads(v[0])
            logger.debug(f"### etcd3 node list: {hostnames}")
            if self.hostname in hostnames:
                self.on.slurmctld_started.emit()
                node_accounted = True

        if not node_accounted:
            logger.debug("## Node not accounted for. Deferring.")
            event.defer()

    def _on_slurmctld_unavailable(self, event):
        """Reset the slurmctld flags and stop slurmd."""
        logger.debug("## Slurmctld unavailable")
        self._set_slurmctld_available(False)
        self._set_slurmctld_started(False)
        self._slurm_manager.slurm_systemctl('stop')
        self._check_status()

    def _on_slurmctld_started(self, event):
        """Set flag to True and emit slurmd_start event."""
        self._set_slurmctld_started(True)
        self.on.slurmd_start.emit()

    def _on_slurmd_start(self, event):
        """Start slurmd once every prerequisite is satisfied."""
        if not self._check_status():
            event.defer()
            return

        # only set up fluentbit if we have a relation to it
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

        # at this point, we have slurm installed, munge configured, and we know
        # slurmctld accounted for this node. It should be safe to start slurmd
        if not self.ensure_slurmd_starts():
            # ensure_slurmd_starts() already set
            # BlockedStatus("Cannot start slurmd"). Do not call
            # _check_status() here: it does not look at the slurmd service
            # and would overwrite the error with ActiveStatus.
            event.defer()
            return

        logger.debug("## slurmctld started and slurmd is running")
        # ensure_slurmd_starts() may have left a WaitingStatus behind;
        # refresh the status now that slurmd is running.
        self._check_status()

    def _on_config_changed(self, event):
        """Handle charm configuration changes."""
        if self.model.unit.is_leader():
            logger.debug("## slurmd config changed - leader")
            self._on_set_partition_info_on_app_relation_data(event)

        nhc_conf = self.model.config.get('nhc-conf')
        if nhc_conf:
            # Only re-render when the configured value actually changed.
            if nhc_conf != self._stored.nhc_conf:
                self._stored.nhc_conf = nhc_conf
                self._slurm_manager.render_nhc_config(nhc_conf)

    def get_partition_name(self) -> str:
        """Return the partition_name in the slurmd relation."""
        # Determine if a user-supplied partition-name config exists, if so
        # ensure the partition_name is consistent with the supplied config.
        # If no partition name has been specified then generate one.
        partition_name = self._slurmd_peer.partition_name
        partition_name_from_config = self.config.get("partition-name")
        if partition_name:
            if partition_name_from_config:
                partition_name_from_config = \
                    partition_name_from_config.replace(' ', '-')
                if partition_name != partition_name_from_config:
                    self._set_partition_name(partition_name_from_config)
                    partition_name = partition_name_from_config
                else:
                    logger.debug("Partition name unchanged.")
            else:
                logger.debug("Partition name unchanged.")
        else:
            partition_name = f"osd-{self.app.name}"
            logger.debug(f"Partition name: {partition_name}")
            self._set_partition_name(partition_name)

        return partition_name

    def _set_partition_name(self, name: str):
        """Set the partition_name in the slurmd relation (leader only)."""
        if self.model.unit.is_leader():
            self._slurmd_peer.partition_name = name

    def _write_munge_key_and_restart_munge(self):
        """Write the stored munge key and restart munge, logging the result."""
        logger.debug('#### slurmd charm - writting munge key')

        self._slurm_manager.configure_munge_key(
            self._slurmd.get_stored_munge_key())

        if self._slurm_manager.restart_munged():
            logger.debug("## Munge restarted succesfully")
        else:
            logger.error("## Unable to restart munge")

    def _on_version_action(self, event):
        """Return version of installed components.

        - Slurm
        - munge
        - NHC
        - infiniband
        """
        version = {}
        version['slurm'] = self._slurm_manager.slurm_version()
        version['munge'] = self._slurm_manager.munge_version()
        version['nhc'] = self._slurm_manager.nhc_version()
        version['infiniband'] = self._slurm_manager.infiniband_version()

        event.set_results(version)

    def _on_node_configured_action(self, event):
        """Remove node from DownNodes."""
        # trigger reconfig
        self._slurmd.configure_new_node()
        logger.debug('### This node is not new anymore')

    def _on_get_node_inventory_action(self, event):
        """Return node inventory."""
        inventory = self._slurmd.node_inventory
        event.set_results({'inventory': inventory})

    def get_infiniband_repo(self, event):
        """Return the currently used infiniband repository."""
        repo = self._slurm_manager.infiniband.repository
        event.set_results({'infiniband-repo': repo})

    def set_infiniband_repo(self, event):
        """Set the infiniband repository.

        The `repo` action parameter is expected to be base64 encoded.
        """
        repo = event.params["repo"]
        logger.debug(f"#### setting custom infiniband repo: {repo}")
        repo = base64.b64decode(repo).decode()
        self._slurm_manager.infiniband.repository = repo

    def install_infiniband(self, event):
        """Install infiniband."""
        logger.debug("#### Installing Infiniband")
        self._slurm_manager.infiniband.install()
        event.set_results({'installation': 'Successfull. Please reboot node.'})
        self.unit.status = BlockedStatus("Need reboot for Infiniband")

    def uninstall_infiniband(self, event):
        """Uninstall infiniband."""
        logger.debug("#### Uninstalling Infiniband")
        self._slurm_manager.infiniband.uninstall()

    def start_infiniband(self, event):
        """Start Infiniband systemd service."""
        logger.debug("#### Starting Infiniband service")
        self._slurm_manager.infiniband.start()

    def enable_infiniband(self, event):
        """Enable Infiniband systemd service."""
        logger.debug("#### Enabling Infiniband service")
        self._slurm_manager.infiniband.enable()

    def stop_infiniband(self, event):
        """Stop Infiniband systemd service."""
        logger.debug("#### Stoping Infiniband service")
        self._slurm_manager.infiniband.stop()

    def is_active_infiniband(self, event):
        """Check if Infiniband systemd service is active."""
        status = self._slurm_manager.infiniband.is_active()
        logger.debug(f"#### Infiniband service is-active: {status}")
        event.set_results({'infiniband-is-active': status})

    def nvidia_repo(self, event):
        """Set or get the used nvidia repository.

        If a (base64 encoded) `repo` parameter is supplied it is stored
        first; the current repository is always returned.
        """
        repo = event.params.get("repo", None)
        if repo:
            self._slurm_manager.nvidia.repository = base64.b64decode(
                repo).decode()

        event.set_results(
            {'nvidia-repo': self._slurm_manager.nvidia.repository})

    def nvidia_package(self, event):
        """Set or get the used nvidia package."""
        package = event.params.get("package", None)
        # An explicit empty string is also a user-supplied value.
        if package or package == "":
            # user supplied a package name -> store it
            self._slurm_manager.nvidia.package = package

        event.set_results(
            {'nvidia-package': self._slurm_manager.nvidia.package})

    def nvidia_install(self, event):
        """Install nvidia drivers."""
        logger.debug("#### Installing nvidia drivers: %s",
                     self._slurm_manager.nvidia.package)
        self._slurm_manager.nvidia.install()
        event.set_results({'installation': 'Successfull. Please reboot node.'})
        self.unit.status = BlockedStatus("Need reboot for nvidia")

    def _on_show_nhc_config(self, event):
        """Show current nhc.conf."""
        nhc_conf = self._slurm_manager.get_nhc_config()
        event.set_results({"nhc.conf": nhc_conf})

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if self.model.unit.is_leader():
            # If the relation with slurmctld exists then set our
            # partition info on the application relation data.
            # This handler shouldn't fire if the relation isn't made,
            # but add this extra check here just in case.
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_partition_info_on_app_relation_data(
                        partition)
                else:
                    event.defer()
            else:
                event.defer()

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self.get_partition_name()
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")

        logger.debug(f"## partition_name: {partition_name}")

        return {
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    @property
    def hostname(self) -> str:
        """Return the hostname."""
        return self._slurm_manager.hostname

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            user_node_state=str(),
            partition_name=str(),
            config_available=False,
            # Read by _check_status; needs a default so that the read
            # cannot fail before _on_install has stored a value.
            slurm_installed=False,
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        # NOTE(review): config_changed is bound to _on_send_slurmd_info,
        # so _on_config_changed (which also refreshes the partition name)
        # is never invoked from here - confirm whether that is intended.
        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.config_changed: self._on_send_slurmd_info,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_send_slurmd_info,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self.on.set_node_state_action:
            self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_config_changed(self, event):
        """Refresh the partition name, then send the slurmd info."""
        self.get_set_return_partition_name()
        self._on_send_slurmd_info(event)

    def _on_install(self, event):
        """Install slurm and record that it is installed."""
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Upgrade slurm."""
        self._slurm_manager.upgrade()

    def _on_set_node_state_action(self, event):
        """Set the node state."""
        self._stored.user_node_state = event.params["node-state"]
        # The handler is named _on_send_slurmd_info; the previous call to
        # the nonexistent _on_send_slurm_info raised AttributeError.
        # NOTE(review): _on_send_slurmd_info may call event.defer(), and
        # this event comes from an action - confirm that is supported.
        self._on_send_slurmd_info(event)

    def _on_send_slurmd_info(self, event):
        """Publish the partition on the app relation data (leader only).

        The event is deferred when the leader cannot publish yet, i.e.
        the slurmd relation is not joined or there is no partition.
        """
        if self.framework.model.unit.is_leader():
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_slurmd_info_on_app_relation_data(
                        partition)
                    return
            event.defer()
            return

    def _on_check_status_and_write_config(self, event):
        """Render the slurm config and restart once prerequisites are met."""
        if not self._check_status():
            event.defer()
            return

        slurm_config = dict(self._slurmd.get_slurm_config())
        self._slurm_manager.render_config_and_restart(slurm_config)
        self.unit.status = ActiveStatus("Slurmd Available")

    def _check_status(self):
        """Return True if slurm is installed and the config is available.

        Sets BlockedStatus and returns False otherwise.
        """
        slurm_installed = self._stored.slurm_installed
        config_available = self._stored.config_available

        if not (slurm_installed and config_available):
            self.unit.status = BlockedStatus(
                "NEED RELATION TO SLURM CONFIGURATOR")
            return False
        else:
            return True

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self._stored.partition_name
        partition_config = self.model.config.get('partition-config')
        partition_state = self.model.config.get('partition-state')

        slurmd_info = self._assemble_slurmd_info()

        return {
            'inventory': slurmd_info,
            'partition_name': partition_name,
            'partition_state': partition_state,
            'partition_config': partition_config,
        }

    def _assemble_slurmd_info(self):
        """Apply mutations to nodes in the partition, return slurmd nodes.

        Returns None when the peer relation has no slurmd info.
        """
        slurmd_info = self._slurmd_peer.get_slurmd_info()
        if not slurmd_info:
            return None

        # If the user has set custom state for nodes
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if user_node_state:
            # user_node_state looks like "host1=STATE,host2=STATE".
            node_states = {
                item.split("=")[0]: item.split("=")[1]
                for item in user_node_state.split(",")
            }

            # Copy the slurmd_info returned from the slurmd-peer relation
            # to a temporary variable to which we will make modifications.
            slurmd_info_tmp = copy.deepcopy(slurmd_info)

            # Iterate over the slurmd nodes in the partition and check
            # for nodes that need their state modified.
            for partition in slurmd_info:
                partition_tmp = copy.deepcopy(partition)
                for slurmd_node in partition['inventory']:
                    if slurmd_node['hostname'] in node_states.keys():
                        slurmd_node_tmp = copy.deepcopy(slurmd_node)
                        slurmd_node_tmp['state'] = \
                            node_states[slurmd_node['hostname']]
                        partition_tmp['inventory'].remove(slurmd_node)
                        partition_tmp['inventory'].append(slurmd_node_tmp)
                # Swap the untouched partition for the modified copy.
                slurmd_info_tmp.remove(partition)
                slurmd_info_tmp.append(partition_tmp)
        else:
            slurmd_info_tmp = slurmd_info

        return slurmd_info_tmp

    def get_set_return_partition_name(self):
        """Set the partition name and return it."""
        # Determine if a partition-name config exists, if so
        # ensure the partition_name known by the charm is consistent.
        # If no partition name has been specified then generate one.
        partition_name = self.model.config.get('partition-name')
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
class SlurmConfiguratorCharm(CharmBase):
    """Charm that gathers component info and distributes the slurm config."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set stored defaults, build the interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(
            default_partition=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmctld_available=False,
            slurmdbd_available=False,
            slurmd_available=False,
            slurmrestd_available=False,
        )

        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._nhc = Nhc(self, "nhc")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmd = Slurmd(self, "slurmd")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        # Most events simply re-check the status and rewrite the config.
        reconfigure = self._on_check_status_and_write_config
        bindings = (
            # Juju lifecycle events
            (self.on.install, self._on_install),
            (self.on.config_changed, reconfigure),
            (self.on.upgrade_charm, self._on_upgrade),
            # Addons lifecycle events
            (self._elasticsearch.on.elasticsearch_available, reconfigure),
            (self._elasticsearch.on.elasticsearch_unavailable, reconfigure),
            (self._grafana.on.grafana_available,
             self._on_grafana_available),
            (self._influxdb.on.influxdb_available,
             self._on_influxdb_available),
            (self._influxdb.on.influxdb_unavailable, reconfigure),
            (self._nhc.on.nhc_bin_available, reconfigure),
            # Slurm component lifecycle events
            (self._slurmctld.on.slurmctld_available, reconfigure),
            (self._slurmctld.on.slurmctld_unavailable, reconfigure),
            (self._slurmdbd.on.slurmdbd_available, reconfigure),
            (self._slurmdbd.on.slurmdbd_unavailable, reconfigure),
            (self._slurmd.on.slurmd_available, reconfigure),
            (self._slurmd.on.slurmd_unavailable, reconfigure),
            (self._slurmrestd.on.slurmrestd_available, reconfigure),
            (self._slurmrestd.on.slurmrestd_unavailable, reconfigure),
            (self._prolog_epilog.on.prolog_epilog_available, reconfigure),
            (self._prolog_epilog.on.prolog_epilog_unavailable, reconfigure),
        )
        for bound_event, callback in bindings:
            self.framework.observe(bound_event, callback)

    def _on_install(self, event):
        """Install slurm, store the munge key and flag the installation."""
        manager = self._slurm_manager
        manager.install()
        self._stored.munge_key = manager.get_munge_key()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Upgrade slurm, deferring while no slurm config can be built."""
        slurm_config = self._assemble_slurm_config()
        if slurm_config:
            self._slurm_manager.upgrade(slurm_config)
            return

        self.unit.status = BlockedStatus(
            "Cannot generate slurm_config, defering upgrade.")
        event.defer()

    def _on_grafana_available(self, event):
        """Publish the grafana source when leader and influxdb info exists."""
        is_leader = self._is_leader()
        influxdb_info = self._get_influxdb_info()

        if not (is_leader and influxdb_info):
            return
        self._grafana.set_grafana_source_info(influxdb_info)

    def _on_influxdb_available(self, event):
        """Publish the grafana source if possible, then rewrite the config."""
        influxdb_info = self._get_influxdb_info()
        is_leader = self._is_leader()

        if is_leader and self._grafana.is_joined and influxdb_info:
            self._grafana.set_grafana_source_info(influxdb_info)

        self._on_check_status_and_write_config(event)

    def _on_check_status_and_write_config(self, event):
        """Distribute and render the slurm config when everything is ready."""
        if not self._check_status():
            event.defer()
            return

        # Build the config; without it there is nothing to distribute.
        slurm_config = self._assemble_slurm_config()
        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        self._slurmctld.set_slurm_config_on_app_relation_data(slurm_config)
        self._slurmd.set_slurm_config_on_app_relation_data(slurm_config)
        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config)

        local_config = {**slurm_config, 'munge_key': self.get_munge_key()}
        self._slurm_manager.render_config_and_restart(local_config)

    def _assemble_slurm_config(self):
        """Build the slurm config; return {} if a core component is missing."""
        slurmctld_info = self._slurmctld.get_slurmctld_info()
        slurmdbd_info = self._slurmdbd.get_slurmdbd_info()
        slurmd_info = self._slurmd.get_slurmd_info()

        if not slurmd_info or not slurmctld_info or not slurmdbd_info:
            return {}

        addons_info = self._assemble_addons()
        partitions_info = self._assemble_partitions(slurmd_info)

        for piece in (addons_info, partitions_info,
                      slurmctld_info, slurmdbd_info):
            logger.debug(piece)

        slurm_config = {
            'munge_key': self._stored.munge_key,
            'partitions': partitions_info,
        }
        # Later sources override earlier ones; charm config wins.
        slurm_config.update(slurmctld_info)
        slurm_config.update(slurmdbd_info)
        slurm_config.update(addons_info)
        slurm_config.update(self.model.config)
        return slurm_config

    def _assemble_partitions(self, slurmd_info):
        """Return a copy of the partitions with the default one flagged."""
        partitions = copy.deepcopy(slurmd_info)

        for original in slurmd_info:
            # Work on a copy so the list being iterated stays untouched.
            updated = copy.deepcopy(original)
            name = original['partition_name']

            # A default partition from the charm config takes precedence;
            # otherwise fall back to the stored default partition.
            configured_default = self.model.config.get('default_partition')
            expected_default = (
                configured_default
                if configured_default
                else self._stored.default_partition
            )
            if name == expected_default:
                updated['partition_default'] = 'YES'

            partitions.remove(original)
            partitions.append(updated)

        return partitions

    def _assemble_addons(self):
        """Collect the context of whichever addons are available."""
        acct_gather = self._get_influxdb_info()
        elasticsearch_ingress = \
            self._elasticsearch.get_elasticsearch_ingress()
        nhc_info = self._nhc.get_nhc_info()
        prolog_epilog = self._prolog_epilog.get_prolog_epilog()

        addons = {}

        if prolog_epilog:
            addons['prolog_epilog'] = prolog_epilog

        if acct_gather:
            addons['acct_gather'] = acct_gather
            custom = self.model.config.get('acct_gather_custom')
            if custom:
                addons['acct_gather']['custom'] = custom

        if nhc_info:
            nhc_keys = (
                'nhc_bin',
                'health_check_interval',
                'health_check_node_state',
            )
            addons['nhc'] = {key: nhc_info[key] for key in nhc_keys}

        if elasticsearch_ingress:
            addons['elasticsearch_address'] = elasticsearch_ingress

        return addons

    def _check_status(self):
        """Set the unit status and report whether core components exist."""
        stored = self._stored

        if not stored.slurmctld_available:
            self.unit.status = BlockedStatus("NEED RELATION TO SLURMCTLD")
            return False
        if not stored.slurmdbd_available:
            self.unit.status = BlockedStatus("NEED RELATION TO SLURMDBD")
            return False
        if not stored.slurmd_available:
            self.unit.status = BlockedStatus("NEED RELATION TO SLURMD")
            return False
        if not stored.slurm_installed:
            self.unit.status = BlockedStatus("SLURM NOT INSTALLED")
            return False
        if not stored.default_partition:
            self.unit.status = BlockedStatus("PARTITION NAME UNAVAILABLE")
            return False

        self.unit.status = ActiveStatus("")
        return True

    def _get_influxdb_info(self):
        """Return the influxdb info from the influxdb interface."""
        return self._influxdb.get_influxdb_info()

    def _is_leader(self):
        """Return whether this unit is the leader."""
        return self.model.unit.is_leader()

    def get_munge_key(self):
        """Return the munge key from stored state."""
        return self._stored.munge_key

    def get_default_partition(self):
        """Return the stored default partition."""
        return self._stored.default_partition

    def is_slurm_installed(self):
        """Return the stored slurm_installed flag."""
        return self._stored.slurm_installed

    def set_slurmctld_available(self, slurmctld_available):
        """Store the slurmctld_available flag."""
        self._stored.slurmctld_available = slurmctld_available

    def set_slurmdbd_available(self, slurmdbd_available):
        """Store the slurmdbd_available flag."""
        self._stored.slurmdbd_available = slurmdbd_available

    def set_default_partition(self, partition_name):
        """Store the default partition name."""
        self._stored.default_partition = partition_name

    def set_slurmd_available(self, slurmd_available):
        """Store the slurmd_available flag."""
        self._stored.slurmd_available = slurmd_available

    def set_slurmrestd_available(self, slurmrestd_available):
        """Store the slurmrestd_available flag."""
        self._stored.slurmrestd_available = slurmrestd_available
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set stored-state defaults, build the interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(munge_key=str())
        self._stored.set_default(db_info=dict())
        self._stored.set_default(slurm_installed=False)

        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")
        self._db = MySQLClient(self, "db")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed:
            self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurmdbd_unavailable,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm, flag it as installed and report the unit status."""
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event."""
        # NOTE(review): this handler is not registered in the bindings built
        # by __init__ -- confirm whether upgrade_charm should be observed.
        self._slurm_manager.upgrade()

    def _on_leader_elected(self, event):
        """Forward the event to the peer relation's relation-changed handler."""
        # NOTE(review): this handler is not registered in the bindings built
        # by __init__ -- confirm whether leader_elected should be observed.
        self._slurmdbd_peer._on_relation_changed(event)

    def _on_slurmdbd_unavailable(self, event):
        """Re-evaluate the unit status when the slurmdbd relation goes away."""
        self._check_status()

    def _check_status(self) -> bool:
        """Check that we have the things we need.

        Returns:
            True when the database info, munge key, slurm installation flag
            and peer slurmdbd_info are all present. Otherwise False, with the
            unit status set to name the first missing prerequisite.
        """
        db_info = self._stored.db_info
        munge_key = self._stored.munge_key
        slurm_installed = self._stored.slurm_installed
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        if not db_info:
            self.unit.status = BlockedStatus("Need relation to MySQL.")
            return False
        if not munge_key:
            self.unit.status = BlockedStatus(
                "Need relation to slurm-configurator.")
            return False
        # Always report why we are not ready, so callers that defer on a
        # False result never leave a stale status on the unit.
        if not slurm_installed:
            self.unit.status = BlockedStatus("Slurm not installed.")
            return False
        if not slurmdbd_info:
            self.unit.status = BlockedStatus(
                "Waiting on slurmdbd peer relation data.")
            return False
        return True

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd.

        The event is deferred while _check_status() reports that a
        prerequisite is missing.
        """
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        # Later entries win on key collisions: charm config is overridden by
        # the peer slurmdbd_info, which in turn is overridden by db_info.
        slurmdbd_config = {
            'munge_key': self._stored.munge_key,
            **self.model.config,
            **slurmdbd_info,
            **db_info,
        }

        # Only the leader publishes slurmdbd_info on the application
        # relation data.
        if self.model.unit.is_leader():
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                slurmdbd_info)

        self._slurm_manager.render_config_and_restart(slurmdbd_config)
        self.unit.status = ActiveStatus("Slurmdbd Available")

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def set_munge_key(self, munge_key):
        """Set the munge key in the stored state."""
        self._stored.munge_key = munge_key

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info
def __init__(self, *args):
    """Seed stored state, build the relation interfaces and wire up events."""
    super().__init__(*args)

    self._stored.set_default(
        jwt_key="",
        munge_key="",
        slurm_installed=False,
        slurmd_available=False,
        slurmrestd_available=False,
        slurmdbd_available=False,
        down_nodes=[],
    )

    # Interfaces are created in a fixed order; each one is bound to the
    # relation/endpoint name passed as its second argument.
    self._slurm_manager = SlurmManager(self, "slurmctld")
    self._slurmd = Slurmd(self, "slurmd")
    self._slurmdbd = Slurmdbd(self, "slurmdbd")
    self._slurmrestd = Slurmrestd(self, "slurmrestd")
    self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")
    self._prolog_epilog = PrologEpilog(self, "prolog-epilog")
    self._grafana = GrafanaSource(self, "grafana-source")
    self._influxdb = InfluxDB(self, "influxdb-api")
    self._elasticsearch = Elasticsearch(self, "elasticsearch")
    self._fluentbit = FluentbitClient(self, "fluentbit")
    self._user_group = UserGroupProvides(self, "user-group")
    self._etcd = EtcdOps()

    # (event, handler) pairs, registered below in exactly this order.
    bindings = (
        # charm lifecycle events
        (self.on.install, self._on_install),
        (self.on.upgrade_charm, self._on_upgrade),
        (self.on.update_status, self._on_update_status),
        (self.on.config_changed, self._on_write_slurm_config),
        (self.on.leader_elected, self._on_leader_elected),
        # slurm component lifecycle events
        (self._slurmdbd.on.slurmdbd_available,
         self._on_slurmdbd_available),
        (self._slurmdbd.on.slurmdbd_unavailable,
         self._on_slurmdbd_unavailable),
        (self._slurmd.on.slurmd_available, self._on_write_slurm_config),
        (self._slurmd.on.slurmd_unavailable, self._on_write_slurm_config),
        (self._slurmd.on.slurmd_departed, self._on_write_slurm_config),
        (self._slurmrestd.on.slurmrestd_available,
         self._on_slurmrestd_available),
        (self._slurmrestd.on.slurmrestd_unavailable,
         self._on_write_slurm_config),
        # NOTE: a second slurmctld should get the jwt/munge keys and
        # configure them
        (self._slurmctld_peer.on.slurmctld_peer_available,
         self._on_write_slurm_config),
        # fluentbit
        (self.on["fluentbit"].relation_created,
         self._on_fluentbit_relation_created),
        # addons lifecycle events
        (self._prolog_epilog.on.prolog_epilog_available,
         self._on_write_slurm_config),
        (self._prolog_epilog.on.prolog_epilog_unavailable,
         self._on_write_slurm_config),
        (self._grafana.on.grafana_available, self._on_grafana_available),
        (self._influxdb.on.influxdb_available,
         self._on_influxdb_available),
        (self._influxdb.on.influxdb_unavailable,
         self._on_write_slurm_config),
        (self._elasticsearch.on.elasticsearch_available,
         self._on_elasticsearch_available),
        (self._elasticsearch.on.elasticsearch_unavailable,
         self._on_write_slurm_config),
        (self._user_group.on.create_user_group,
         self._on_create_user_group),
        (self._user_group.on.remove_user_group,
         self._on_remove_user_group),
        # actions
        (self.on.show_current_config_action,
         self._on_show_current_config),
        (self.on.drain_action, self._drain_nodes_action),
        (self.on.resume_action, self._resume_nodes_action),
        (self.on.influxdb_info_action, self._infludb_info_action),
    )
    for source, callback in bindings:
        self.framework.observe(source, callback)
class SlurmdCharm(CharmBase):
    """Operator charm that drives the slurmd lifecycle."""

    _stored = StoredState()

    def __init__(self, *args):
        """Create the interfaces, seed stored state and subscribe to events."""
        super().__init__(*args)

        self.config = self.model.config
        self.slurm_manager = SlurmManager(self, 'slurmd')
        self.slurmd = SlurmdProvides(self, "slurmd")

        self._stored.set_default(
            slurm_installed=False,
            slurm_config_available=False,
            slurm_config={},
        )

        # Both slurmctld relation events funnel into the same render/restart
        # handler, which decides what to do from stored state.
        subscriptions = (
            (self.on.install, self._on_install),
            (self.on.config_changed, self._on_config_changed),
            (self.on.upgrade_charm, self._on_upgrade),
            (self.slurmd.on.slurmctld_available,
             self._on_render_config_and_restart),
            (self.slurmd.on.slurmctld_unavailable,
             self._on_render_config_and_restart),
        )
        for source, callback in subscriptions:
            self.framework.observe(source, callback)

    def _on_install(self, event):
        """Install slurm through the slurm manager and record that it is done."""
        self.slurm_manager.install()
        self.unit.status = ActiveStatus("Slurm Installed")
        self._stored.slurm_installed = True

    def _on_upgrade(self, event):
        """Run the slurm manager upgrade with the stored slurm config."""
        # The stored value is converted to a plain dict before it is handed on.
        self.slurm_manager.upgrade(
            dict(self._stored.slurm_config),
            resource=False,
        )

    def _on_config_changed(self, event):
        """Push the config onto the application relation data."""
        self.slurmd.force_set_config_on_app_relation_data()

    def _on_render_config_and_restart(self, event):
        """Write the stored slurm config and restart, or block and defer.

        Both the installed flag and the config-available flag must be set;
        otherwise the unit is blocked and the event is deferred.
        """
        installed = self._stored.slurm_installed
        config_available = self._stored.slurm_config_available

        if not (installed and config_available):
            self.unit.status = BlockedStatus(
                "Blocked need relation to slurmctld."
            )
            event.defer()
            return

        # Stored state is converted to a plain dict before rendering.
        rendered_from = dict(self._stored.slurm_config)
        self.slurm_manager.render_config_and_restart(rendered_from)
        self.unit.status = ActiveStatus("Slurmd Available")

    def is_slurm_installed(self):
        """Return the stored slurm_installed flag."""
        return self._stored.slurm_installed

    def set_slurm_config_available(self, config_available):
        """Store whether a slurm config is available."""
        self._stored.slurm_config_available = config_available

    def set_slurm_config(self, slurm_config):
        """Store the slurm config."""
        self._stored.slurm_config = slurm_config
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set stored-state defaults, build the interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(munge_key=str())
        self._stored.set_default(db_info=dict())
        self._stored.set_default(slurm_installed=False)

        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")
        self._db = MySQLClient(self, "db")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed:
            self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurmdbd_unavailable,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm, flag it as installed and report the unit status."""
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event."""
        # NOTE(review): this handler is not registered in the bindings built
        # by __init__ -- confirm whether upgrade_charm should be observed.
        self._slurm_manager.upgrade()

    def _on_leader_elected(self, event):
        """Forward the event to the peer relation's relation-changed handler."""
        # NOTE(review): this handler is not registered in the bindings built
        # by __init__ -- confirm whether leader_elected should be observed.
        self._slurmdbd_peer._on_relation_changed(event)

    def _on_slurmdbd_unavailable(self, event):
        """Re-evaluate the unit status when the slurmdbd relation goes away."""
        self._check_status()

    def _check_status(self) -> bool:
        """Check that we have the things we need.

        Returns:
            True when the database info, munge key, slurm installation flag
            and peer slurmdbd_info are all present. Otherwise False, with the
            unit status set to name the first missing prerequisite.
        """
        db_info = self._stored.db_info
        munge_key = self._stored.munge_key
        slurm_installed = self._stored.slurm_installed
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        if not db_info:
            self.unit.status = BlockedStatus("Need relation to MySQL.")
            return False
        if not munge_key:
            self.unit.status = BlockedStatus(
                "Need relation to slurm-configurator.")
            return False
        # Always report why we are not ready, so callers that defer on a
        # False result never leave a stale status on the unit.
        if not slurm_installed:
            self.unit.status = BlockedStatus("Slurm not installed.")
            return False
        if not slurmdbd_info:
            self.unit.status = BlockedStatus(
                "Waiting on slurmdbd peer relation data.")
            return False
        return True

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        # Ensure all pre-conditions are met with _check_status(); if not,
        # defer the event.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        # Later entries win on key collisions: charm config is overridden by
        # the peer slurmdbd_info, which in turn is overridden by db_info.
        slurmdbd_config = {
            'munge_key': self._stored.munge_key,
            **self.model.config,
            **slurmdbd_info,
            **db_info,
        }

        self._slurm_manager.render_config_and_restart(slurmdbd_config)
        logger.debug("rendering config and restarting")

        # Only the leader can set relation data on the application.
        # Enforce that no one other than the leader tries to set
        # application relation data.
        if self.model.unit.is_leader():
            # Juju, and subsequently the operator framework, do not emit
            # relation-changed events if data hasn't actually changed on the
            # other side of the relation. Even if we set the data multiple
            # times, it doesn't mean anything unless the data being set is
            # different from what already exists in the relation data.
            #
            # We use 'slurmdbd_info_id' to ensure the slurmdbd_info is unique
            # each time it is set on the application relation data. This is
            # needed so that related applications (namely slurm-configurator)
            # will observe a relation-changed event.
            #
            # This handler (_write_config_and_restart_slurmdbd) may be
            # invoked multiple times once _check_status() returns True
            # (i.e. once the pre-conditions are met, the deferred
            # invocations all run). This means that the same slurmdbd_info
            # data may be set on application data multiple times, and
            # slurmdbd may be reconfigured and restarted while slurmctld and
            # the rest of the stack are trying to come up and create the
            # cluster.
            #
            # We need slurm-configurator to emit the relation-changed event
            # for the slurmdbd relation every time data is set, not just when
            # data has changed. slurm-configurator needs to re-emit its chain
            # of observed events to ensure all services end up getting
            # reconfigured *and* restarted *after* slurmdbd, each time that
            # slurmdbd gets reconfigured and restarted.
            #
            # For this reason, 'slurmdbd_info_id' only matters in the context
            # of making sure the application relation data actually changes,
            # so that a relation-changed event is observed on the other side.
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data({
                'slurmdbd_info_id': str(uuid.uuid4()),
                **slurmdbd_info
            })

        self.unit.status = ActiveStatus("Slurmdbd Available")

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def set_munge_key(self, munge_key):
        """Set the munge key in the stored state."""
        self._stored.munge_key = munge_key

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info
class SlurmConfiguratorCharm(CharmBase):
    """Charm that assembles the slurm config and hands it to related apps."""

    _stored = StoredState()

    def __init__(self, *args):
        """Seed stored state, create the interfaces and subscribe to events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key="",
            override_slurm_conf=None,
            slurm_installed=False,
            slurmd_restarted=False,
            slurmctld_available=False,
            slurmdbd_available=False,
            slurmd_available=False,
            slurmrestd_available=False,
        )

        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._nhc = Nhc(self, "nhc")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmd = Slurmd(self, "slurmd")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        # (event, handler) pairs, registered below in exactly this order.
        subscriptions = (
            # Juju lifecycle events
            (self.on.install, self._on_install),
            (self.on.config_changed,
             self._on_check_status_and_write_config),
            (self.on.upgrade_charm, self._on_upgrade),
            # Addons lifecycle events
            (self._elasticsearch.on.elasticsearch_available,
             self._on_check_status_and_write_config),
            (self._elasticsearch.on.elasticsearch_unavailable,
             self._on_check_status_and_write_config),
            (self._grafana.on.grafana_available,
             self._on_grafana_available),
            (self._influxdb.on.influxdb_available,
             self._on_influxdb_available),
            (self._influxdb.on.influxdb_unavailable,
             self._on_check_status_and_write_config),
            (self._nhc.on.nhc_bin_available,
             self._on_check_status_and_write_config),
            # Slurm component lifecycle events
            (self._slurmctld.on.slurmctld_available,
             self._on_check_status_and_write_config),
            (self._slurmctld.on.slurmctld_unavailable,
             self._on_check_status_and_write_config),
            (self._slurmdbd.on.slurmdbd_available,
             self._on_check_status_and_write_config),
            (self._slurmdbd.on.slurmdbd_unavailable,
             self._on_check_status_and_write_config),
            (self._slurmd.on.slurmd_available,
             self._on_check_status_and_write_config),
            (self._slurmd.on.slurmd_unavailable,
             self._on_check_status_and_write_config),
            (self._slurmrestd.on.slurmrestd_available,
             self._on_slurmrestd_available),
            (self._slurmrestd.on.slurmrestd_unavailable,
             self._on_check_status_and_write_config),
            (self._prolog_epilog.on.prolog_epilog_available,
             self._on_check_status_and_write_config),
            (self._prolog_epilog.on.prolog_epilog_unavailable,
             self._on_check_status_and_write_config),
            # Actions
            (self.on.scontrol_reconfigure_action,
             self._on_scontrol_reconfigure),
            (self.on.get_slurm_conf_action, self._on_get_slurm_conf),
            (self.on.set_slurm_conf_action, self._on_set_slurm_conf),
        )
        for source, callback in subscriptions:
            self.framework.observe(source, callback)

    def _on_scontrol_reconfigure(self, event):
        """Action handler: ask the slurmctld interface to reconfigure."""
        self._slurmctld.scontrol_reconfigure()

    def _on_get_slurm_conf(self, event):
        """Action handler: return the slurm.conf as the action result.

        A stored override takes precedence; otherwise the slurm manager is
        asked for the current slurm.conf.
        """
        slurm_conf = (
            self._stored.override_slurm_conf
            or self._slurm_manager.get_slurm_conf()
        )
        event.set_results({"slurm.conf": slurm_conf})

    def _on_set_slurm_conf(self, event):
        """Action handler: store the "slurm-conf" parameter as the override."""
        self._stored.override_slurm_conf = event.params["slurm-conf"]

    def _on_install(self, event):
        """Install slurm from the configured channel and store the munge key."""
        channel = self.config["snapstore-channel"]
        self._slurm_manager.install(channel)
        self._stored.munge_key = self._slurm_manager.get_munge_key()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm installed")

    def _on_upgrade(self, event):
        """Upgrade slurm, deferring while no slurm config can be produced."""
        slurm_config = (
            self._stored.override_slurm_conf
            or self._assemble_slurm_config()
        )
        if slurm_config:
            self._slurm_manager.upgrade(
                slurm_config,
                self.config["snapstore-channel"],
            )
        else:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config, defering upgrade.")
            event.defer()

    def _on_grafana_available(self, event):
        """Publish the grafana source when leader and influxdb info exists."""
        is_leader = self._is_leader()
        influxdb_info = self._get_influxdb_info()
        if not (is_leader and influxdb_info):
            return
        self._grafana.set_grafana_source_info(influxdb_info)

    def _on_influxdb_available(self, event):
        """Publish the grafana source if possible, then rewrite the config."""
        influxdb_info = self._get_influxdb_info()
        is_leader = self._is_leader()

        # The grafana source is only set by the leader, and only once the
        # grafana relation is joined and influxdb info is available.
        if is_leader and self._grafana.is_joined and influxdb_info:
            self._grafana.set_grafana_source_info(influxdb_info)

        self._on_check_status_and_write_config(event)

    def _on_slurmrestd_available(self, event):
        """Send the slurm config to slurmrestd and restart it.

        The event is deferred when the status check fails or when no slurm
        config can be assembled.
        """
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()
        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        if not self._stored.slurmrestd_available:
            return
        self._slurmrestd.set_slurm_config_on_app_relation_data(slurm_config)
        self._slurmrestd.restart_slurmrestd()

    def _on_check_status_and_write_config(self, event):
        """Distribute the assembled slurm config once prerequisites are met.

        The event is deferred when the status check fails or when no slurm
        config can be assembled.
        """
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()
        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        # slurmctld gets the config and a restart first, then slurmd.
        self._slurmctld.set_slurm_config_on_app_relation_data(slurm_config)
        self._slurmctld.restart_slurmctld()
        self._slurmd.set_slurm_config_on_app_relation_data(slurm_config)

        slurmrestd_ready = self._stored.slurmrestd_available
        if slurmrestd_ready:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config)
            self._slurmrestd.restart_slurmrestd()

        self._slurm_manager.render_slurm_configs(slurm_config)

        # The local slurm component is restarted only the first time through;
        # the stored flag remembers that it has happened.
        already_restarted = self._stored.slurmd_restarted
        if not already_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmd_restarted = True

        self._slurmctld.scontrol_reconfigure()

    def _assemble_slurm_config(self):
        """Build the slurm config dict, or return {} when info is missing.

        slurmd, slurmctld and slurmdbd info must all be available. In the
        merged result later sources override earlier ones on key collisions,
        with the charm config applied last.
        """
        slurmctld_info = self._slurmctld.get_slurmctld_info()
        slurmdbd_info = self._slurmdbd.get_slurmdbd_info()
        slurmd_info = self._slurmd.get_slurmd_info()

        if not all((slurmd_info, slurmctld_info, slurmdbd_info)):
            return {}

        addons_info = self._assemble_addons()
        partitions_info = self._assemble_partitions(slurmd_info)

        for section in (addons_info, partitions_info,
                        slurmctld_info, slurmdbd_info):
            logger.debug(section)

        return {
            "partitions": partitions_info,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **self.config,
        }

    def _assemble_partitions(self, slurmd_info):
        """Return a copy of the partitions with the default partition marked.

        The default is the partition named by the "default_partition" charm
        config option; when that option is unset, the partition named
        "configurator" is used instead.
        """
        updated_partitions = copy.deepcopy(slurmd_info)

        # Fall back to the "configurator" partition when the user has not
        # chosen a default partition in the charm config.
        default_name = self.config.get("default_partition") or "configurator"

        for partition in slurmd_info:
            # Work on a copy so the list being iterated is never modified.
            replacement = copy.deepcopy(partition)
            if partition["partition_name"] == default_name:
                replacement["partition_default"] = "YES"

            # Swap the untouched copy for the (possibly) modified one.
            updated_partitions.remove(partition)
            updated_partitions.append(replacement)

        return updated_partitions

    def _assemble_addons(self):
        """Collect the config contributed by whichever addons are present."""
        acct_gather = self._get_influxdb_info()
        elasticsearch_ingress = self._elasticsearch.get_elasticsearch_ingress()
        nhc_info = self._nhc.get_nhc_info()
        prolog_epilog = self._prolog_epilog.get_prolog_epilog()

        addons = {}

        if prolog_epilog:
            addons["prolog_epilog"] = prolog_epilog

        if acct_gather:
            addons["acct_gather"] = acct_gather
            custom = self.config.get("acct_gather_custom")
            if custom:
                # Same object as addons["acct_gather"], so this lands there.
                acct_gather["custom"] = custom

        if nhc_info:
            nhc_keys = (
                "nhc_bin",
                "health_check_interval",
                "health_check_node_state",
            )
            addons["nhc"] = {key: nhc_info[key] for key in nhc_keys}

        if elasticsearch_ingress:
            addons["elasticsearch_address"] = elasticsearch_ingress

        return addons

    def _check_status(self):
        """Set the unit status from the core relations and report readiness.

        Returns True only when slurmctld, slurmd and slurmdbd are all joined
        and available. A missing relation blocks the unit; a joined but not
        yet available component puts it in waiting.
        """
        # (name, available, joined) for each core slurm component.
        components = (
            ("slurmctld",
             self._stored.slurmctld_available,
             self._slurmctld.is_joined),
            ("slurmd",
             self._stored.slurmd_available,
             self._slurmd.is_joined),
            ("slurmdbd",
             self._stored.slurmdbd_available,
             self._slurmdbd.is_joined),
        )

        relations_needed = [
            name for name, _available, joined in components if not joined
        ]
        waiting_on = [
            name for name, available, joined in components
            if joined and not available
        ]

        msg = ""
        if relations_needed:
            msg += f"Needed relations: {','.join(relations_needed)} "
        if waiting_on:
            msg += f"Waiting on: {','.join(waiting_on)}"

        # Missing relations take precedence over components we wait for.
        if relations_needed:
            self.unit.status = BlockedStatus(msg)
            return False
        if waiting_on:
            self.unit.status = WaitingStatus(msg)
            return False

        self.unit.status = ActiveStatus("slurm-configurator available")
        return True

    def _get_influxdb_info(self):
        """Return the influxdb info from the influxdb interface."""
        return self._influxdb.get_influxdb_info()

    def _is_leader(self):
        """Return whether this unit is the leader."""
        return self.model.unit.is_leader()

    def get_munge_key(self):
        """Return the munge key from stored state."""
        return self._stored.munge_key

    def is_slurm_installed(self):
        """Return the stored slurm_installed flag."""
        return self._stored.slurm_installed

    def set_slurmctld_available(self, slurmctld_available):
        """Store the slurmctld availability flag."""
        self._stored.slurmctld_available = slurmctld_available

    def set_slurmdbd_available(self, slurmdbd_available):
        """Store the slurmdbd availability flag."""
        self._stored.slurmdbd_available = slurmdbd_available

    def set_slurmd_available(self, slurmd_available):
        """Store the slurmd availability flag."""
        self._stored.slurmd_available = slurmd_available

    def set_slurmrestd_available(self, slurmrestd_available):
        """Store the slurmrestd availability flag."""
        self._stored.slurmrestd_available = slurmrestd_available
def __init__(self, *args):
    """Seed stored state, create the interfaces and subscribe to events."""
    super().__init__(*args)

    self._stored.set_default(
        default_partition="",
        munge_key="",
        slurm_installed=False,
        slurmctld_available=False,
        slurmdbd_available=False,
        slurmd_available=False,
        slurmrestd_available=False,
    )

    self._elasticsearch = Elasticsearch(self, "elasticsearch")
    self._grafana = GrafanaSource(self, "grafana-source")
    self._influxdb = InfluxDB(self, "influxdb-api")
    self._nhc = Nhc(self, "nhc")
    self._slurmrestd = Slurmrestd(self, "slurmrestd")
    self._slurm_manager = SlurmManager(self, "slurmd")
    self._slurmctld = Slurmctld(self, "slurmctld")
    self._slurmdbd = Slurmdbd(self, "slurmdbd")
    self._slurmd = Slurmd(self, "slurmd")
    self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

    # (event, handler) pairs, registered below in exactly this order.
    # self.on.start is currently not bound to a handler.
    subscriptions = (
        # Juju lifecycle events
        (self.on.install, self._on_install),
        (self.on.config_changed, self._on_check_status_and_write_config),
        (self.on.upgrade_charm, self._on_upgrade),
        # Addons lifecycle events
        (self._elasticsearch.on.elasticsearch_available,
         self._on_check_status_and_write_config),
        (self._elasticsearch.on.elasticsearch_unavailable,
         self._on_check_status_and_write_config),
        (self._grafana.on.grafana_available, self._on_grafana_available),
        (self._influxdb.on.influxdb_available,
         self._on_influxdb_available),
        (self._influxdb.on.influxdb_unavailable,
         self._on_check_status_and_write_config),
        (self._nhc.on.nhc_bin_available,
         self._on_check_status_and_write_config),
        # Slurm component lifecycle events
        (self._slurmctld.on.slurmctld_available,
         self._on_check_status_and_write_config),
        (self._slurmctld.on.slurmctld_unavailable,
         self._on_check_status_and_write_config),
        (self._slurmdbd.on.slurmdbd_available,
         self._on_check_status_and_write_config),
        (self._slurmdbd.on.slurmdbd_unavailable,
         self._on_check_status_and_write_config),
        (self._slurmd.on.slurmd_available,
         self._on_check_status_and_write_config),
        (self._slurmd.on.slurmd_unavailable,
         self._on_check_status_and_write_config),
        (self._slurmrestd.on.slurmrestd_available,
         self._on_check_status_and_write_config),
        (self._slurmrestd.on.slurmrestd_unavailable,
         self._on_check_status_and_write_config),
        (self._prolog_epilog.on.prolog_epilog_available,
         self._on_check_status_and_write_config),
        (self._prolog_epilog.on.prolog_epilog_unavailable,
         self._on_check_status_and_write_config),
    )
    for source, callback in subscriptions:
        self.framework.observe(source, callback)
class SlurmrestdCharm(CharmBase):
    """Operator charm responsible for lifecycle operations for slurmrestd."""

    _stored = StoredState()

    def __init__(self, *args):
        """Initialize charm and configure states and events to observe."""
        super().__init__(*args)

        self._stored.set_default(
            slurm_installed=False,
            slurmrestd_restarted=False,
            cluster_name=str(),
        )

        self._slurm_manager = SlurmManager(self, "slurmrestd")
        self._slurmrestd = SlurmrestdRequires(self, 'slurmrestd')
        self._fluentbit = FluentbitClient(self, "fluentbit")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self._slurmrestd.on.config_available:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.config_unavailable:
            self._on_config_unavailable,
            self._slurmrestd.on.munge_key_available:
            self._on_configure_munge_key,
            self._slurmrestd.on.jwt_rsa_available:
            self._on_configure_jwt_rsa,
            self._slurmrestd.on.restart_slurmrestd:
            self._on_restart_slurmrestd,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_fluentbit_relation_created,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmrestd.

        The event is deferred when the slurm manager reports that the
        installation failed.
        """
        # The workload version is read from the "version" file.
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmrestd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if successful_installation:
            self.unit.status = ActiveStatus("slurmrestd installed")
            self._stored.slurm_installed = True
            self._slurm_manager.start_munged()
        else:
            self.unit.status = BlockedStatus("Error installing slurmrestd")
            event.defer()

        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        """Send the NHC and slurm fluentbit configs to the fluentbit client."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())
        self._check_status()

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_config_unavailable(self, event):
        """Handle the config unavailable due to relation broken."""
        # When the config becomes unavailable, we have to set this flag to
        # False, so the next time the config becomes available, the daemon
        # restarts.
        self._stored.slurmrestd_restarted = False
        self._check_status()

    def _on_restart_slurmrestd(self, event):
        """Restart the slurmrestd component.

        The event is deferred while _check_status() reports that the unit is
        not ready.
        """
        logger.debug("## _on_restart_slurmrestd")

        if not self._check_status():
            event.defer()
            return

        self._slurm_manager.restart_slurm_component()
        self._stored.slurmrestd_restarted = True

    def _on_configure_munge_key(self, event):
        """Configure the munge key.

        1) Get the munge key from the stored state of the slurmrestd relation
        2) Write the munge key to the munge key path and chmod
        3) Restart munged
        """
        if not self._stored.slurm_installed:
            event.defer()
            return

        logger.debug("## configuring new munge key")
        munge_key = self._slurmrestd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()

    def _on_configure_jwt_rsa(self, event):
        """Configure the JWT RSA key once slurm is installed."""
        if not self._stored.slurm_installed:
            event.defer()
            return

        logger.debug("## configuring new jwt rsa")
        jwt_rsa = self._slurmrestd.get_stored_jwt_rsa()
        self._slurm_manager.configure_jwt_rsa(jwt_rsa)

    def _check_status(self) -> bool:
        """Set the unit status and report whether slurmrestd can operate.

        Returns:
            True only when no reboot is needed, slurm is installed, the
            slurmrestd relation is joined, and the munge key, JWT RSA key and
            slurm config have all been stored. False otherwise, with the unit
            status describing the first unmet condition.
        """
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmrestd")
            return False

        # Check and see if we have what we need for operation.
        if not self._slurmrestd.is_joined:
            self.unit.status = BlockedStatus("Need relations: slurmctld")
            return False

        slurmctld_available = (self._slurmrestd.get_stored_munge_key() and
                               self._slurmrestd.get_stored_jwt_rsa() and
                               self._slurmrestd.get_stored_slurm_config())
        if not slurmctld_available:
            self.unit.status = WaitingStatus("Waiting on: slurmctld")
            # Still waiting for data: report "not ready" so callers defer
            # instead of rendering an empty config and restarting.
            return False

        self.unit.status = ActiveStatus("slurmrestd available")
        return True

    def _on_check_status_and_write_config(self, event):
        """Render the stored slurm config once the unit is ready.

        The event is deferred while _check_status() reports that the unit is
        not ready.
        """
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._slurmrestd.get_stored_slurm_config()
        if slurm_config:
            self._slurm_manager.render_slurm_configs(slurm_config)
            self.cluster_name = slurm_config.get("cluster_name")
        else:
            logger.error(f"## weird slurmconfig: {slurm_config}")

        # Only restart slurmrestd the first time the node is brought up.
        if not self._stored.slurmrestd_restarted:
            self._on_restart_slurmrestd(event)

        # Reconfigure fluentbit only when its relation is present.
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name