class SlurmConfiguratorCharm(CharmBase):
    """Facilitate slurm configuration operations."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init charm, _stored defaults, interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(
            default_partition=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmctld_available=False,
            slurmdbd_available=False,
            slurmd_available=False,
            slurmrestd_available=False,
        )

        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._nhc = Nhc(self, "nhc")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmd = Slurmd(self, "slurmd")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        # #### Charm lifecycle events #### #
        event_handler_bindings = {
            # #### Juju lifecycle events #### #
            self.on.install: self._on_install,
            # self.on.start:
            # self._on_check_status_and_write_config,
            self.on.config_changed: self._on_check_status_and_write_config,
            self.on.upgrade_charm: self._on_upgrade,
            # ######## Addons lifecycle events ######## #
            self._elasticsearch.on.elasticsearch_available:
            self._on_check_status_and_write_config,
            self._elasticsearch.on.elasticsearch_unavailable:
            self._on_check_status_and_write_config,
            self._grafana.on.grafana_available: self._on_grafana_available,
            self._influxdb.on.influxdb_available: self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
            self._on_check_status_and_write_config,
            self._nhc.on.nhc_bin_available:
            self._on_check_status_and_write_config,
            # ######## Slurm component lifecycle events ######## #
            self._slurmctld.on.slurmctld_available:
            self._on_check_status_and_write_config,
            self._slurmctld.on.slurmctld_unavailable:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_available:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_available:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_unavailable:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_available:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
            self._on_check_status_and_write_config,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install the slurm snap and set the munge key."""
        self._slurm_manager.install()
        self._stored.munge_key = self._slurm_manager.get_munge_key()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Upgrade the charm."""
        slurm_config = self._assemble_slurm_config()
        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config, deferring upgrade.")
            event.defer()
            return
        self._slurm_manager.upgrade(slurm_config)

    def _on_grafana_available(self, event):
        """Create the grafana-source if we are the leader and have influxdb."""
        leader = self._is_leader()
        influxdb_info = self._get_influxdb_info()
        grafana = self._grafana

        if leader and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

    def _on_influxdb_available(self, event):
        """Create the grafana-source if we have all the things."""
        grafana = self._grafana
        influxdb_info = self._get_influxdb_info()
        leader = self._is_leader()

        if leader and grafana.is_joined and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

        self._on_check_status_and_write_config(event)

    def _on_check_status_and_write_config(self, event):
        """Check that we have what we need before we proceed."""
        if not self._check_status():
            event.defer()
            return

        # Generate the slurm_config.
        slurm_config = self._assemble_slurm_config()
        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - deferring event.")
            event.defer()
            return

        self._slurmctld.set_slurm_config_on_app_relation_data(slurm_config)
        self._slurmd.set_slurm_config_on_app_relation_data(slurm_config)
        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config)

        self._slurm_manager.render_config_and_restart({
            **slurm_config, 'munge_key': self.get_munge_key(),
        })

    def _assemble_slurm_config(self):
        """Assemble and return the slurm config."""
        slurmctld_info = self._slurmctld.get_slurmctld_info()
        slurmdbd_info = self._slurmdbd.get_slurmdbd_info()
        slurmd_info = self._slurmd.get_slurmd_info()

        if not (slurmd_info and slurmctld_info and slurmdbd_info):
            return {}

        addons_info = self._assemble_addons()
        partitions_info = self._assemble_partitions(slurmd_info)

        logger.debug(addons_info)
        logger.debug(partitions_info)
        logger.debug(slurmctld_info)
        logger.debug(slurmdbd_info)

        return {
            'munge_key': self._stored.munge_key,
            'partitions': partitions_info,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **self.model.config,
        }

    def _assemble_partitions(self, slurmd_info):
        """Make any needed modifications to partition data."""
        slurmd_info_tmp = copy.deepcopy(slurmd_info)

        for partition in slurmd_info:
            # Deep copy the partition to a tmp var so we can modify it as
            # needed whilst not modifying the object we are iterating over.
            partition_tmp = copy.deepcopy(partition)

            # Extract the partition_name from the partition and the default
            # partition from the charm config.
            partition_name = partition['partition_name']
            default_partition_from_config = self.model.config.get(
                'default_partition')

            # If the user hasn't provided a default partition in the charm
            # config, infer the default partition from the first related
            # slurmd application.
            if not default_partition_from_config:
                if partition['partition_name'] == \
                        self._stored.default_partition:
                    partition_tmp['partition_default'] = 'YES'
            else:
                if default_partition_from_config == partition_name:
                    partition_tmp['partition_default'] = 'YES'

            slurmd_info_tmp.remove(partition)
            slurmd_info_tmp.append(partition_tmp)

        return slurmd_info_tmp

    def _assemble_addons(self):
        """Assemble any addon components."""
        acct_gather = self._get_influxdb_info()
        elasticsearch_ingress = self._elasticsearch.get_elasticsearch_ingress()
        nhc_info = self._nhc.get_nhc_info()
        prolog_epilog = self._prolog_epilog.get_prolog_epilog()

        ctxt = dict()

        if prolog_epilog:
            ctxt['prolog_epilog'] = prolog_epilog

        if acct_gather:
            ctxt['acct_gather'] = acct_gather
            acct_gather_custom = self.model.config.get('acct_gather_custom')
            if acct_gather_custom:
                ctxt['acct_gather']['custom'] = acct_gather_custom

        if nhc_info:
            ctxt['nhc'] = {
                'nhc_bin': nhc_info['nhc_bin'],
                'health_check_interval': nhc_info['health_check_interval'],
                'health_check_node_state':
                nhc_info['health_check_node_state'],
            }

        if elasticsearch_ingress:
            ctxt['elasticsearch_address'] = elasticsearch_ingress

        return ctxt

    def _check_status(self):
        """Check that the core components we need exist."""
        slurmctld_available = self._stored.slurmctld_available
        slurmdbd_available = self._stored.slurmdbd_available
        slurmd_available = self._stored.slurmd_available
        slurm_installed = self._stored.slurm_installed
        default_partition = self._stored.default_partition

        deps = [
            default_partition,
            slurmctld_available,
            slurmdbd_available,
            slurmd_available,
            slurm_installed,
        ]

        if not all(deps):
            if not slurmctld_available:
                self.unit.status = BlockedStatus("NEED RELATION TO SLURMCTLD")
            elif not slurmdbd_available:
                self.unit.status = BlockedStatus("NEED RELATION TO SLURMDBD")
            elif not slurmd_available:
                self.unit.status = BlockedStatus("NEED RELATION TO SLURMD")
            elif not slurm_installed:
                self.unit.status = BlockedStatus("SLURM NOT INSTALLED")
            else:
                self.unit.status = BlockedStatus("PARTITION NAME UNAVAILABLE")
            return False
        else:
            self.unit.status = ActiveStatus("")
            return True

    def _get_influxdb_info(self):
        """Return influxdb info."""
        return self._influxdb.get_influxdb_info()

    def _is_leader(self):
        return self.model.unit.is_leader()

    def get_munge_key(self):
        """Return the munge key from stored state."""
        return self._stored.munge_key

    def get_default_partition(self):
        """Return self._stored.default_partition."""
        return self._stored.default_partition

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def set_slurmctld_available(self, slurmctld_available):
        """Set slurmctld_available."""
        self._stored.slurmctld_available = slurmctld_available

    def set_slurmdbd_available(self, slurmdbd_available):
        """Set slurmdbd_available."""
        self._stored.slurmdbd_available = slurmdbd_available

    def set_default_partition(self, partition_name):
        """Set self._stored.default_partition."""
        self._stored.default_partition = partition_name

    def set_slurmd_available(self, slurmd_available):
        """Set slurmd_available."""
        self._stored.slurmd_available = slurmd_available

    def set_slurmrestd_available(self, slurmrestd_available):
        """Set slurmrestd_available."""
        self._stored.slurmrestd_available = slurmrestd_available
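# ---------------------------------------------------------------------------
# Editor's sketch (illustrative, not part of the charm code): every
# __init__ above collects (event, handler) pairs in a dict and binds them
# in a single loop. The same pattern reduced to plain Python; _FakeFramework
# is a hypothetical stand-in that only mimics the shape of
# ops' Framework.observe(event, handler).
# ---------------------------------------------------------------------------

class _FakeFramework:
    """Record observe() calls; stands in for ops.framework.Framework."""

    def __init__(self):
        self.bindings = []

    def observe(self, event, handler):
        self.bindings.append((event, handler))


_framework = _FakeFramework()
_event_handler_bindings = {
    "install": lambda event: None,
    "config-changed": lambda event: None,
}
for _event, _handler in _event_handler_bindings.items():
    _framework.observe(_event, _handler)

assert len(_framework.bindings) == 2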
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            jwt_key=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmd_available=False,
            slurmrestd_available=False,
            slurmdbd_available=False,
            down_nodes=list(),
        )

        self._slurm_manager = SlurmManager(self, "slurmctld")
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")
        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._fluentbit = FluentbitClient(self, "fluentbit")
        self._user_group = UserGroupProvides(self, "user-group")
        self._etcd = EtcdOps()

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._on_write_slurm_config,
            self.on.leader_elected: self._on_leader_elected,
            # slurm component lifecycle events
            self._slurmdbd.on.slurmdbd_available: self._on_slurmdbd_available,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurmdbd_unavailable,
            self._slurmd.on.slurmd_available: self._on_write_slurm_config,
            self._slurmd.on.slurmd_unavailable: self._on_write_slurm_config,
            self._slurmd.on.slurmd_departed: self._on_write_slurm_config,
            self._slurmrestd.on.slurmrestd_available:
            self._on_slurmrestd_available,
            self._slurmrestd.on.slurmrestd_unavailable:
            self._on_write_slurm_config,
            self._slurmctld_peer.on.slurmctld_peer_available:
            self._on_write_slurm_config,
            # NOTE: a second slurmctld should get the jwt/munge keys and
            # configure them
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_fluentbit_relation_created,
            # Addons lifecycle events
            self._prolog_epilog.on.prolog_epilog_available:
            self._on_write_slurm_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
            self._on_write_slurm_config,
            self._grafana.on.grafana_available: self._on_grafana_available,
            self._influxdb.on.influxdb_available: self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
            self._on_write_slurm_config,
            self._elasticsearch.on.elasticsearch_available:
            self._on_elasticsearch_available,
            self._elasticsearch.on.elasticsearch_unavailable:
            self._on_write_slurm_config,
            self._user_group.on.create_user_group: self._on_create_user_group,
            self._user_group.on.remove_user_group: self._on_remove_user_group,
            # actions
            self.on.show_current_config_action: self._on_show_current_config,
            self.on.drain_action: self._drain_nodes_action,
            self.on.resume_action: self._resume_nodes_action,
            self.on.influxdb_info_action: self._influxdb_info_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    @property
    def hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    @property
    def port(self):
        """Return the port."""
        return self._slurm_manager.port

    @property
    def cluster_name(self) -> str:
        """Return the cluster name."""
        return self.config.get("cluster-name")

    @property
    def _slurmctld_info(self):
        return self._slurmctld_peer.get_slurmctld_info()

    @property
    def slurmdbd_info(self):
        """Return slurmdbd_info from relation."""
        return self._slurmdbd.get_slurmdbd_info()

    @property
    def _slurmd_info(self) -> list:
        return self._slurmd.get_slurmd_info()

    @property
    def _cluster_info(self):
        """Assemble information about the cluster."""
        cluster_info = {}
        cluster_info['cluster_name'] = self.config.get('cluster-name')
        cluster_info['custom_config'] = self.config.get('custom-config')
        cluster_info['proctrack_type'] = self.config.get('proctrack-type')
        cluster_info['cgroup_config'] = self.config.get('cgroup-config')

        interval = self.config.get('health-check-interval')
        state = self.config.get('health-check-state')
        nhc = self._slurm_manager.slurm_config_nhc_values(interval, state)
        cluster_info.update(nhc)

        return cluster_info

    @property
    def _addons_info(self):
        """Assemble addons for slurm.conf."""
        return {
            **self._assemble_prolog_epilog(),
            **self._assemble_acct_gather_addon(),
            **self._assemble_elastic_search_addon(),
        }

    def _assemble_prolog_epilog(self) -> dict:
        """Generate the prolog_epilog section of the addons."""
        logger.debug("## Generating prolog epilog configuration")

        prolog_epilog = self._prolog_epilog.get_prolog_epilog()

        if prolog_epilog:
            return {"prolog_epilog": prolog_epilog}
        else:
            return {}

    def _assemble_acct_gather_addon(self):
        """Generate the acct gather section of the addons."""
        logger.debug("## Generating acct gather configuration")

        addons = dict()

        influxdb_info = self._get_influxdb_info()
        if influxdb_info:
            addons["acct_gather"] = influxdb_info
            addons["acct_gather"]["default"] = "all"
            addons["acct_gather_profile"] = "acct_gather_profile/influxdb"

        # It is possible to set up influxdb or hdf5 profiles without the
        # relation, using the custom-config section of slurm.conf. We need
        # to support setting up the acct_gather configuration for this
        # scenario.
        acct_gather_custom = self.config.get("acct-gather-custom")
        if acct_gather_custom:
            if not addons.get("acct_gather"):
                addons["acct_gather"] = dict()

            addons["acct_gather"]["custom"] = acct_gather_custom

        addons["acct_gather_frequency"] = self.config.get(
            "acct-gather-frequency")

        return addons

    def _assemble_elastic_search_addon(self):
        """Generate the elasticsearch section of the addons."""
        logger.debug("## Generating elastic search addon configuration")

        addon = dict()

        elasticsearch_ingress = self._elasticsearch.elasticsearch_ingress
        if elasticsearch_ingress:
            suffix = f"/{self.cluster_name}/jobcomp"
            addon = {
                "elasticsearch_address": f"{elasticsearch_ingress}{suffix}",
            }

        return addon

    def set_slurmd_available(self, flag: bool):
        """Set stored value of slurmd available."""
        self._stored.slurmd_available = flag

    def _set_slurmdbd_available(self, flag: bool):
        """Set stored value of slurmdbd available."""
        self._stored.slurmdbd_available = flag

    def set_slurmrestd_available(self, flag: bool):
        """Set stored value of slurmrestd available."""
        self._stored.slurmrestd_available = flag

    def _is_leader(self):
        return self.model.unit.is_leader()

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def _on_show_current_config(self, event):
        """Show current slurm.conf."""
        slurm_conf = self._slurm_manager.get_slurm_conf()
        event.set_results({"slurm.conf": slurm_conf})

    def _on_install(self, event):
        """Perform installation operations for slurmctld."""
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmctld")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if successful_installation:
            self._stored.slurm_installed = True

            # Store the munge_key and jwt_rsa key in the stored state.
            # NOTE: Use leadership settings instead of stored state when
            # leadership settings support becomes available in the framework.
            if self._is_leader():
                # NOTE: The backup controller should also have the jwt and
                # munge keys configured. We should move this information to
                # the peer relation.
                self._stored.jwt_rsa = self._slurm_manager.generate_jwt_rsa()
                self._stored.munge_key = self._slurm_manager.get_munge_key()
                self._slurm_manager.configure_jwt_rsa(self.get_jwt_rsa())
            else:
                # NOTE: the secondary slurmctld should get the jwt and munge
                # keys from the peer relation here.
                logger.debug("secondary slurmctld")

            # All slurmctld units should restart munged here to ensure
            # munge is working.
            self._slurm_manager.restart_munged()
        else:
            self.unit.status = BlockedStatus("Error installing slurmctld")
            event.defer()

        logger.debug("## Retrieving etcd resource to install it")
        try:
            etcd_path = self.model.resources.fetch("etcd")
            logger.debug(f"## Found etcd resource: {etcd_path}")
        except ModelError:
            logger.error("## Missing etcd resource")
            self.unit.status = BlockedStatus("Missing etcd resource")
            event.defer()
            return
        self._etcd.install(etcd_path)

        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
        logger.debug("## slurmctld - leader elected")
        self._etcd.start()

        # Populate etcd with the nodelist.
        slurm_config = self._assemble_slurm_config()
        accounted_nodes = self._assemble_all_nodes(
            slurm_config.get("partitions", []))
        logger.debug(
            f"## Sending to etcd list of accounted nodes: {accounted_nodes}")
        self._etcd.set_list_of_accounted_nodes(accounted_nodes)

    def _check_status(self):
        """Check for all relations and set appropriate status.

        This charm needs these conditions to be satisfied in order to be
        ready:
        - Slurm components installed.
        - Munge running.
        - slurmdbd node running.
        - slurmd inventory.
        """
        # NOTE: slurmd and slurmrestd are not needed for slurmctld to work,
        # only for the cluster to operate. But we need the slurmd inventory
        # to assemble slurm.conf.
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmctld")
            return False

        if self._is_leader() and not self._etcd.is_active():
            self.unit.status = WaitingStatus("Initializing charm")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = BlockedStatus("Error configuring munge key")
            return False

        # Statuses of mandatory components:
        # - joined: someone executed `juju relate slurmctld foo`
        # - available: the units exchanged data through the relation
        # NOTE: slurmrestd is not mandatory for the cluster to work, that's
        # why it is not accounted for in here.
        statuses = {
            "slurmd": {
                "available": self._stored.slurmd_available,
                "joined": self._slurmd.is_joined,
            },
            "slurmdbd": {
                "available": self._stored.slurmdbd_available,
                "joined": self._slurmdbd.is_joined,
            },
        }

        relations_needed = list()
        waiting_on = list()
        for component in statuses.keys():
            if not statuses[component]["joined"]:
                relations_needed.append(component)
            if not statuses[component]["available"]:
                waiting_on.append(component)

        if len(relations_needed):
            msg = f"Need relations: {','.join(relations_needed)}"
            self.unit.status = BlockedStatus(msg)
            return False

        if len(waiting_on):
            msg = f"Waiting on: {','.join(waiting_on)}"
            self.unit.status = WaitingStatus(msg)
            return False

        self.unit.status = ActiveStatus("slurmctld available")
        return True

    def get_munge_key(self):
        """Get the stored munge key."""
        return self._stored.munge_key

    def get_jwt_rsa(self):
        """Get the stored jwt_rsa key."""
        return self._stored.jwt_rsa

    def _assemble_partitions(self, slurmd_info):
        """Make any needed modifications to partition data."""
        slurmd_info_tmp = copy.deepcopy(slurmd_info)
        default_partition_from_config = self.config.get("default-partition")

        for partition in slurmd_info:
            # Deep copy the partition to a tmp var so we can modify it as
            # needed whilst not modifying the object we are iterating over.
            partition_tmp = copy.deepcopy(partition)

            # Extract the partition_name from the partition.
            partition_name = partition["partition_name"]

            # If the user provided a default partition in the charm config,
            # mark the matching partition as the default.
            if default_partition_from_config:
                if default_partition_from_config == partition_name:
                    partition_tmp["partition_default"] = "YES"

            slurmd_info_tmp.remove(partition)
            slurmd_info_tmp.append(partition_tmp)

        return slurmd_info_tmp

    def _assemble_slurm_config(self):
        """Assemble and return the slurm config."""
        logger.debug('## Assembling new slurm.conf')

        slurmctld_info = self._slurmctld_info
        slurmdbd_info = self.slurmdbd_info
        slurmd_info = self._slurmd_info
        cluster_info = self._cluster_info

        logger.debug("######## INFO")
        logger.debug(f'## slurmd: {slurmd_info}')
        logger.debug(f'## slurmctld_info: {slurmctld_info}')
        logger.debug(f'## slurmdbd_info: {slurmdbd_info}')
        logger.debug(f'## cluster_info: {cluster_info}')
        logger.debug("######## INFO - end")

        if not (slurmctld_info and slurmd_info and slurmdbd_info):
            return {}

        addons_info = self._addons_info
        partitions_info = self._assemble_partitions(slurmd_info)
        down_nodes = self._assemble_down_nodes(slurmd_info)

        logger.debug(f'#### addons: {addons_info}')
        logger.debug(f'#### partitions_info: {partitions_info}')
        logger.debug(f"#### Down nodes: {down_nodes}")

        return {
            "partitions": partitions_info,
            "down_nodes": down_nodes,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **cluster_info,
        }

    def _on_slurmrestd_available(self, event):
        """Set slurm_config on the relation when slurmrestd available."""
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - deferring event.")
            event.defer()
            return

        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config)
            self._slurmrestd.restart_slurmrestd()

    def _on_slurmdbd_available(self, event):
        self._set_slurmdbd_available(True)
        self._on_write_slurm_config(event)

    def _on_slurmdbd_unavailable(self, event):
        self._set_slurmdbd_available(False)
        self._check_status()

    def _on_write_slurm_config(self, event):
        """Check that we have what we need before we proceed."""
        logger.debug("### Slurmctld - _on_write_slurm_config()")

        # Only the leader should write the config, restart, and
        # scontrol reconfigure.
        if not self._is_leader():
            return

        if not self._check_status():
            event.defer()
            return

        slurm_config = self._assemble_slurm_config()
        if slurm_config:
            self._slurm_manager.render_slurm_configs(slurm_config)

            # A restart is needed if nodes are added/removed from the
            # cluster.
            self._slurm_manager.slurm_systemctl('restart')
            self._slurm_manager.slurm_cmd('scontrol', 'reconfigure')

            # Send the list of hostnames to slurmd via etcd.
            accounted_nodes = self._assemble_all_nodes(
                slurm_config["partitions"])
            self._etcd.set_list_of_accounted_nodes(accounted_nodes)

            # Send the custom NHC parameters to all slurmd.
            self._slurmd.set_nhc_params(
                self.config.get('health-check-params'))

            # Check for nodes that are no longer new, i.e., nodes that ran
            # the node-configured action. Those nodes are no longer in the
            # DownNodes section of slurm.conf, but we need to resume them
            # manually and update the internal cache.
            down_nodes = slurm_config['down_nodes']
            configured_nodes = self._assemble_configured_nodes(down_nodes)
            logger.debug(f"### configured nodes: {configured_nodes}")
            self._resume_nodes(configured_nodes)
            self._stored.down_nodes = down_nodes.copy()

            # slurmrestd needs the slurm.conf file, so send it every time
            # it changes.
            if self._stored.slurmrestd_available:
                self._slurmrestd.set_slurm_config_on_app_relation_data(
                    slurm_config)
                # NOTE: scontrol reconfigure does not restart slurmrestd.
                self._slurmrestd.restart_slurmrestd()
        else:
            logger.debug("## Should rewrite slurm.conf, but we don't have "
                         "it. Deferring.")
            event.defer()

    @staticmethod
    def _assemble_all_nodes(slurmd_info: list) -> List[str]:
        """Parse slurmd_info and return a list with all hostnames."""
        nodes = list()
        for partition in slurmd_info:
            for node in partition["inventory"]:
                nodes.append(node["node_name"])
        return nodes

    @staticmethod
    def _assemble_down_nodes(slurmd_info):
        """Parse partitions' nodes and assemble a list of DownNodes."""
        down_nodes = []
        for partition in slurmd_info:
            for node in partition["inventory"]:
                if node["new_node"]:
                    down_nodes.append(node["node_name"])
        return down_nodes

    def _assemble_configured_nodes(self, down_nodes):
        """Assemble list of nodes that are not new anymore.

        new_node status is removed with an action; this method returns a
        list of nodes that were previously new but are not anymore.
        """
        configured_nodes = []
        for node in self._stored.down_nodes:
            if node not in down_nodes:
                configured_nodes.append(node)
        return configured_nodes

    def _resume_nodes(self, nodelist):
        """Run scontrol to resume the specified node list."""
        nodes = ",".join(nodelist)
        update_cmd = f"update nodename={nodes} state=resume"
        self._slurm_manager.slurm_cmd('scontrol', update_cmd)

    def _on_grafana_available(self, event):
        """Create the grafana-source if we are the leader and have influxdb."""
        if not self._is_leader():
            return

        influxdb_info = self._get_influxdb_info()
        if influxdb_info:
            self._grafana.set_grafana_source_info(influxdb_info)
        else:
            logger.error(
                "## Cannot set Grafana source: missing influxdb relation")

    def _on_influxdb_available(self, event):
        """Assemble addons to forward slurm data to influxdb."""
        self._on_write_slurm_config(event)

    def _on_elasticsearch_available(self, event):
        """Assemble addons to forward Slurm data to elasticsearch."""
        self._on_write_slurm_config(event)

    def _get_influxdb_info(self) -> dict:
        """Return influxdb info."""
        return self._influxdb.get_influxdb_info()

    def _drain_nodes_action(self, event):
        """Drain specified nodes."""
        nodes = event.params['nodename']
        reason = event.params['reason']

        logger.debug(f'#### Draining {nodes} because {reason}.')
        event.log(f'Draining {nodes} because {reason}.')

        try:
            cmd = f'scontrol update nodename={nodes} state=drain reason="{reason}"'
            subprocess.check_output(shlex.split(cmd))
            event.set_results({'status': 'draining', 'nodes': nodes})
        except subprocess.CalledProcessError as e:
            event.fail(message=f'Error draining {nodes}: {e.output}')

    def _resume_nodes_action(self, event):
        """Resume specified nodes."""
        nodes = event.params['nodename']

        logger.debug(f'#### Resuming {nodes}.')
        event.log(f'Resuming {nodes}.')

        try:
            cmd = f'scontrol update nodename={nodes} state=resume'
            subprocess.check_output(shlex.split(cmd))
            event.set_results({'status': 'resuming', 'nodes': nodes})
        except subprocess.CalledProcessError as e:
            event.fail(message=f'Error resuming {nodes}: {e.output}')

    def _influxdb_info_action(self, event):
        """Return influxdb info as an action result."""
        influxdb_info = self._get_influxdb_info()
        if not influxdb_info:
            influxdb_info = "not related"
        logger.debug(f"## InfluxDB-info action: {influxdb_info}")
        event.set_results({"influxdb": influxdb_info})

    def _on_create_user_group(self, event):
        """Create the user and group provided."""
        user = self._user_group.user_name
        user_uid = self._user_group.user_uid
        group = self._user_group.group_name

        # Create the group, using the UID as the GID.
        try:
            subprocess.check_output(["groupadd", "--gid", user_uid, group])
        except subprocess.CalledProcessError as e:
            if e.returncode == 9:
                logger.warning("## Group already exists.")
            elif e.returncode == 4:
                logger.warning("## GID already exists.")
                self._user_group._relation.data[self._user_group.model.app][
                    "status"] = "failure: GID already exists"
                return
            else:
                logger.error(f"## Error creating group: {e}")

        # Create the user.
        try:
            subprocess.check_output([
                "useradd",
                "--system",
                "--no-create-home",
                "--gid", group,
                "--shell", "/usr/sbin/nologin",
                "-u", user_uid,
                user,
            ])
        except subprocess.CalledProcessError as e:
            if e.returncode == 9:
                logger.warning("## User already exists.")
            elif e.returncode == 4:
                logger.warning("## UID already exists.")
                self._user_group._relation.data[self._user_group.model.app][
                    "status"] = "failure: UID already exists"
                return
            else:
                logger.error(f"## Error creating user: {e}")

        self._user_group._relation.data[
            self._user_group.model.app]["status"] = "success: User created"

    def _on_remove_user_group(self, event):
        """Remove the user and group provided."""
        user = self._user_group.user_name
        group = self._user_group.group_name

        # Remove the user.
        try:
            subprocess.check_output(["userdel", user])
        except subprocess.CalledProcessError as e:
            logger.error(f"## Error deleting user: {e}")

        # Remove the group.
        try:
            subprocess.check_output(["groupdel", group])
        except subprocess.CalledProcessError as e:
            logger.error(f"## Error deleting group: {e}")
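# ---------------------------------------------------------------------------
# Editor's sketch (standalone, node names made up): the drain and resume
# actions above shell out to scontrol via shlex.split, so the quoted reason
# string survives as a single argument.
# ---------------------------------------------------------------------------

import shlex

_cmd = 'scontrol update nodename=node-[1-2] state=drain reason="maintenance window"'
print(shlex.split(_cmd))
# ['scontrol', 'update', 'nodename=node-[1-2]', 'state=drain',
#  'reason=maintenance window']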
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(
            db_info=dict(),
            slurmdbd_config=dict(),
            munge_key_available=False,
            slurm_installed=False,
        )

        self._db = MySQLClient(self, "db")
        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurm_configurator_unavailable,
            self._slurmdbd.on.munge_key_available:
            self._on_munge_key_available,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install(self.config["snapstore-channel"])
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm snap successfully installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event."""
        self._slurm_manager.upgrade()

    def _on_munge_key_available(self, event):
        if not self._stored.slurm_installed:
            event.defer()
            return

        munge_key = self._slurmdbd.get_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_slurm_configurator_unavailable(self, event):
        self._stored.munge_key_available = False
        self._check_status()

    def _check_status(self) -> bool:
        """Check that we have the things we need."""
        db_info = self._stored.db_info
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        deps = [
            slurmdbd_info,
            db_info,
            slurm_installed,
            munge_key_available,
        ]

        if not all(deps):
            if not db_info:
                self.unit.status = BlockedStatus("Need relation to MySQL.")
            elif not munge_key_available:
                self.unit.status = BlockedStatus(
                    "Need relation to slurm-configurator.")
            return False
        return True

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        # Ensure all pre-conditions are met with _check_status(); if not,
        # defer the event.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()
        slurmdbd_stored_config = dict(self._stored.slurmdbd_config)

        slurmdbd_config = {
            **self.config,
            **slurmdbd_info,
            **db_info,
        }

        if slurmdbd_config != slurmdbd_stored_config:
            self._stored.slurmdbd_config = slurmdbd_config
            self._slurm_manager.render_slurm_configs(slurmdbd_config)
            self._slurm_manager.restart_slurm_component()

            # Only the leader can set relation data on the application.
            # Enforce that no one other than the leader tries to set
            # application relation data.
            if self.model.unit.is_leader():
                self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                    slurmdbd_config)

        self.unit.status = ActiveStatus("slurmdbd available")

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info
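# ---------------------------------------------------------------------------
# Editor's sketch: the guard in _write_config_and_restart_slurmdbd above
# only renders and restarts when the freshly assembled config differs from
# the copy kept in stored state. A plain-dict analogue of that
# change-detection pattern; the callables are hypothetical stand-ins.
# ---------------------------------------------------------------------------

def _write_if_changed(assembled: dict, stored: dict, render, restart) -> bool:
    """Render and restart only on change; return True if work was done."""
    if assembled != stored:
        stored.clear()
        stored.update(assembled)
        render(assembled)
        restart()
        return True
    return False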
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()
    on = SlurmdbdCharmEvents()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(
            db_info=dict(),
            jwt_available=False,
            munge_available=False,
            slurm_installed=False,
            cluster_name=str(),
        )

        self._db = MySQLClient(self, "db")
        # self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self.on.jwt_available: self._on_jwt_available,
            self.on.munge_available: self._on_munge_available,
            self.on.write_config: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._db.on.database_unavailable: self._on_db_unavailable,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmctld_available:
            self._on_slurmctld_available,
            self._slurmdbd.on.slurmctld_unavailable:
            self._on_slurmctld_unavailable,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_fluentbit_relation_created,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmdbd."""
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmdbd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if successful_installation:
            self._stored.slurm_installed = True
            self.unit.status = ActiveStatus("slurmdbd successfully installed")
        else:
            self.unit.status = BlockedStatus("Error installing slurmdbd")
            event.defer()
            return

        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_jwt_available(self, event):
        """Retrieve and configure the jwt_rsa key."""
        # The jwt rsa key lives in the slurm spool dir; it is created when
        # slurm is installed.
        if not self._stored.slurm_installed:
            event.defer()
            return

        jwt_rsa = self._slurmdbd.get_jwt_rsa()
        self._slurm_manager.configure_jwt_rsa(jwt_rsa)
        self._stored.jwt_available = True

    def _on_munge_available(self, event):
        """Retrieve the munge key and start munged."""
        # Munge is installed together with slurm.
        if not self._stored.slurm_installed:
            event.defer()
            return

        munge_key = self._slurmdbd.get_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)

        if self._slurm_manager.restart_munged():
            logger.debug("## Munge restarted successfully")
            self._stored.munge_available = True
        else:
            logger.error("## Unable to restart munge")
            self.unit.status = BlockedStatus("Error restarting munge")
            event.defer()

    def _on_db_unavailable(self, event):
        self._stored.db_info = dict()
        # TODO: tell slurmctld that slurmdbd left?
        self._check_status()

    def _on_slurmctld_available(self, event):
        self.on.jwt_available.emit()
        self.on.munge_available.emit()
        self.on.write_config.emit()
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

    def _on_slurmctld_unavailable(self, event):
        """Reset state and charm status when slurmctld broken."""
        self._stored.jwt_available = False
        self._stored.munge_available = False
        self._check_status()

    def _is_leader(self):
        return self.model.unit.is_leader()

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        # Ensure all pre-conditions are met with _check_status(); if not,
        # defer the event.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        # Settings from config.yaml.
        config = {"slurmdbd_debug": self.config.get("slurmdbd-debug")}

        slurmdbd_config = {
            **config,
            **slurmdbd_info,
            **db_info,
        }

        self._slurm_manager.slurm_systemctl("stop")
        self._slurm_manager.render_slurm_configs(slurmdbd_config)

        # At this point, we must guarantee that slurmdbd is correctly
        # initialized. Its startup might take a while, so we have to wait
        # for it.
        self._check_slurmdbd()

        # Only the leader can set relation data on the application.
        # Enforce that no one other than the leader tries to set
        # application relation data.
        if self.model.unit.is_leader():
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                slurmdbd_config)

        self._check_status()

    def _check_slurmdbd(self, max_attempts=3) -> None:
        """Ensure slurmdbd is up and running."""
        logger.debug("## Checking if slurmdbd is active")

        for i in range(max_attempts):
            if self._slurm_manager.slurm_is_active():
                logger.debug("## Slurmdbd running")
                break
            else:
                logger.warning("## Slurmdbd not running, trying to start it")
                self.unit.status = WaitingStatus("Starting slurmdbd")
                self._slurm_manager.restart_slurm_component()
                sleep(1 + i)

        if self._slurm_manager.slurm_is_active():
            self._check_status()
        else:
            self.unit.status = BlockedStatus("Cannot start slurmdbd")

    def _check_status(self) -> bool:
        """Check that we have the things we need."""
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        slurm_installed = self._stored.slurm_installed
        if not slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurm")
            return False

        # We must be sure to initialize the charms correctly: slurmdbd must
        # first connect to the db to be able to connect to slurmctld
        # correctly.
        slurmctld_available = (self._stored.jwt_available
                               and self._stored.munge_available)
        statuses = {
            "MySQL": {
                "available": self._stored.db_info != dict(),
                "joined": self._db.is_joined,
            },
            "slurmctld": {
                "available": slurmctld_available,
                "joined": self._slurmdbd.is_joined,
            },
        }

        relations_needed = list()
        waiting_on = list()
        for component in statuses.keys():
            if not statuses[component]["joined"]:
                relations_needed.append(component)
            if not statuses[component]["available"]:
                waiting_on.append(component)

        if len(relations_needed):
            msg = f"Need relations: {','.join(relations_needed)}"
            self.unit.status = BlockedStatus(msg)
            return False

        if len(waiting_on):
            msg = f"Waiting on: {','.join(waiting_on)}"
            self.unit.status = WaitingStatus(msg)
            return False

        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()
        if not slurmdbd_info:
            self.unit.status = WaitingStatus("slurmdbd starting")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = WaitingStatus("munged starting")
            return False

        self.unit.status = ActiveStatus("slurmdbd available")
        return True

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name
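# ---------------------------------------------------------------------------
# Editor's sketch of the bounded-retry loop in _check_slurmdbd above: poll
# an "is active" predicate, restarting the daemon and backing off a little
# longer between attempts. The callables are hypothetical stand-ins for the
# slurm-ops-manager methods.
# ---------------------------------------------------------------------------

from time import sleep

def _wait_until_active(is_active, restart, max_attempts=3) -> bool:
    for i in range(max_attempts):
        if is_active():
            return True
        restart()
        sleep(1 + i)  # back off a bit longer each attempt
    return is_active()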
class SlurmConfiguratorCharm(CharmBase): """Facilitate slurm configuration operations.""" _stored = StoredState() def __init__(self, *args): """Init charm, _stored defaults, interfaces and observe events.""" super().__init__(*args) self._stored.set_default( munge_key=str(), override_slurm_conf=None, slurm_installed=False, slurmd_restarted=False, slurmctld_available=False, slurmdbd_available=False, slurmd_available=False, slurmrestd_available=False, ) self._elasticsearch = Elasticsearch(self, "elasticsearch") self._grafana = GrafanaSource(self, "grafana-source") self._influxdb = InfluxDB(self, "influxdb-api") self._nhc = Nhc(self, "nhc") self._slurmrestd = Slurmrestd(self, "slurmrestd") self._slurm_manager = SlurmManager(self, "slurmd") self._slurmctld = Slurmctld(self, "slurmctld") self._slurmdbd = Slurmdbd(self, "slurmdbd") self._slurmd = Slurmd(self, "slurmd") self._prolog_epilog = PrologEpilog(self, "prolog-epilog") # #### Charm lifecycle events #### # event_handler_bindings = { # #### Juju lifecycle events #### # self.on.install: self._on_install, self.on.config_changed: self._on_check_status_and_write_config, self.on.upgrade_charm: self._on_upgrade, # ######## Addons lifecycle events ######## # self._elasticsearch.on.elasticsearch_available: self._on_check_status_and_write_config, self._elasticsearch.on.elasticsearch_unavailable: self._on_check_status_and_write_config, self._grafana.on.grafana_available: self._on_grafana_available, self._influxdb.on.influxdb_available: self._on_influxdb_available, self._influxdb.on.influxdb_unavailable: self._on_check_status_and_write_config, self._nhc.on.nhc_bin_available: self._on_check_status_and_write_config, # ######## Slurm component lifecycle events ######## # self._slurmctld.on.slurmctld_available: self._on_check_status_and_write_config, self._slurmctld.on.slurmctld_unavailable: self._on_check_status_and_write_config, self._slurmdbd.on.slurmdbd_available: self._on_check_status_and_write_config, self._slurmdbd.on.slurmdbd_unavailable: self._on_check_status_and_write_config, self._slurmd.on.slurmd_available: self._on_check_status_and_write_config, self._slurmd.on.slurmd_unavailable: self._on_check_status_and_write_config, self._slurmrestd.on.slurmrestd_available: self._on_slurmrestd_available, self._slurmrestd.on.slurmrestd_unavailable: self._on_check_status_and_write_config, self._prolog_epilog.on.prolog_epilog_available: self._on_check_status_and_write_config, self._prolog_epilog.on.prolog_epilog_unavailable: self._on_check_status_and_write_config, # Actions self.on.scontrol_reconfigure_action: self._on_scontrol_reconfigure, self.on.get_slurm_conf_action: self._on_get_slurm_conf, self.on.set_slurm_conf_action: self._on_set_slurm_conf, } for event, handler in event_handler_bindings.items(): self.framework.observe(event, handler) def _on_scontrol_reconfigure(self, event): """Run 'scontrol reconfigure' on slurmctld.""" self._slurmctld.scontrol_reconfigure() def _on_get_slurm_conf(self, event): """Return the slurm.conf.""" # Determine if we have an override config. override_slurm_conf = self._stored.override_slurm_conf if override_slurm_conf: slurm_conf = override_slurm_conf else: slurm_conf = self._slurm_manager.get_slurm_conf() # Return the slurm.conf as the result of the action. 
event.set_results({"slurm.conf": slurm_conf}) def _on_set_slurm_conf(self, event): """Set the override slurm.conf.""" self._stored.override_slurm_conf = event.params["slurm-conf"] def _on_install(self, event): """Install the slurm snap and capture the munge key.""" self._slurm_manager.install(self.config["snapstore-channel"]) self._stored.munge_key = self._slurm_manager.get_munge_key() self._stored.slurm_installed = True self.unit.status = ActiveStatus("slurm installed") def _on_upgrade(self, event): """Upgrade the charm.""" slurm_config = \ self._stored.override_slurm_conf or self._assemble_slurm_config() if not slurm_config: self.unit.status = BlockedStatus( "Cannot generate slurm_config, defering upgrade.") event.defer() return self._slurm_manager.upgrade(slurm_config, self.config["snapstore-channel"]) def _on_grafana_available(self, event): """Create the grafana-source if we are the leader and have influxdb.""" leader = self._is_leader() influxdb_info = self._get_influxdb_info() grafana = self._grafana if leader and influxdb_info: grafana.set_grafana_source_info(influxdb_info) def _on_influxdb_available(self, event): """Create the grafana-source if we have all the things.""" grafana = self._grafana influxdb_info = self._get_influxdb_info() leader = self._is_leader() if leader and grafana.is_joined and influxdb_info: grafana.set_grafana_source_info(influxdb_info) self._on_check_status_and_write_config(event) def _on_slurmrestd_available(self, event): """Set slurm_config on the relation when slurmrestd available.""" if not self._check_status(): event.defer() return # Generate the slurm_config slurm_config = self._assemble_slurm_config() if not slurm_config: self.unit.status = BlockedStatus( "Cannot generate slurm_config - defering event.") event.defer() return if self._stored.slurmrestd_available: self._slurmrestd.set_slurm_config_on_app_relation_data( slurm_config, ) self._slurmrestd.restart_slurmrestd() def _on_check_status_and_write_config(self, event): """Check that we have what we need before we proceed.""" if not self._check_status(): event.defer() return # Generate the slurm_config slurm_config = self._assemble_slurm_config() if not slurm_config: self.unit.status = BlockedStatus( "Cannot generate slurm_config - defering event.") event.defer() return self._slurmctld.set_slurm_config_on_app_relation_data(slurm_config, ) self._slurmctld.restart_slurmctld() self._slurmd.set_slurm_config_on_app_relation_data(slurm_config, ) if self._stored.slurmrestd_available: self._slurmrestd.set_slurm_config_on_app_relation_data( slurm_config, ) self._slurmrestd.restart_slurmrestd() self._slurm_manager.render_slurm_configs(slurm_config) if not self._stored.slurmd_restarted: self._slurm_manager.restart_slurm_component() self._stored.slurmd_restarted = True self._slurmctld.scontrol_reconfigure() def _assemble_slurm_config(self): """Assemble and return the slurm config.""" slurmctld_info = self._slurmctld.get_slurmctld_info() slurmdbd_info = self._slurmdbd.get_slurmdbd_info() slurmd_info = self._slurmd.get_slurmd_info() if not (slurmd_info and slurmctld_info and slurmdbd_info): return {} addons_info = self._assemble_addons() partitions_info = self._assemble_partitions(slurmd_info) logger.debug(addons_info) logger.debug(partitions_info) logger.debug(slurmctld_info) logger.debug(slurmdbd_info) return { "partitions": partitions_info, **slurmctld_info, **slurmdbd_info, **addons_info, **self.config, } def _assemble_partitions(self, slurmd_info): """Make any needed modifications to partition data.""" 
slurmd_info_tmp = copy.deepcopy(slurmd_info) default_partition_from_config = self.config.get("default_partition") for partition in slurmd_info: # Deep copy the partition to a tmp var so we can modify it as # needed whilst not modifying the object we are iterating over. partition_tmp = copy.deepcopy(partition) # Extract the partition_name from the partition. partition_name = partition["partition_name"] # Check that the default_partition isn't defined in the charm # config. # If the user hasn't provided a default partition, then we infer # the partition_default by defaulting to the "configurator" # partition. if not default_partition_from_config: if partition["partition_name"] == "configurator": partition_tmp["partition_default"] = "YES" else: if default_partition_from_config == partition_name: partition_tmp["partition_default"] = "YES" slurmd_info_tmp.remove(partition) slurmd_info_tmp.append(partition_tmp) return slurmd_info_tmp def _assemble_addons(self): """Assemble any addon components.""" acct_gather = self._get_influxdb_info() elasticsearch_ingress = self._elasticsearch.get_elasticsearch_ingress() nhc_info = self._nhc.get_nhc_info() prolog_epilog = self._prolog_epilog.get_prolog_epilog() ctxt = dict() if prolog_epilog: ctxt["prolog_epilog"] = prolog_epilog if acct_gather: ctxt["acct_gather"] = acct_gather acct_gather_custom = self.config.get("acct_gather_custom") if acct_gather_custom: ctxt["acct_gather"]["custom"] = acct_gather_custom if nhc_info: ctxt["nhc"] = { "nhc_bin": nhc_info["nhc_bin"], "health_check_interval": nhc_info["health_check_interval"], "health_check_node_state": nhc_info["health_check_node_state"], } if elasticsearch_ingress: ctxt["elasticsearch_address"] = elasticsearch_ingress return ctxt def _check_status(self): """Check that the core components we need exist.""" slurm_component_statuses = { "slurmctld": { "available": self._stored.slurmctld_available, "joined": self._slurmctld.is_joined, }, "slurmd": { "available": self._stored.slurmd_available, "joined": self._slurmd.is_joined, }, "slurmdbd": { "available": self._stored.slurmdbd_available, "joined": self._slurmdbd.is_joined, }, } relations_needed = [] waiting_on = [] msg = str() for slurm_component in slurm_component_statuses.keys(): if not slurm_component_statuses[slurm_component]["joined"]: relations_needed.append(slurm_component) elif not slurm_component_statuses[slurm_component]["available"]: waiting_on.append(slurm_component) relations_needed_len = len(relations_needed) waiting_on_len = len(waiting_on) if relations_needed_len > 0: msg += f"Needed relations: {','.join(relations_needed)} " if waiting_on_len > 0: msg += f"Waiting on: {','.join(waiting_on)}" # Using what we have gathered about the status of each slurm component, # determine the application status. 
if relations_needed_len > 0: self.unit.status = BlockedStatus(msg) elif waiting_on_len > 0: self.unit.status = WaitingStatus(msg) else: self.unit.status = ActiveStatus("slurm-configurator available") return True return False def _get_influxdb_info(self): """Return influxdb info.""" return self._influxdb.get_influxdb_info() def _is_leader(self): return self.model.unit.is_leader() def get_munge_key(self): """Return the slurmdbd_info from stored state.""" return self._stored.munge_key def is_slurm_installed(self): """Return true/false based on whether or not slurm is installed.""" return self._stored.slurm_installed def set_slurmctld_available(self, slurmctld_available): """Set slurmctld_available.""" self._stored.slurmctld_available = slurmctld_available def set_slurmdbd_available(self, slurmdbd_available): """Set slurmdbd_available.""" self._stored.slurmdbd_available = slurmdbd_available def set_slurmd_available(self, slurmd_available): """Set slurmd_available.""" self._stored.slurmd_available = slurmd_available def set_slurmrestd_available(self, slurmrestd_available): """Set slurmrestd_available.""" self._stored.slurmrestd_available = slurmrestd_available
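

# A minimal, standalone sketch (not part of the charm) of the
# default-partition rule implemented in _assemble_partitions above, so the
# behaviour can be exercised without a charm harness. The function name and
# sample data below are hypothetical; the partition dict shape is assumed
# from the loop body.
import copy


def pick_default_partition(partitions, default_from_config=None):
    """Return a copy of `partitions` with the default partition marked."""
    result = []
    for partition in partitions:
        partition = copy.deepcopy(partition)
        # Mirror the rule above: fall back to the "configurator" partition
        # when the charm config does not name a default.
        wanted = default_from_config or "configurator"
        if partition["partition_name"] == wanted:
            partition["partition_default"] = "YES"
        result.append(partition)
    return result


# With no config override, "configurator" becomes the default partition.
assert pick_default_partition(
    [{"partition_name": "compute"}, {"partition_name": "configurator"}]
)[1]["partition_default"] == "YES"
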
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(munge_key=str())
        self._stored.set_default(db_info=dict())
        self._stored.set_default(slurm_installed=False)

        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")
        self._db = MySQLClient(self, "db")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurmdbd_unavailable,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event."""
        self._slurm_manager.upgrade()

    def _on_leader_elected(self, event):
        self._slurmdbd_peer._on_relation_changed(event)

    def _on_slurmdbd_unavailable(self, event):
        self._check_status()

    def _check_status(self) -> bool:
        """Check that we have the things we need."""
        db_info = self._stored.db_info
        munge_key = self._stored.munge_key
        slurm_installed = self._stored.slurm_installed
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        deps = [
            slurmdbd_info,
            db_info,
            slurm_installed,
            munge_key,
        ]

        if not all(deps):
            if not db_info:
                self.unit.status = BlockedStatus("Need relation to MySQL.")
            elif not munge_key:
                self.unit.status = BlockedStatus(
                    "Need relation to slurm-configurator.")
            return False
        return True

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        slurmdbd_config = {
            'munge_key': self._stored.munge_key,
            **self.model.config,
            **slurmdbd_info,
            **db_info,
        }

        if self.model.unit.is_leader():
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                slurmdbd_info)

        self._slurm_manager.render_config_and_restart(slurmdbd_config)
        self.unit.status = ActiveStatus("Slurmdbd Available")

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def set_munge_key(self, munge_key):
        """Set the munge key in the stored state."""
        self._stored.munge_key = munge_key

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info
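

# The slurmdbd_config assembly above relies on dict-unpacking precedence:
# when mappings are merged with **, keys from later mappings override
# earlier ones, so relation-provided values win over the charm config. A
# tiny standalone illustration with hypothetical keys and values:
config = {"port": "6819", "log_level": "info"}
peer_info = {"hostname": "slurmdbd-0", "port": "7819"}
database = {"db_hostname": "10.0.0.5"}

merged = {"munge_key": "<redacted>", **config, **peer_info, **database}
assert merged["port"] == "7819"  # peer_info overrides the charm config
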
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(munge_key=str())
        self._stored.set_default(db_info=dict())
        self._stored.set_default(slurm_installed=False)

        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")
        self._db = MySQLClient(self, "db")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurmdbd_unavailable,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event."""
        self._slurm_manager.upgrade()

    def _on_leader_elected(self, event):
        self._slurmdbd_peer._on_relation_changed(event)

    def _on_slurmdbd_unavailable(self, event):
        self._check_status()

    def _check_status(self) -> bool:
        """Check that we have the things we need."""
        db_info = self._stored.db_info
        munge_key = self._stored.munge_key
        slurm_installed = self._stored.slurm_installed
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        deps = [
            slurmdbd_info,
            db_info,
            slurm_installed,
            munge_key,
        ]

        if not all(deps):
            if not db_info:
                self.unit.status = BlockedStatus("Need relation to MySQL.")
            elif not munge_key:
                self.unit.status = BlockedStatus(
                    "Need relation to slurm-configurator.")
            return False
        return True

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        # Ensure all pre-conditions are met with _check_status(); if not,
        # defer the event.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        slurmdbd_config = {
            'munge_key': self._stored.munge_key,
            **self.model.config,
            **slurmdbd_info,
            **db_info,
        }
        self._slurm_manager.render_config_and_restart(slurmdbd_config)
        logger.debug("rendering config and restarting")

        # Only the leader can set relation data on the application.
        # Enforce that no unit other than the leader tries to set
        # application relation data.
        if self.model.unit.is_leader():
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data({
                # Juju, and subsequently the operator framework, do not
                # emit relation-changed events if the data hasn't actually
                # changed on the other side of the relation. Setting the
                # data multiple times means nothing unless the data being
                # set differs from what already exists in the relation
                # data.
                #
                # We use 'slurmdbd_info_id' to ensure the slurmdbd_info
                # is unique each time it is set on the application
                # relation data. This is needed so that related
                # applications (namely slurm-configurator) will observe
                # a relation-changed event.
                #
                # This handler (_write_config_and_restart_slurmdbd) may be
                # invoked multiple times once _check_status() returns True
                # (i.e. once the pre-conditions are met that account for
                # the deferred invocations).
                # This means that the same slurmdbd_info data may be set
                # on application data multiple times, and slurmdbd may be
                # reconfigured and restarted while slurmctld and the rest
                # of the stack are trying to come up and create the
                # cluster.
                #
                # We need slurm-configurator to emit the relation-changed
                # event for the slurmdbd relation every time data is set,
                # not just when the data has changed. slurm-configurator
                # needs to re-emit its chain of observed events to ensure
                # all services end up getting reconfigured *and* restarted
                # *after* slurmdbd, each time slurmdbd gets reconfigured
                # and restarted.
                #
                # For this reason, 'slurmdbd_info_id' only matters in the
                # context of making sure the application relation data
                # actually changes so that a relation-changed event is
                # observed on the other side.
                'slurmdbd_info_id': str(uuid.uuid4()),
                **slurmdbd_info,
            })

        self.unit.status = ActiveStatus("Slurmdbd Available")

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def set_munge_key(self, munge_key):
        """Set the munge key in the stored state."""
        self._stored.munge_key = munge_key

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info
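

# The comment block above describes a "nonce" pattern: Juju only emits
# relation-changed when the application relation data actually differs, so
# a fresh UUID is mixed into every write to force the event. Below is a
# minimal sketch of that pattern against a plain dict standing in for the
# relation data bag; set_app_relation_data and the sample payload are
# hypothetical.
import uuid


def set_app_relation_data(app_data, slurmdbd_info):
    """Write slurmdbd_info plus a unique id so the data always changes."""
    app_data.update({
        "slurmdbd_info_id": str(uuid.uuid4()),  # guarantees a diff
        **slurmdbd_info,
    })


bag = {}
set_app_relation_data(bag, {"hostname": "slurmdbd-0"})
first_id = bag["slurmdbd_info_id"]
set_app_relation_data(bag, {"hostname": "slurmdbd-0"})
# Same payload, new nonce: the other side still observes relation-changed.
assert bag["slurmdbd_info_id"] != first_id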