예제 #1
0
class SlurmdbdCharm(CharmBase):
    """Operator charm that installs, configures and restarts slurmdbd."""

    _stored = StoredState()

    def __init__(self, *args):
        """Seed the stored state, create the interfaces and bind the handlers."""
        super().__init__(*args)

        self._stored.set_default(
            db_info={},
            slurmdbd_config={},
            munge_key_available=False,
            slurm_installed=False,
        )

        self._db = MySQLClient(self, "db")
        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")

        # Several events funnel into the same "write config and restart"
        # handler, so name it once and list the (event, handler) pairs.
        write_config = self._write_config_and_restart_slurmdbd
        bindings = (
            (self.on.install, self._on_install),
            (self.on.config_changed, write_config),
            (self._db.on.database_available, write_config),
            (self._slurmdbd_peer.on.slurmdbd_peer_available, write_config),
            (self._slurmdbd.on.slurmdbd_available, write_config),
            (self._slurmdbd.on.slurmdbd_unavailable, self._on_slurm_configurator_unavailable),
            (self._slurmdbd.on.munge_key_available, self._on_munge_key_available),
        )
        for bound_event, callback in bindings:
            self.framework.observe(bound_event, callback)

    def _on_install(self, event):
        """Install slurm from the configured channel and record that it is installed."""
        channel = self.config["snapstore-channel"]
        self._slurm_manager.install(channel)
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm snap successfully installed")

    def _on_upgrade(self, event):
        """Delegate the upgrade to the slurm manager."""
        self._slurm_manager.upgrade()

    def _on_munge_key_available(self, event):
        """Write the munge key and restart munged once slurm is installed."""
        # The key cannot be configured before the install handler has run;
        # retry this event later in that case.
        if not self._stored.slurm_installed:
            event.defer()
            return

        self._slurm_manager.configure_munge_key(self._slurmdbd.get_munge_key())
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_slurm_configurator_unavailable(self, event):
        """Forget the munge key and re-evaluate the unit status."""
        self._stored.munge_key_available = False
        self._check_status()

    def _check_status(self) -> bool:
        """Report whether every prerequisite for running slurmdbd is present.

        Sets a blocked status when the database info or the munge key is
        missing.

        Returns:
            True when the peer info, database info, installation flag and
            munge key flag are all truthy, False otherwise.
        """
        database = self._stored.db_info
        has_munge_key = self._stored.munge_key_available
        installed = self._stored.slurm_installed
        peer_info = self._slurmdbd_peer.get_slurmdbd_info()

        if all((peer_info, database, installed, has_munge_key)):
            return True

        # Only these two missing prerequisites are surfaced in the status.
        if not database:
            self.unit.status = BlockedStatus(
                "Need relation to MySQL."
            )
        elif not has_munge_key:
            self.unit.status = BlockedStatus(
                "Need relation to slurm-configurator."
            )
        return False

    def _write_config_and_restart_slurmdbd(self, event):
        """Render the slurmdbd config and restart the daemon when it changed."""
        # Nothing can be written until _check_status() reports that all
        # prerequisites are present; otherwise retry this event later.
        if not self._check_status():
            event.defer()
            return

        database = self._stored.db_info
        peer_info = self._slurmdbd_peer.get_slurmdbd_info()
        previous = dict(self._stored.slurmdbd_config)

        # Later mappings win on key collisions: database info overrides
        # peer info, which overrides the charm config.
        desired = {**self.config, **peer_info, **database}

        if desired != previous:
            self._stored.slurmdbd_config = desired
            self._slurm_manager.render_slurm_configs(desired)
            self._slurm_manager.restart_slurm_component()

            # Application relation data may only be written by the leader,
            # so non-leader units skip this step.
            if self.model.unit.is_leader():
                self._slurmdbd.set_slurmdbd_info_on_app_relation_data(desired)

        self.unit.status = ActiveStatus("slurmdbd available")

    def get_port(self):
        """Return the port reported by the slurm manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname reported by the slurm manager."""
        return self._slurm_manager.hostname

    def set_db_info(self, db_info):
        """Persist db_info in the charm's stored state."""
        self._stored.db_info = db_info
예제 #2
0
class SlurmrestdCharm(CharmBase):
    """Operator charm responsible for lifecycle operations for slurmrestd."""

    _stored = StoredState()

    def __init__(self, *args):
        """Initialize charm and configure states and events to observe."""
        super().__init__(*args)
        # Every flag that a handler reads needs a default: reading a stored
        # attribute that was never set raises AttributeError, and
        # _check_status() reads munge_key_available before the munge key
        # handler has necessarily run.
        self._stored.set_default(
            munge_key_available=False,
            slurm_installed=False,
            slurmrestd_restarted=False,
        )
        self._slurm_manager = SlurmManager(self, "slurmrestd")
        self._slurmrestd = SlurmrestdRequires(self, 'slurmrestd')

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self._slurmrestd.on.config_available: self._on_check_status_and_write_config,
            self._slurmrestd.on.munge_key_available: self._on_configure_munge_key,
            self._slurmrestd.on.restart_slurmrestd: self._on_restart_slurmrestd,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm from the configured channel and record it as installed."""
        self._slurm_manager.install(self.config["snapstore-channel"])
        self.unit.status = ActiveStatus("slurm installed")
        self._stored.slurm_installed = True

    def _on_upgrade(self, event):
        """Upgrade slurm once a complete slurm config is available.

        The event is deferred while _check_status() reports missing
        prerequisites, so the slurm manager is never handed a None config.
        """
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        snapstore_channel = self.config["snapstore-channel"]
        self._slurm_manager.upgrade(slurm_config, snapstore_channel)

    def _on_restart_slurmrestd(self, event):
        """Restart the slurmrestd component."""
        self._slurm_manager.restart_slurm_component()

    def _on_configure_munge_key(self, event):
        """Configure the munge key.

        1) Get the munge key from the stored state of the slurmrestd relation
        2) Hand the munge key to the slurm manager to configure it
        3) Restart munged
        4) Set munge_key_available in charm stored state

        The event is deferred until slurm has been installed.
        """
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmrestd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _check_status(self):
        """Return the slurm config when everything needed is present.

        Sets a blocked status when the slurmrestd relation is not joined and
        a waiting status when the munge key or the slurm config is missing.

        Returns:
            A plain dict copy of the stored slurm config, or None when a
            prerequisite is missing.
        """
        slurm_config = self._slurmrestd.get_stored_slurm_config()
        munge_key_available = self._stored.munge_key_available

        slurm_configurator_joined = self._slurmrestd.is_joined

        # Check and see if we have what we need for operation.
        if not slurm_configurator_joined:
            self.unit.status = BlockedStatus(
                "Needed relations: slurm-configurator")
            return None
        if not (munge_key_available and slurm_config):
            self.unit.status = WaitingStatus("Waiting on: configuration")
            return None

        return dict(slurm_config)

    def _on_check_status_and_write_config(self, event):
        """Render the slurm config, deferring until prerequisites are met."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.render_slurm_configs(slurm_config)

        # Only restart slurmrestd the first time the node is brought up.
        if not self._stored.slurmrestd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmrestd_restarted = True

        self.unit.status = ActiveStatus("slurmrestd available")
예제 #3
0
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        # slurm_installed is read by _on_write_munge_key() and
        # _check_status(); give it a default so those reads cannot raise
        # AttributeError before _on_install() has stored it.
        self._stored.set_default(
            munge_key_available=False,
            slurm_installed=False,
            slurmd_restarted=False,
            user_node_state=str(),
            partition_name=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.start: self._on_check_status_and_write_config,
            self.on.config_changed: self._on_config_changed,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd_peer.on.slurmd_peer_departed:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurm_config_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.restart_slurmd:
            self._on_restart_slurmd,
            self._slurmd.on.munge_key_available: self._on_write_munge_key,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm; on the leader also settle the partition name."""
        self._slurm_manager.install(self.config["snapstore-channel"])

        if self.model.unit.is_leader():
            self._get_set_partition_name()
            logger.debug(f"PARTITION_NAME: {self._stored.partition_name}")
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm installed")

    def _on_upgrade(self, event):
        """Upgrade slurm, deferring until a complete slurm config exists."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.upgrade(
            slurm_config,
            self.config["snapstore-channel"]
        )

    def _on_config_changed(self, event):
        """On the leader, refresh the partition name and republish the partition."""
        if self.model.unit.is_leader():
            self._get_set_partition_name()
            if self._check_status():
                self._on_set_partition_info_on_app_relation_data(
                    event
                )

    def _on_write_munge_key(self, event):
        """Configure the munge key and restart munged once slurm is installed."""
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_check_status_and_write_config(self, event):
        """Render the slurm config, deferring until prerequisites are met."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        # if slurm_config['configless']:
        #    slurmctld_hostname = slurm_config['active_controller_hostname']
        #    self._slurm_manager.configure_slurmctld_hostname(
        #        slurmctld_hostname
        #    )
        #    self._slurm_manager.restart_slurm_component()
        # else:

        # _check_status() already returns a plain dict (not a StoredDict),
        # so the config can be rendered as-is.
        self._slurm_manager.render_slurm_configs(slurm_config)

        # Only restart slurmd the first time the node is brought up.
        if not self._stored.slurmd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmd_restarted = True

        self.unit.status = ActiveStatus("slurmd available")

    def _on_restart_slurmd(self, event):
        """Restart the slurmd component."""
        self._slurm_manager.restart_slurm_component()

    def _check_status(self):
        """Return the slurm config when everything needed is present.

        Sets a blocked status when the slurmd relation is not joined and a
        waiting status when the munge key, the slurm config or the slurm
        installation is missing.

        Returns:
            A plain dict copy of the stored slurm config, or None when a
            prerequisite is missing.
        """
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmd.get_stored_slurm_config()

        if not self._slurmd.is_joined:
            self.unit.status = BlockedStatus(
                "Needed relations: slurm-configurator"
            )
            return None

        if not (munge_key_available and slurm_config and slurm_installed):
            self.unit.status = WaitingStatus(
                "Waiting on: configuration"
            )
            return None

        return dict(slurm_config)

    def _on_set_node_state_action(self, event):
        """Store the requested node states and republish the partition info.

        Action events cannot be deferred (ops raises RuntimeError from
        ActionEvent.defer), so this handler publishes directly and fails the
        action with a message instead of deferring when publishing is not
        possible yet.
        """
        self._stored.user_node_state = event.params["node-state"]

        # Only the leader can set data on the relation; other units just
        # keep the stored node state.
        if not self.framework.model.unit.is_leader():
            return
        if not self._publish_partition_info():
            event.fail(
                "Node state stored, but the partition info could not be "
                "published: the slurm-configurator relation is not available."
            )

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if not self.framework.model.unit.is_leader():
            return
        # This handler shouldn't fire if the relation isn't made, but if the
        # info cannot be published yet, retry the event later.
        if not self._publish_partition_info():
            event.defer()

    def _publish_partition_info(self) -> bool:
        """Write the partition info to the relation; return whether it was written."""
        if not self._slurmd.is_joined:
            return False
        partition = self._assemble_partition()
        if not partition:
            return False
        self._slurmd.set_partition_info_on_app_relation_data(partition)
        return True

    def _assemble_partition(self):
        """Assemble the partition info.

        Returns:
            A dict with the keys "inventory", "partition_name",
            "partition_state" and "partition_config".
        """
        partition_name = self._stored.partition_name
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")

        slurmd_inventory = self._assemble_slurmd_inventory()

        return {
            "inventory": slurmd_inventory,
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    def _assemble_slurmd_inventory(self):
        """Apply mutations to nodes in the partition, return slurmd nodes.

        Returns:
            The inventory from the slurmd-peer relation with user-requested
            node states applied, or None when the peer relation has no
            inventory.
        """
        slurmd_inventory = self._slurmd_peer.get_slurmd_inventory()
        if not slurmd_inventory:
            return None

        # If the user has set custom state for nodes
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if user_node_state:
            # The stored value is a comma separated list of hostname=state
            # items. Items without "=" are skipped instead of raising
            # IndexError and failing the hook.
            node_states = {}
            for item in user_node_state.split(","):
                fields = item.split("=")
                if len(fields) < 2:
                    logger.warning(f"Ignoring malformed node state: {item!r}")
                    continue
                node_states[fields[0]] = fields[1]

            # Copy the slurmd_inventory returned from the slurmd-peer
            # relation to a temporary variable that we will use to
            # iterate over while we conditionally make modifications to the
            # original inventory.
            slurmd_inventory_tmp = copy.deepcopy(slurmd_inventory)

            # Iterate over the slurmd nodes in the partition and check
            # for nodes that need their state modified.
            for partition in slurmd_inventory_tmp:
                partition_tmp = copy.deepcopy(partition)
                for slurmd_node in partition["inventory"]:
                    if slurmd_node["hostname"] in node_states:
                        slurmd_node_tmp = copy.deepcopy(slurmd_node)
                        slurmd_node_tmp["state"] = \
                            node_states[slurmd_node["hostname"]]
                        partition_tmp["inventory"].remove(slurmd_node)
                        partition_tmp["inventory"].append(slurmd_node_tmp)
                slurmd_inventory.remove(partition)
                slurmd_inventory.append(partition_tmp)

        return slurmd_inventory

    def _get_set_partition_name(self):
        """Set the partition name."""
        # Determine if a partition-name config exists, if so
        # ensure the self._stored.partition_name is consistent with the
        # supplied config.
        # If no partition name has been specified then generate one.
        partition_name = self.config.get("partition-name")
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"

    def get_partition_name(self):
        """Return the partition_name."""
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
예제 #4
0
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        # slurm_installed is read by _on_write_munge_key() and
        # _check_status(); give it a default so those reads cannot raise
        # AttributeError before _on_install() has stored it.
        self._stored.set_default(
            munge_key_available=False,
            slurm_installed=False,
            slurmctld_controller_type=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmctld")

        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self._slurmctld.on.slurm_config_available: self._on_check_status_and_write_config,
            self._slurmctld.on.scontrol_reconfigure: self._on_scontrol_reconfigure,
            self._slurmctld.on.restart_slurmctld: self._on_restart_slurmctld,
            self._slurmctld.on.munge_key_available: self._on_write_munge_key,
            self._slurmctld_peer.on.slurmctld_peer_available: self._on_slurmctld_peer_available,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install slurm from the configured channel and record it as installed."""
        self._slurm_manager.install(self.config["snapstore-channel"])
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm snap successfully installed")

    def _on_upgrade(self, event):
        """Upgrade slurm once a complete slurm config is available.

        NOTE(review): this handler is not bound to any event in __init__;
        confirm whether upgrade_charm should be observed.
        """
        slurm_config = self._check_status()
        if not slurm_config:
            # _check_status() returns None while prerequisites are missing;
            # dict(None) would raise TypeError, so retry later instead.
            event.defer()
            return

        snapstore_channel = self.config["snapstore-channel"]
        self._slurm_manager.upgrade(dict(slurm_config), snapstore_channel)

    def _on_write_munge_key(self, event):
        """Configure the munge key and restart munged once slurm is installed."""
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmctld.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_slurmctld_peer_available(self, event):
        """On the leader, publish the slurmctld info on the app relation data.

        The event is deferred when the slurmctld relation is not joined or
        the peer relation has no slurmctld info yet. Non-leader units do
        nothing.
        """
        if not self.framework.model.unit.is_leader():
            return

        if self._slurmctld.is_joined:
            slurmctld_info = self._slurmctld_peer.get_slurmctld_info()
            if slurmctld_info:
                self._slurmctld.set_slurmctld_info_on_app_relation_data(
                    slurmctld_info
                )
                return
        event.defer()

    def _on_check_status_and_write_config(self, event):
        """Render the slurm config, deferring until prerequisites are met."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.render_slurm_configs(dict(slurm_config))
        self.unit.status = ActiveStatus("slurmctld available")

    def _on_restart_slurmctld(self, event):
        """Restart the slurmctld component."""
        self._slurm_manager.restart_slurm_component()

    def _on_scontrol_reconfigure(self, event):
        """Run "scontrol reconfigure" through the slurm manager."""
        self._slurm_manager.slurm_cmd("scontrol", "reconfigure")

    def _check_status(self):
        """Return the stored slurm config when everything needed is present.

        Sets a blocked status when the slurmctld relation is not joined and
        a waiting status when the munge key, the slurm installation or the
        slurm config is missing.

        Returns:
            The stored slurm config as returned by the slurmctld interface,
            or None when a prerequisite is missing.
        """
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmctld.get_stored_slurm_config()

        if not self._slurmctld.is_joined:
            self.unit.status = BlockedStatus(
                "Relations needed: slurm-configurator"
            )
            return None

        if not (munge_key_available and slurm_installed and slurm_config):
            self.unit.status = WaitingStatus(
                "Waiting on: configuration"
            )
            return None

        return slurm_config

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
예제 #5
0
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()
    on = SlurmdCharmEvents()

    def __init__(self, *args):
        """Seed the stored state, create the interfaces and bind every handler."""
        super().__init__(*args)

        self._stored.set_default(
            nhc_conf="",
            slurm_installed=False,
            slurmctld_available=False,
            slurmctld_started=False,
            cluster_name="",
        )

        self._slurm_manager = SlurmManager(self, "slurmd")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        # interface to slurmctld, should only have one slurmctld per slurmd app
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        # (event, handler) pairs, registered in the order listed.
        bindings = (
            # charm lifecycle and custom charm events
            (self.on.install, self._on_install),
            (self.on.upgrade_charm, self._on_upgrade),
            (self.on.update_status, self._on_update_status),
            (self.on.config_changed, self._on_config_changed),
            (self.on.slurmctld_started, self._on_slurmctld_started),
            (self.on.slurmd_start, self._on_slurmd_start),
            (self.on.check_etcd, self._on_check_etcd),
            # slurmd relation
            (self._slurmd.on.slurmctld_available, self._on_slurmctld_available),
            (self._slurmd.on.slurmctld_unavailable, self._on_slurmctld_unavailable),
            # fluentbit
            (self.on["fluentbit"].relation_created, self._on_configure_fluentbit),
            # actions
            (self.on.version_action, self._on_version_action),
            (self.on.node_configured_action, self._on_node_configured_action),
            (self.on.get_node_inventory_action, self._on_get_node_inventory_action),
            (self.on.show_nhc_config_action, self._on_show_nhc_config),
            # infiniband actions
            (self.on.get_infiniband_repo_action, self.get_infiniband_repo),
            (self.on.set_infiniband_repo_action, self.set_infiniband_repo),
            (self.on.install_infiniband_action, self.install_infiniband),
            (self.on.uninstall_infiniband_action, self.uninstall_infiniband),
            (self.on.start_infiniband_action, self.start_infiniband),
            (self.on.enable_infiniband_action, self.enable_infiniband),
            (self.on.stop_infiniband_action, self.stop_infiniband),
            (self.on.is_active_infiniband_action, self.is_active_infiniband),
            # nvidia actions
            (self.on.nvidia_repo_action, self.nvidia_repo),
            (self.on.nvidia_package_action, self.nvidia_package),
            (self.on.nvidia_install_action, self.nvidia_install),
        )
        for bound_event, callback in bindings:
            self.framework.observe(bound_event, callback)

    def _on_install(self, event):
        """Install slurmd, record the outcome and refresh the unit status."""
        workload_version = Path("version").read_text().strip()
        self.unit.set_workload_version(workload_version)
        self.unit.status = WaitingStatus("Installing slurmd")

        installed = self._slurm_manager.install(
            self.config.get("custom-slurm-repo"))
        logger.debug(f"### slurmd installed: {installed}")

        if not installed:
            # Report the failure and retry the install event later.
            self.unit.status = BlockedStatus("Error installing slurmd")
            event.defer()
        else:
            self._stored.slurm_installed = True

        self._check_status()

    def _on_configure_fluentbit(self, event):
        """Configure fluentbit in response to the observed event."""
        # The event carries nothing that is needed; delegate to the helper.
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        """Pass the slurm manager's NHC and slurm fluentbit config to fluentbit."""
        logger.debug("## Configuring fluentbit")
        # NHC entries come first, followed by the slurm entries.
        settings = [
            *self._slurm_manager.fluentbit_config_nhc,
            *self._slurm_manager.fluentbit_config_slurm,
        ]
        self._fluentbit.configure(settings)

    def _on_upgrade(self, event):
        """Refresh the workload version from the "version" file."""
        version_file = Path("version")
        self.unit.set_workload_version(version_file.read_text().strip())

    def _on_update_status(self, event):
        """Re-evaluate the unit status on update-status."""
        # The boolean result is not needed here; only the status side effect.
        self._check_status()

    def _check_status(self) -> bool:
        """Set the unit status from the first unmet requirement, if any.

        The requirements are evaluated in this order, and evaluation stops
        at the first one that fails:

        - the machine does not need a reboot
        - a partition name is available
        - slurm is installed
        - the slurmd relation is joined
        - slurmctld is available
        - the munge check passes
        - slurmctld has started

        Returns:
            True when every requirement holds (the status is then set to
            active), False otherwise.
        """
        # Each entry is (lazy check, status class, message). The checks are
        # callables so that later ones only run when earlier ones passed.
        requirements = (
            (lambda: not self._slurm_manager.needs_reboot,
             BlockedStatus, "Machine needs reboot"),
            (self.get_partition_name,
             WaitingStatus, "Waiting on charm configuration"),
            (lambda: self._stored.slurm_installed,
             BlockedStatus, "Error installing slurmd"),
            (lambda: self._slurmd.is_joined,
             BlockedStatus, "Need relations: slurmctld"),
            (lambda: self._stored.slurmctld_available,
             WaitingStatus, "Waiting on: slurmctld"),
            (lambda: self._slurm_manager.check_munged(),
             BlockedStatus, "Error configuring munge key"),
            (lambda: self._stored.slurmctld_started,
             WaitingStatus, "Waiting slurmctld to start"),
        )
        for is_met, status_type, message in requirements:
            if not is_met():
                self.unit.status = status_type(message)
                return False

        self.unit.status = ActiveStatus("slurmd available")
        return True

    def ensure_slurmd_starts(self, max_attemps=10) -> bool:
        """Stop slurmd, then restart it until it is active or attempts run out.

        Args:
            max_attemps: maximum number of restart attempts.

        Returns:
            True when slurmd is active after the attempts; False otherwise,
            in which case the unit status is set to blocked.
        """
        logger.debug("## Stoping slurmd")
        self._slurm_manager.slurm_systemctl('stop')

        for attempt in range(max_attemps):
            if self._slurm_manager.slurm_is_active():
                logger.debug("## Slurmd running")
                break

            logger.warning("## Slurmd not running, trying to start it")
            self.unit.status = WaitingStatus("Starting slurmd")
            self._slurm_manager.restart_slurm_component()
            # Wait a little longer after every failed attempt.
            sleep(2 + attempt)

        # Check once more: the loop may have ended right after a restart.
        if not self._slurm_manager.slurm_is_active():
            self.unit.status = BlockedStatus("Cannot start slurmd")
            return False
        return True

    def _set_slurmctld_available(self, flag: bool):
        """Record in the stored state whether slurmctld is available."""
        stored = self._stored
        stored.slurmctld_available = flag

    def _set_slurmctld_started(self, flag: bool):
        """Record in the stored state whether slurmctld has started."""
        stored = self._stored
        stored.slurmctld_started = flag

    def _on_slurmctld_available(self, event):
        """Point this node at slurmctld, configure munge and publish partition info.

        The event is deferred until slurm has been installed.
        """
        if not self._stored.slurm_installed:
            event.defer()
            return

        logger.debug(
            '#### Slurmctld available - setting overrides for configless')
        # The slurmctld host and port come from the relation and are used
        # to override the systemd services.
        slurmctld_host = self._slurmd.slurmctld_hostname
        slurmctld_port = self._slurmd.slurmctld_port
        self._slurm_manager.create_configless_systemd_override(
            slurmctld_host, slurmctld_port)
        self._slurm_manager.daemon_reload()

        self._write_munge_key_and_restart_munge()

        self._set_slurmctld_available(True)
        self._on_set_partition_info_on_app_relation_data(event)
        self._check_status()

        # Trigger the etcd check for hostnames.
        self.on.check_etcd.emit()

    def _on_check_etcd(self, event):
        """Emit slurmctld_started once this node shows up in etcd.

        Reads the "all_nodes" key from etcd and looks for this node's
        hostname in the decoded list. When it is present, the
        slurmctld_started event is emitted; when the query fails or the
        hostname is missing, this event is deferred.
        """
        host = self._slurmd.slurmctld_address
        port = self._slurmd.etcd_port
        logger.debug(f"## Connecting to etcd3 in {host}:{port}")
        client = Etcd3Client(host=host, port=port, api_path="/v3/")

        logger.debug("## Querying etcd3 for node list")
        try:
            response = client.get(key="all_nodes")
            logger.debug(f"## Got: {response}")
        except Exception as e:
            logger.error(
                f"## Unable to connect to {host} to get list of nodes: {e}")
            event.defer()
            return

        if response:
            hostnames = json.loads(response[0])
            logger.debug(f"### etcd3 node list: {hostnames}")
            if self.hostname in hostnames:
                self.on.slurmctld_started.emit()
                return

        # Either etcd returned nothing or this node is not listed yet.
        logger.debug("## Node not accounted for. Deferring.")
        event.defer()

    def _on_slurmctld_unavailable(self, event):
        """Clear the slurmctld flags, stop slurmd and refresh the unit status."""
        logger.debug("## Slurmctld unavailable")

        # Without slurmctld, both stored flags go back to False.
        self._set_slurmctld_available(False)
        self._set_slurmctld_started(False)

        self._slurm_manager.slurm_systemctl('stop')
        self._check_status()

    def _on_slurmctld_started(self, event):
        """Record that slurmctld has started, then emit slurmd_start."""
        self._set_slurmctld_started(True)
        start_event = self.on.slurmd_start
        start_event.emit()

    def _on_slurmd_start(self, event):
        """Start slurmd once all requirements are met, deferring otherwise."""
        if not self._check_status():
            event.defer()
            return

        # Fluentbit is only configured when a relation to it exists.
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

        # By now slurm is installed, munge is configured and slurmctld has
        # accounted for this node, so slurmd should be safe to start.
        started = self.ensure_slurmd_starts()
        if not started:
            event.defer()
        else:
            logger.debug("## slurmctld started and slurmd is running")
        self._check_status()

    def _on_config_changed(self, event):
        """Republish partition info on the leader and render a changed NHC config."""
        if self.model.unit.is_leader():
            logger.debug("## slurmd config changed - leader")
            self._on_set_partition_info_on_app_relation_data(event)

        requested = self.model.config.get('nhc-conf')
        # Nothing to render when the option is empty or unchanged.
        if not requested or requested == self._stored.nhc_conf:
            return

        self._stored.nhc_conf = requested
        self._slurm_manager.render_nhc_config(requested)

    def get_partition_name(self) -> str:
        """Return the partition name, reconciling it with the charm config.

        - With no name on the slurmd-peer relation, "osd-<app name>" is
          generated, stored and returned.
        - With a "partition-name" config value (spaces replaced by dashes)
          that differs from the current name, the config value is stored and
          returned.
        - Otherwise the current name is returned unchanged.
        """
        current = self._slurmd_peer.partition_name
        configured = self.config.get("partition-name")

        if not current:
            generated = f"osd-{self.app.name}"
            logger.debug(f"Partition name: {generated}")
            self._set_partition_name(generated)
            return generated

        if configured:
            configured = configured.replace(' ', '-')
            if current != configured:
                self._set_partition_name(configured)
                return configured

        logger.debug("Partition name unchanged.")
        return current

    def _set_partition_name(self, name: str):
        """Set the partition_name in the slurmd relation."""
        if self.model.unit.is_leader():
            self._slurmd_peer.partition_name = name

    def _write_munge_key_and_restart_munge(self):
        logger.debug('#### slurmd charm - writting munge key')

        self._slurm_manager.configure_munge_key(
            self._slurmd.get_stored_munge_key())

        if self._slurm_manager.restart_munged():
            logger.debug("## Munge restarted succesfully")
        else:
            logger.error("## Unable to restart munge")

    def _on_version_action(self, event):
        """Return version of installed components.

        - Slurm
        - munge
        - NHC
        - infiniband
        """
        version = {}
        version['slurm'] = self._slurm_manager.slurm_version()
        version['munge'] = self._slurm_manager.munge_version()
        version['nhc'] = self._slurm_manager.nhc_version()
        version['infiniband'] = self._slurm_manager.infiniband_version()

        event.set_results(version)

    def _on_node_configured_action(self, event):
        """Remove node from DownNodes."""
        # trigger reconfig
        self._slurmd.configure_new_node()
        logger.debug('### This node is not new anymore')

    def _on_get_node_inventory_action(self, event):
        """Return node inventory."""
        inventory = self._slurmd.node_inventory
        event.set_results({'inventory': inventory})

    def get_infiniband_repo(self, event):
        """Report the infiniband repository currently in use."""
        infiniband = self._slurm_manager.infiniband
        event.set_results({'infiniband-repo': infiniband.repository})

    def set_infiniband_repo(self, event):
        """Set the infiniband repository from the ``repo`` action parameter.

        ``repo`` is expected to be base64-encoded text. When it cannot be
        decoded the action is failed through ``event.fail`` and the configured
        repository is left untouched, instead of letting the decoding error
        escape the handler.
        """
        repo = event.params["repo"]
        logger.debug(f"#### setting custom infiniband repo: {repo}")
        try:
            decoded_repo = base64.b64decode(repo).decode()
        except ValueError as e:
            # binascii.Error (malformed base64) and UnicodeDecodeError
            # (payload is not valid text) are both ValueError subclasses.
            event.fail(f"Invalid base64-encoded infiniband repo: {e}")
            return
        self._slurm_manager.infiniband.repository = decoded_repo

    def install_infiniband(self, event):
        """Install infiniband, report the outcome and request a reboot.

        After the installation the unit is put into a blocked status telling
        the operator that a reboot is needed.
        """
        logger.debug("#### Installing Infiniband")
        infiniband = self._slurm_manager.infiniband
        infiniband.install()
        outcome = {'installation': 'Successfull. Please reboot node.'}
        event.set_results(outcome)
        self.unit.status = BlockedStatus("Need reboot for Infiniband")

    def uninstall_infiniband(self, event):
        """Uninstall infiniband."""
        logger.debug("#### Uninstalling Infiniband")
        infiniband = self._slurm_manager.infiniband
        infiniband.uninstall()

    def start_infiniband(self, event):
        """Start the Infiniband systemd service."""
        logger.debug("#### Starting Infiniband service")
        infiniband = self._slurm_manager.infiniband
        infiniband.start()

    def enable_infiniband(self, event):
        """Enable the Infiniband systemd service."""
        logger.debug("#### Enabling Infiniband service")
        infiniband = self._slurm_manager.infiniband
        infiniband.enable()

    def stop_infiniband(self, event):
        """Stop the Infiniband systemd service."""
        logger.debug("#### Stoping Infiniband service")
        infiniband = self._slurm_manager.infiniband
        infiniband.stop()

    def is_active_infiniband(self, event):
        """Report whether the Infiniband systemd service is active."""
        infiniband = self._slurm_manager.infiniband
        is_active = infiniband.is_active()
        logger.debug(f"#### Infiniband service is-active: {is_active}")
        event.set_results({'infiniband-is-active': is_active})

    def nvidia_repo(self, event):
        """Set or get the nvidia repository in use.

        When a non-empty ``repo`` parameter is supplied it is base64-decoded
        and stored first; the action result then carries the repository in
        use. When the parameter cannot be decoded the action is failed through
        ``event.fail``, nothing is stored and no result is set, instead of
        letting the decoding error escape the handler.
        """
        repo = event.params.get("repo")
        if repo:
            try:
                decoded_repo = base64.b64decode(repo).decode()
            except ValueError as e:
                # binascii.Error (malformed base64) and UnicodeDecodeError
                # (payload is not valid text) are both ValueError subclasses.
                event.fail(f"Invalid base64-encoded nvidia repo: {e}")
                return
            self._slurm_manager.nvidia.repository = decoded_repo

        event.set_results(
            {'nvidia-repo': self._slurm_manager.nvidia.repository})

    def nvidia_package(self, event):
        """Set or get the nvidia package in use.

        A supplied ``package`` parameter is stored first, and an explicit
        empty string counts as supplied. The action result always carries
        the package name then in use.
        """
        requested = event.params.get("package")
        # "" is a deliberate value, only a missing parameter is skipped
        was_supplied = bool(requested) or requested == ""
        if was_supplied:
            self._slurm_manager.nvidia.package = requested

        current_package = self._slurm_manager.nvidia.package
        event.set_results({'nvidia-package': current_package})

    def nvidia_install(self, event):
        """Install the nvidia drivers, report the outcome and request a reboot.

        After the installation the unit is put into a blocked status telling
        the operator that a reboot is needed.
        """
        nvidia = self._slurm_manager.nvidia
        logger.debug("#### Installing nvidia drivers: %s", nvidia.package)
        self._slurm_manager.nvidia.install()
        outcome = {'installation': 'Successfull. Please reboot node.'}
        event.set_results(outcome)
        self.unit.status = BlockedStatus("Need reboot for nvidia")

    def _on_show_nhc_config(self, event):
        """Show current nhc.conf."""
        nhc_conf = self._slurm_manager.get_nhc_config()
        event.set_results({"nhc.conf": nhc_conf})

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if self.model.unit.is_leader():
            # If the relation with slurmctld exists then set our
            # partition info on the application relation data.
            # This handler shouldn't fire if the relation isn't made,
            # but add this extra check here just incase.
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_partition_info_on_app_relation_data(
                        partition)
                else:
                    event.defer()
            else:
                event.defer()

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self.get_partition_name()
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")
        logger.debug(f"## partition_name: {partition_name}")

        return {
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    @property
    def hostname(self) -> str:
        """Return the hostname reported by the slurm manager."""
        manager = self._slurm_manager
        return manager.hostname

    @property
    def cluster_name(self) -> str:
        """Return the cluster name kept in the stored state."""
        stored = self._stored
        return stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Persist ``name`` as the cluster name in the stored state."""
        stored = self._stored
        stored.cluster_name = name
예제 #6
0
class SlurmdbdCharm(CharmBase):
    """Operator charm handling lifecycle operations for slurmdbd."""

    _stored = StoredState()
    on = SlurmdbdCharmEvents()

    def __init__(self, *args):
        """Set stored-state defaults, build interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(db_info=dict(),
                                 jwt_available=False,
                                 munge_available=False,
                                 slurm_installed=False,
                                 cluster_name=str())

        self._db = MySQLClient(self, "db")
        # self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        event_handler_bindings = {
            self.on.install:
            self._on_install,
            self.on.upgrade_charm:
            self._on_upgrade,
            self.on.update_status:
            self._on_update_status,
            self.on.config_changed:
            self._write_config_and_restart_slurmdbd,
            self.on.jwt_available:
            self._on_jwt_available,
            self.on.munge_available:
            self._on_munge_available,
            self.on.write_config:
            self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._db.on.database_unavailable:
            self._on_db_unavailable,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmctld_available:
            self._on_slurmctld_available,
            self._slurmdbd.on.slurmctld_unavailable:
            self._on_slurmctld_unavailable,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_fluentbit_relation_created,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmdbd.

        On a failed installation the unit is blocked and the event deferred
        so the installation is attempted again later.
        """
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmdbd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if not successful_installation:
            self.unit.status = BlockedStatus("Error installing slurmdbd")
            event.defer()
            return

        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurmdbd successfully installed")

        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        """Send the NHC and slurm fluentbit configuration to fluentbit."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_jwt_available(self, event):
        """Retrieve and configure the jwt_rsa key."""
        # jwt rsa lives in slurm spool dir, it is created when slurm is
        # installed
        if not self._stored.slurm_installed:
            event.defer()
            return

        jwt_rsa = self._slurmdbd.get_jwt_rsa()
        self._slurm_manager.configure_jwt_rsa(jwt_rsa)
        self._stored.jwt_available = True

    def _on_munge_available(self, event):
        """Retrieve munge key and start munged.

        When munged cannot be restarted the unit is blocked and the event is
        deferred.
        """
        # munge is installed together with slurm
        if not self._stored.slurm_installed:
            event.defer()
            return

        munge_key = self._slurmdbd.get_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)

        if self._slurm_manager.restart_munged():
            logger.debug("## Munge restarted succesfully")
            self._stored.munge_available = True
        else:
            logger.error("## Unable to restart munge")
            self.unit.status = BlockedStatus("Error restarting munge")
            event.defer()

    def _on_db_unavailable(self, event):
        """Forget the stored database info and refresh the unit status."""
        self._stored.db_info = dict()
        # TODO tell slurmctld that slurmdbd left?
        self._check_status()

    def _on_slurmctld_available(self, event):
        """Emit the jwt, munge and write-config events for a new slurmctld."""
        self.on.jwt_available.emit()
        self.on.munge_available.emit()

        self.on.write_config.emit()
        # only set up fluentbit if we have a relation to it
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

    def _on_slurmctld_unavailable(self, event):
        """Reset state and charm status when slurmctld broken."""
        self._stored.jwt_available = False
        self._stored.munge_available = False
        self._check_status()

    def _is_leader(self):
        """Return whether this unit is the application leader."""
        return self.model.unit.is_leader()

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd.

        The event is deferred when the prerequisites are not met yet, and
        also when slurmdbd could not be brought up after the configuration
        was rendered. In the latter case the "Cannot start slurmdbd" blocked
        status stays in place and no relation data is published.
        """
        # Ensure all pre-conditions are met with _check_status(), if not
        # defer the event.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        # settings from the config.yaml
        config = {"slurmdbd_debug": self.config.get("slurmdbd-debug")}

        slurmdbd_config = {
            **config,
            **slurmdbd_info,
            **db_info,
        }

        self._slurm_manager.slurm_systemctl("stop")
        self._slurm_manager.render_slurm_configs(slurmdbd_config)

        # At this point, we must guarantee that slurmdbd is correctly
        # initialized. Its startup might take a while, so we have to wait
        # for it.
        if not self._check_slurmdbd():
            # Do not advertise a daemon that is not running and do not let
            # the final status check hide the blocked status; retry later.
            event.defer()
            return

        # Only the leader can set relation data on the application.
        # Enforce that no one other than the leader tries to set
        # application relation data.
        if self._is_leader():
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                slurmdbd_config, )

        self._check_status()

    def _check_slurmdbd(self, max_attemps=3) -> bool:
        """Ensure slurmdbd is up and running.

        Args:
            max_attemps: number of times slurmdbd is checked and, when found
                inactive, restarted.

        Returns:
            True when slurmdbd is active after the attempts, False otherwise
            (the unit status is then set to blocked).
        """
        logger.debug("## Checking if slurmdbd is active")

        for attempt in range(max_attemps):
            if self._slurm_manager.slurm_is_active():
                logger.debug("## Slurmdbd running")
                break

            logger.warning("## Slurmdbd not running, trying to start it")
            self.unit.status = WaitingStatus("Starting slurmdbd")
            self._slurm_manager.restart_slurm_component()
            # wait a little longer after every failed attempt
            sleep(1 + attempt)

        if self._slurm_manager.slurm_is_active():
            self._check_status()
            return True

        self.unit.status = BlockedStatus("Cannot start slurmdbd")
        return False

    def _check_status(self) -> bool:
        """Check that we have the things we need.

        Sets the unit status to describe the first unmet requirement.

        Returns:
            True when every requirement is met (the unit is then set to
            active), False otherwise.
        """
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurm")
            return False

        # we must be sure to initialize the charms correctly. Slurmdbd must
        # first connect to the db to be able to connect to slurmctld correctly
        slurmctld_available = (self._stored.jwt_available
                               and self._stored.munge_available)
        statuses = {
            "MySQL": {
                "available": self._stored.db_info != dict(),
                "joined": self._db.is_joined,
            },
            "slurmctld": {
                "available": slurmctld_available,
                "joined": self._slurmdbd.is_joined,
            },
        }

        relations_needed = [component
                            for component, state in statuses.items()
                            if not state["joined"]]
        waiting_on = [component
                      for component, state in statuses.items()
                      if not state["available"]]

        if relations_needed:
            msg = f"Need relations: {','.join(relations_needed)}"
            self.unit.status = BlockedStatus(msg)
            return False

        if waiting_on:
            msg = f"Waiting on: {','.join(waiting_on)}"
            self.unit.status = WaitingStatus(msg)
            return False

        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()
        if not slurmdbd_info:
            self.unit.status = WaitingStatus("slurmdbd starting")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = WaitingStatus("munged starting")
            return False

        self.unit.status = ActiveStatus("slurmdbd available")
        return True

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name
예제 #7
0
class SlurmrestdCharm(CharmBase):
    """Operator charm responsible for lifecycle operations for slurmrestd."""

    _stored = StoredState()

    def __init__(self, *args):
        """Initialize charm and configure states and events to observe."""
        super().__init__(*args)

        self._stored.set_default(slurm_installed=False,
                                 slurmrestd_restarted=False,
                                 cluster_name=str())

        self._slurm_manager = SlurmManager(self, "slurmrestd")
        self._slurmrestd = SlurmrestdRequires(self, 'slurmrestd')
        self._fluentbit = FluentbitClient(self, "fluentbit")

        event_handler_bindings = {
            self.on.install:
            self._on_install,
            self.on.upgrade_charm:
            self._on_upgrade,
            self.on.update_status:
            self._on_update_status,
            self._slurmrestd.on.config_available:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.config_unavailable:
            self._on_config_unavailable,
            self._slurmrestd.on.munge_key_available:
            self._on_configure_munge_key,
            self._slurmrestd.on.jwt_rsa_available:
            self._on_configure_jwt_rsa,
            self._slurmrestd.on.restart_slurmrestd:
            self._on_restart_slurmrestd,
            # fluentbit
            self.on["fluentbit"].relation_created:
            self._on_fluentbit_relation_created,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmrestd.

        On success munged is started; on failure the unit is blocked and the
        event deferred. The status is refreshed in both cases.
        """
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmrestd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)

        if successful_installation:
            self.unit.status = ActiveStatus("slurmrestd installed")
            self._stored.slurm_installed = True

            self._slurm_manager.start_munged()
        else:
            self.unit.status = BlockedStatus("Error installing slurmrestd")
            event.defer()

        self._check_status()

    def _on_fluentbit_relation_created(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        """Send the NHC and slurm fluentbit configuration to fluentbit."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())
        self._check_status()

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _on_config_unavailable(self, event):
        """Handle the config unavailable due to relation broken."""
        # when the config becomes unavailable, we have to set this flag to
        # False, so the next time the config becomes available, the daemon
        # restarts
        self._stored.slurmrestd_restarted = False
        self._check_status()

    def _on_restart_slurmrestd(self, event):
        """Restart the slurmrestd component.

        The event is deferred while the status check does not pass.
        """
        logger.debug("## _on_restart_slurmrestd")

        if not self._check_status():
            event.defer()
            return

        self._slurm_manager.restart_slurm_component()
        self._stored.slurmrestd_restarted = True

    def _on_configure_munge_key(self, event):
        """Configure the munge key.

        1) Get the munge key from the stored state of the slurmrestd relation
        2) Write the munge key to the munge key path and chmod
        3) Restart munged
        """
        if not self._stored.slurm_installed:
            event.defer()
            return

        logger.debug("## configuring new munge key")
        munge_key = self._slurmrestd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()

    def _on_configure_jwt_rsa(self, event):
        """Configure the jwt rsa key stored by the slurmrestd relation."""
        if not self._stored.slurm_installed:
            event.defer()
            return

        logger.debug("## configuring new jwt rsa")
        jwt_rsa = self._slurmrestd.get_stored_jwt_rsa()
        self._slurm_manager.configure_jwt_rsa(jwt_rsa)

    def _check_status(self) -> bool:
        """Check that we have the things we need.

        Sets the unit status to describe the first unmet requirement.

        Returns:
            True only when slurm is installed, the slurmctld relation is
            joined and the munge key, jwt rsa key and slurm config have all
            been received; False otherwise.
        """
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmrestd")
            return False

        # Check and see if we have what we need for operation.
        if not self._slurmrestd.is_joined:
            self.unit.status = BlockedStatus("Need relations: slurmctld")
            return False

        slurmctld_available = (self._slurmrestd.get_stored_munge_key()
                               and self._slurmrestd.get_stored_jwt_rsa()
                               and self._slurmrestd.get_stored_slurm_config())
        if not slurmctld_available:
            self.unit.status = WaitingStatus("Waiting on: slurmctld")
            # Still waiting is not "ready": callers must defer instead of
            # writing config or restarting with incomplete data.
            return False

        self.unit.status = ActiveStatus("slurmrestd available")

        return True

    def _on_check_status_and_write_config(self, event):
        """Render the slurm config and restart slurmrestd on first use.

        The event is deferred while the status check does not pass.
        """
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._slurmrestd.get_stored_slurm_config()
        if slurm_config:
            self._slurm_manager.render_slurm_configs(slurm_config)
            self.cluster_name = slurm_config.get("cluster_name")
        else:
            logger.error(f"## weird slurmconfig: {slurm_config}")

        # Only restart slurmrestd the first time the node is brought up.
        if not self._stored.slurmrestd_restarted:
            self._on_restart_slurmrestd(event)

        # only set up fluentbit if we have a relation to it
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name