예제 #1
0
class SlurmLoginCharm(CharmBase):
    """Operator charm responsible for lifecycle operations for slurmctld."""

    _stored = StoredState()

    def __init__(self, *args):
        """Initialize charm and configure states and events to observe."""
        super().__init__(*args)
        self._stored.set_default(
            slurm_installed=False,
            config_available=False,
        )
        self.slurm_manager = SlurmManager(self, "slurmrestd")
        self._slurmrestd = SlurmrestdRequires(self, 'slurmrestd')

        event_handler_bindings = {
            self.on.install:
            self._on_install,
            self.on.upgrade_charm:
            self._on_upgrade,
            self._slurmrestd.on.config_available:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.config_unavailable:
            self._on_check_status_and_write_config,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self.slurm_manager.install()
        self.unit.status = ActiveStatus("slurm installed")
        self._stored.slurm_installed = True

    def _on_upgrade(self, event):
        """Upgrade charm event handler."""
        self.slurm_manager.upgrade()

    def _on_check_status_and_write_config(self, event):
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._stored.config_available
        logger.debug("##### inside check status and write config ######")
        if not (slurm_installed and slurm_config):
            if not slurm_config:
                self.unit.status = BlockedStatus(
                    "NEED RELATION TO SLURM-CONFIGURATOR")
            else:
                self.unit.status = BlockedStatus("SLURM NOT INSTALLED")
            event.defer()
            return
        else:
            logger.debug("##### STATUS CONFIRMED ######")
            config = dict(self._slurmrestd.get_slurm_config())
            logger.debug(config)
            self.slurm_manager.render_config_and_restart(config)
            self.unit.status = ActiveStatus("Slurmrestd Available")

    def set_config_available(self, boolean):
        """Set self._stored.slurmctld_available."""
        self._stored.config_available = boolean
예제 #2
0
class SlurmConfiguratorCharm(CharmBase):
    """Facilitate slurm configuration operations."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init charm, _stored defaults, interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(
            default_partition=str(),
            munge_key=str(),
            slurm_installed=False,
            slurmctld_available=False,
            slurmdbd_available=False,
            slurmd_available=False,
            slurmrestd_available=False,
        )

        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._nhc = Nhc(self, "nhc")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmd = Slurmd(self, "slurmd")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        # #### Charm lifecycle events #### #
        event_handler_bindings = {
            # #### Juju lifecycle events #### #
            self.on.install:
            self._on_install,

            # self.on.start:
            # self._on_check_status_and_write_config,
            self.on.config_changed:
            self._on_check_status_and_write_config,
            self.on.upgrade_charm:
            self._on_upgrade,

            # ######## Addons lifecycle events ######## #
            self._elasticsearch.on.elasticsearch_available:
            self._on_check_status_and_write_config,
            self._elasticsearch.on.elasticsearch_unavailable:
            self._on_check_status_and_write_config,
            self._grafana.on.grafana_available:
            self._on_grafana_available,
            self._influxdb.on.influxdb_available:
            self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
            self._on_check_status_and_write_config,
            self._nhc.on.nhc_bin_available:
            self._on_check_status_and_write_config,

            # ######## Slurm component lifecycle events ######## #
            self._slurmctld.on.slurmctld_available:
            self._on_check_status_and_write_config,
            self._slurmctld.on.slurmctld_unavailable:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_available:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_available:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_unavailable:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_available:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
            self._on_check_status_and_write_config,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install the slurm snap and set the munge key."""
        self._slurm_manager.install()
        self._stored.munge_key = self._slurm_manager.get_munge_key()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Upgrade the charm."""
        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config, defering upgrade.")
            event.defer()
            return

        self._slurm_manager.upgrade(slurm_config)

    def _on_grafana_available(self, event):
        """Create the grafana-source if we are the leader and have influxdb."""
        leader = self._is_leader()
        influxdb_info = self._get_influxdb_info()
        grafana = self._grafana

        if leader and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

    def _on_influxdb_available(self, event):
        """Create the grafana-source if we have all the things."""
        grafana = self._grafana
        influxdb_info = self._get_influxdb_info()
        leader = self._is_leader()

        if leader and grafana.is_joined and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

        self._on_check_status_and_write_config(event)

    def _on_check_status_and_write_config(self, event):
        """Check that we have what we need before we proceed."""
        if not self._check_status():
            event.defer()
            return

        # Generate the slurm_config
        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        self._slurmctld.set_slurm_config_on_app_relation_data(slurm_config, )
        self._slurmd.set_slurm_config_on_app_relation_data(slurm_config, )
        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config, )
        self._slurm_manager.render_config_and_restart({
            **slurm_config, 'munge_key':
            self.get_munge_key()
        })

    def _assemble_slurm_config(self):
        """Assemble and return the slurm config."""
        slurmctld_info = self._slurmctld.get_slurmctld_info()
        slurmdbd_info = self._slurmdbd.get_slurmdbd_info()
        slurmd_info = self._slurmd.get_slurmd_info()

        if not (slurmd_info and slurmctld_info and slurmdbd_info):
            return {}

        addons_info = self._assemble_addons()
        partitions_info = self._assemble_partitions(slurmd_info)

        logger.debug(addons_info)
        logger.debug(partitions_info)
        logger.debug(slurmctld_info)
        logger.debug(slurmdbd_info)

        return {
            'munge_key': self._stored.munge_key,
            'partitions': partitions_info,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **self.model.config,
        }

    def _assemble_partitions(self, slurmd_info):
        """Make any needed modifications to partition data."""
        slurmd_info_tmp = copy.deepcopy(slurmd_info)

        for partition in slurmd_info:

            # Deep copy the partition to a tmp var so we can modify it as
            # needed whilst not modifying the object we are iterating over.
            partition_tmp = copy.deepcopy(partition)
            # Extract the partition_name from the partition and from the charm
            # config.
            partition_name = partition['partition_name']
            default_partition_from_config = self.model.config.get(
                'default_partition')

            # Check that the default_partition isn't defined in the charm
            # config.
            # If the user hasn't provided a default partition, then we infer
            # the partition_default by defaulting to the first related slurmd
            # application.
            if not default_partition_from_config:
                if partition['partition_name'] ==\
                   self._stored.default_partition:
                    partition_tmp['partition_default'] = 'YES'
            else:
                if default_partition_from_config == partition_name:
                    partition_tmp['partition_default'] = 'YES'

            slurmd_info_tmp.remove(partition)
            slurmd_info_tmp.append(partition_tmp)

        return slurmd_info_tmp

    def _assemble_addons(self):
        """Assemble any addon components."""
        acct_gather = self._get_influxdb_info()
        elasticsearch_ingress = \
            self._elasticsearch.get_elasticsearch_ingress()
        nhc_info = self._nhc.get_nhc_info()
        prolog_epilog = self._prolog_epilog.get_prolog_epilog()

        ctxt = dict()

        if prolog_epilog:
            ctxt['prolog_epilog'] = prolog_epilog

        if acct_gather:
            ctxt['acct_gather'] = acct_gather
            acct_gather_custom = self.model.config.get('acct_gather_custom')
            if acct_gather_custom:
                ctxt['acct_gather']['custom'] = acct_gather_custom

        if nhc_info:
            ctxt['nhc'] = {
                'nhc_bin': nhc_info['nhc_bin'],
                'health_check_interval': nhc_info['health_check_interval'],
                'health_check_node_state': nhc_info['health_check_node_state'],
            }

        if elasticsearch_ingress:
            ctxt['elasticsearch_address'] = elasticsearch_ingress

        return ctxt

    def _check_status(self):
        """Check that the core components we need exist."""
        slurmctld_available = self._stored.slurmctld_available
        slurmdbd_available = self._stored.slurmdbd_available
        slurmd_available = self._stored.slurmd_available
        slurm_installed = self._stored.slurm_installed
        default_partition = self._stored.default_partition

        deps = [
            default_partition,
            slurmctld_available,
            slurmdbd_available,
            slurmd_available,
            slurm_installed,
        ]

        if not all(deps):
            if not slurmctld_available:
                self.unit.status = BlockedStatus("NEED RELATION TO SLURMCTLD")
            elif not slurmdbd_available:
                self.unit.status = BlockedStatus("NEED RELATION TO SLURMDBD")
            elif not slurmd_available:
                self.unit.status = BlockedStatus("NEED RELATION TO SLURMD")
            elif not slurm_installed:
                self.unit.status = BlockedStatus("SLURM NOT INSTALLED")
            else:
                self.unit.status = BlockedStatus("PARTITION NAME UNAVAILABLE")
            return False
        else:
            self.unit.status = ActiveStatus("")
            return True

    def _get_influxdb_info(self):
        """Return influxdb info."""
        return self._influxdb.get_influxdb_info()

    def _is_leader(self):
        return self.model.unit.is_leader()

    def get_munge_key(self):
        """Return the slurmdbd_info from stored state."""
        return self._stored.munge_key

    def get_default_partition(self):
        """Return self._stored.default_partition."""
        return self._stored.default_partition

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def set_slurmctld_available(self, slurmctld_available):
        """Set slurmctld_available."""
        self._stored.slurmctld_available = slurmctld_available

    def set_slurmdbd_available(self, slurmdbd_available):
        """Set slurmdbd_available."""
        self._stored.slurmdbd_available = slurmdbd_available

    def set_default_partition(self, partition_name):
        """Set self._stored.default_partition."""
        self._stored.default_partition = partition_name

    def set_slurmd_available(self, slurmd_available):
        """Set slurmd_available."""
        self._stored.slurmd_available = slurmd_available

    def set_slurmrestd_available(self, slurmrestd_available):
        """Set slurmrestd_available."""
        self._stored.slurmrestd_available = slurmrestd_available
예제 #3
0
파일: charm.py 프로젝트: ducks23/rev2
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            user_node_state=str(),
            partition_name=str(),
            config_available=False,
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.config_changed: self._on_send_slurmd_info,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_send_slurmd_info,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_config_changed(self, event):
        self.get_set_return_partition_name()
        self._on_send_slurmd_info(event)

    def _on_install(self, event):
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        self._slurm_manager.upgrade()

    def _on_set_node_state_action(self, event):
        """Set the node state."""
        self._stored.user_node_state = event.params["node-state"]
        self._on_send_slurm_info(event)

    def _on_send_slurmd_info(self, event):
        if self.framework.model.unit.is_leader():
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_slurmd_info_on_app_relation_data(
                        partition)
                    return
            event.defer()
            return

    def _on_check_status_and_write_config(self, event):
        if not self._check_status():
            event.defer()
            return
        slurm_config = dict(self._slurmd.get_slurm_config())
        self._slurm_manager.render_config_and_restart(slurm_config)
        self.unit.status = ActiveStatus("Slurmd Available")

    def _check_status(self):
        slurm_installed = self._stored.slurm_installed
        config_available = self._stored.config_available

        if not (slurm_installed and config_available):
            self.unit.status = BlockedStatus(
                "NEED RELATION TO SLURM CONFIGURATOR")
            return False
        else:
            return True

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self._stored.partition_name
        partition_config = self.model.config.get('partition-config')
        partition_state = self.model.config.get('partition-state')

        slurmd_info = self._assemble_slurmd_info()

        return {
            'inventory': slurmd_info,
            'partition_name': partition_name,
            'partition_state': partition_state,
            'partition_config': partition_config,
        }

    def _assemble_slurmd_info(self):
        """Apply mutations to nodes in the partition, return slurmd nodes."""
        slurmd_info = self._slurmd_peer.get_slurmd_info()
        if not slurmd_info:
            return None

        # If the user has set custom state for nodes
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if user_node_state:
            node_states = {
                item.split("=")[0]: item.split("=")[1]
                for item in user_node_state.split(",")
            }

            # Copy the slurmd_info returned from the the slurmd-peer relation
            # to a temporary variable to which we will make modifications.
            slurmd_info_tmp = copy.deepcopy(slurmd_info)

            # Iterate over the slurmd nodes in the partition and check
            # for nodes that need their state modified.
            for partition in slurmd_info:
                partition_tmp = copy.deepcopy(partition)
                for slurmd_node in partition['inventory']:
                    if slurmd_node['hostname'] in node_states.keys():
                        slurmd_node_tmp = copy.deepcopy(slurmd_node)
                        slurmd_node_tmp['state'] = \
                            node_states[slurmd_node['hostname']]
                        partition_tmp['inventory'].remove(slurmd_node)
                        partition_tmp['inventory'].append(slurmd_node_tmp)
                slurmd_info_tmp.remove(partition)
                slurmd_info_tmp.append(partition_tmp)
        else:
            slurmd_info_tmp = slurmd_info

        return slurmd_info_tmp

    def get_set_return_partition_name(self):
        """Set the partition name."""
        # Determine if a partition-name config exists, if so
        # ensure the partition_name known by the charm is consistent.
        # If no partition name has been specified then generate one.
        partition_name = self.model.config.get('partition-name')
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
예제 #4
0
class SlurmrestdCharm(CharmBase):
    """Operator charm responsible for lifecycle operations for slurmrestd."""

    _stored = StoredState()

    def __init__(self, *args):
        """Initialize charm and configure states and events to observe."""
        super().__init__(*args)
        self._stored.set_default(
            slurm_installed=False,
            slurmrestd_restarted=False,
        )
        self._slurm_manager = SlurmManager(self, "slurmrestd")
        self._slurmrestd = SlurmrestdRequires(self, 'slurmrestd')

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self._slurmrestd.on.config_available:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.munge_key_available:
            self._on_configure_munge_key,
            self._slurmrestd.on.restart_slurmrestd:
            self._on_restart_slurmrestd,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install(self.config["snapstore-channel"])
        self.unit.status = ActiveStatus("slurm installed")
        self._stored.slurm_installed = True

    def _on_upgrade(self, event):
        """Upgrade charm event handler."""
        slurm_config = self._check_status()
        snapstore_channel = self.config["snapstore-channel"]
        self._slurm_manager.upgrade(slurm_config, snapstore_channel)

    def _on_restart_slurmrestd(self, event):
        """Resart the slurmrestd component."""
        self._slurm_manager.restart_slurm_component()

    def _on_configure_munge_key(self, event):
        """Configure the munge key.

        1) Get the munge key from the stored state of the slurmrestd relation
        2) Write the munge key to the munge key path and chmod
        3) Restart munged
        4) Set munge_key_available in charm stored state
        """
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmrestd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _check_status(self):
        slurm_config = self._slurmrestd.get_stored_slurm_config()
        munge_key_available = self._stored.munge_key_available

        slurm_configurator_joined = self._slurmrestd.is_joined

        # Check and see if we have what we need for operation.
        if not slurm_configurator_joined:
            self.unit.status = BlockedStatus(
                "Needed relations: slurm-configurator")
            return None
        elif not (munge_key_available and slurm_config):
            self.unit.status = WaitingStatus("Waiting on: configuration")
            return None

        return dict(slurm_config)

    def _on_check_status_and_write_config(self, event):
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.render_slurm_configs(slurm_config)

        # Only restart slurmrestd the first time the node is brought up.
        if not self._stored.slurmrestd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmrestd_restarted = True

        self.unit.status = ActiveStatus("slurmrestd available")
예제 #5
0
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key=str(),
            slurmctld_controller_type=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmctld")

        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")

        event_handler_bindings = {
            self.on.install:
            self._on_install,
            self._slurmctld.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self._slurmctld_peer.on.slurmctld_peer_available:
            self._on_slurmctld_peer_available,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        self._slurm_manager.upgrade()

    def _on_slurmctld_peer_available(self, event):
        if self.framework.model.unit.is_leader():
            if self._slurmctld.is_joined:
                slurmctld_info = self._slurmctld_peer.get_slurmctld_info()
                if slurmctld_info:
                    self._slurmctld.set_slurmctld_info_on_app_relation_data(
                        slurmctld_info)
                    return
            event.defer()
            return

    def _on_check_status_and_write_config(self, event):
        if not self._check_status():
            event.defer()
            return

        slurm_config = self._slurmctld.get_slurm_config_from_relation()
        if not slurm_config:
            event.defer()
            return

        munge_key = self._stored.munge_key
        if not munge_key:
            event.defer()
            return

        self._slurm_manager.render_config_and_restart({
            **slurm_config, 'munge_key':
            munge_key
        })
        self.unit.status = ActiveStatus("Slurmctld Available")

    def _check_status(self):
        munge_key = self._stored.munge_key
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmctld.get_slurm_config_from_relation()

        if not (munge_key and slurm_installed and slurm_config):
            if not munge_key:
                self.unit.status = BlockedStatus(
                    "NEED RELATION TO SLURM CONFIGURATOR")
            elif not slurm_config:
                self.unit.status = BlockedStatus("WAITING ON SLURM CONFIG")
            else:
                self.unit.status = BlockedStatus("SLURM NOT INSTALLED")
            return False
        else:
            return True

    def set_munge_key(self, munge_key):
        """Set the munge_key in _stored state."""
        self._stored.munge_key = munge_key

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
예제 #6
0
class SlurmdCharm(CharmBase):
    """Operator charm responsible for facilitating slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Initialize charm state, and observe charm lifecycle events."""
        super().__init__(*args)

        self.config = self.model.config
        self.slurm_manager = SlurmManager(self, 'slurmd')
        self.slurmd = SlurmdProvides(self, "slurmd")

        self._stored.set_default(
            slurm_installed=False,
            slurm_config_available=False,
            slurm_config=dict(),
        )

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed: self._on_config_changed,
            self.on.upgrade_charm: self._on_upgrade,
            self.slurmd.on.slurmctld_available:
            self._on_render_config_and_restart,
            self.slurmd.on.slurmctld_unavailable:
            self._on_render_config_and_restart,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install the slurm scheduler as snap or tar file."""
        self.slurm_manager.install()
        self.unit.status = ActiveStatus("Slurm Installed")
        self._stored.slurm_installed = True

    def _on_upgrade(self, event):
        """Upgrade charm event handler."""
        slurm_config = dict(self._stored.slurm_config)
        self.slurm_manager.upgrade(slurm_config, resource=False)

    def _on_config_changed(self, event):
        self.slurmd.force_set_config_on_app_relation_data()

    def _on_render_config_and_restart(self, event):
        """Retrieve slurm_config from controller and write slurm.conf."""
        slurm_installed = self._stored.slurm_installed
        slurm_config_available = self._stored.slurm_config_available

        if (slurm_installed and slurm_config_available):
            # cast StoredState -> python dict
            slurm_config = dict(self._stored.slurm_config)
            self.slurm_manager.render_config_and_restart(slurm_config)
            self.unit.status = ActiveStatus("Slurmd Available")
        else:
            self.unit.status = BlockedStatus(
                "Blocked need relation to slurmctld."
            )
            event.defer()
            return

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def set_slurm_config_available(self, config_available):
        """Set slurm_config_available in local stored state."""
        self._stored.slurm_config_available = config_available

    def set_slurm_config(self, slurm_config):
        """Set the slurm_config in local stored state."""
        self._stored.slurm_config = slurm_config
예제 #7
0
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(
            db_info=dict(),
            slurmdbd_config=dict(),
            munge_key_available=False,
            slurm_installed=False,
        )

        self._db = MySQLClient(self, "db")
        self._nrpe = Nrpe(self, "nrpe-external-master")
        self._slurm_manager = SlurmManager(self, "slurmdbd")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available: self._write_config_and_restart_slurmdbd,
            self._slurmdbd_peer.on.slurmdbd_peer_available: self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_available: self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_unavailable: self._on_slurm_configurator_unavailable,
            self._slurmdbd.on.munge_key_available: self._on_munge_key_available,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install(self.config["snapstore-channel"])
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm snap successfully installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event."""
        self._slurm_manager.upgrade()

    def _on_munge_key_available(self, event):
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmdbd.get_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_slurm_configurator_unavailable(self, event):
        self._stored.munge_key_available = False
        self._check_status()

    def _check_status(self) -> bool:
        """Check that we have the things we need."""
        db_info = self._stored.db_info
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        deps = [
            slurmdbd_info,
            db_info,
            slurm_installed,
            munge_key_available,
        ]

        if not all(deps):
            if not db_info:
                self.unit.status = BlockedStatus(
                    "Need relation to MySQL."
                )
            elif not munge_key_available:
                self.unit.status = BlockedStatus(
                    "Need relation to slurm-configurator."
                )
            return False
        return True

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        # Ensure all pre-conditions are met with _check_status(), if not
        # defer the event.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()
        slurmdbd_stored_config = dict(self._stored.slurmdbd_config)

        slurmdbd_config = {
            **self.config,
            **slurmdbd_info,
            **db_info,
        }

        if slurmdbd_config != slurmdbd_stored_config:
            self._stored.slurmdbd_config = slurmdbd_config
            self._slurm_manager.render_slurm_configs(slurmdbd_config)
            self._slurm_manager.restart_slurm_component()

            # Only the leader can set relation data on the application.
            # Enforce that no one other then the leader trys to set
            # application relation data.
            if self.model.unit.is_leader():
                self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                    slurmdbd_config,
                )
        self.unit.status = ActiveStatus("slurmdbd available")

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info
예제 #8
0
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key_available=False,
            slurmd_restarted=False,
            user_node_state=str(),
            partition_name=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmd")

        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.start: self._on_check_status_and_write_config,
            self.on.config_changed: self._on_config_changed,
            self._slurmd_peer.on.slurmd_peer_available:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd_peer.on.slurmd_peer_departed:
            self._on_set_partition_info_on_app_relation_data,
            self._slurmd.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurm_config_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.restart_slurmd:
            self._on_restart_slurmd,
            self._slurmd.on.munge_key_available: self._on_write_munge_key,
            self.on.set_node_state_action: self._on_set_node_state_action,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install(self.config["snapstore-channel"])

        if self.model.unit.is_leader():
            self._get_set_partition_name()
            logger.debug(f"PARTITION_NAME: {self._stored.partition_name}")
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm installed")

    def _on_upgrade(self, event):
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.upgrade(
            slurm_config,
            self.config["snapstore-channel"]
        )

    def _on_config_changed(self, event):
        if self.model.unit.is_leader():
            self._get_set_partition_name()
            if self._check_status():
                self._on_set_partition_info_on_app_relation_data(
                    event
                )

    def _on_write_munge_key(self, event):
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmd.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_check_status_and_write_config(self, event):
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        # if slurm_config['configless']:
        #    slurmctld_hostname = slurm_config['active_controller_hostname']
        #    self._slurm_manager.configure_slurmctld_hostname(
        #        slurmctld_hostname
        #    )
        #    self._slurm_manager.restart_slurm_component()
        # else:

        # Ensure we aren't dealing with a StoredDict before trying
        # to render the slurm.conf.
        slurm_config = dict(slurm_config)
        self._slurm_manager.render_slurm_configs(slurm_config)

        # Only restart slurmd the first time the node is brought up.
        if not self._stored.slurmd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmd_restarted = True

        self.unit.status = ActiveStatus("slurmd available")

    def _on_restart_slurmd(self, event):
        self._slurm_manager.restart_slurm_component()

    def _check_status(self):
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmd.get_stored_slurm_config()

        slurmd_joined = self._slurmd.is_joined

        if not slurmd_joined:
            self.unit.status = BlockedStatus(
                "Needed relations: slurm-configurator"
            )
            return None

        elif not (munge_key_available and slurm_config and slurm_installed):
            self.unit.status = WaitingStatus(
                "Waiting on: configuration"
            )
            return None

        return dict(slurm_config)

    def _on_set_node_state_action(self, event):
        """Set the node state."""
        self._stored.user_node_state = event.params["node-state"]
        self._on_set_partition_info_on_app_relation_data(event)

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if self.framework.model.unit.is_leader():
            # If the relation with slurm-configurator exists then set our
            # partition info on the application relation data.
            # This handler shouldn't fire if the relation isn't made,
            # but add this extra check here just incase.
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_partition_info_on_app_relation_data(
                        partition
                    )
                    return
            event.defer()
            return

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self._stored.partition_name
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")

        slurmd_inventory = self._assemble_slurmd_inventory()

        return {
            "inventory": slurmd_inventory,
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    def _assemble_slurmd_inventory(self):
        """Apply mutations to nodes in the partition, return slurmd nodes."""
        slurmd_inventory = self._slurmd_peer.get_slurmd_inventory()
        if not slurmd_inventory:
            return None

        # If the user has set custom state for nodes
        # ensure we update the state for the targeted nodes.
        user_node_state = self._stored.user_node_state
        if user_node_state:
            node_states = {
                item.split("=")[0]: item.split("=")[1]
                for item in user_node_state.split(",")
            }

            # Copy the slurmd_inventory returned from the the slurmd-peer
            # relation to a temporary variable that we will use to
            # iterate over while we conditionally make modifications to the
            # original inventory.
            slurmd_inventory_tmp = copy.deepcopy(slurmd_inventory)

            # Iterate over the slurmd nodes in the partition and check
            # for nodes that need their state modified.
            for partition in slurmd_inventory_tmp:
                partition_tmp = copy.deepcopy(partition)
                for slurmd_node in partition["inventory"]:
                    if slurmd_node["hostname"] in node_states.keys():
                        slurmd_node_tmp = copy.deepcopy(slurmd_node)
                        slurmd_node_tmp["state"] = \
                            node_states[slurmd_node["hostname"]]
                        partition_tmp["inventory"].remove(slurmd_node)
                        partition_tmp["inventory"].append(slurmd_node_tmp)
                slurmd_inventory.remove(partition)
                slurmd_inventory.append(partition_tmp)

        return slurmd_inventory

    def _get_set_partition_name(self):
        """Set the partition name."""
        # Determine if a partition-name config exists, if so
        # ensure the self._stored.partition_name is consistent with the
        # supplied config.
        # If no partition name has been specified then generate one.
        partition_name = self.config.get("partition-name")
        if partition_name:
            if partition_name != self._stored.partition_name:
                self._stored.partition_name = partition_name
        elif not self._stored.partition_name:
            self._stored.partition_name = f"juju-compute-{random_string()}"
        return

    def get_partition_name(self):
        """Return the partition_name."""
        return self._stored.partition_name

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
예제 #9
0
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key_available=False,
            slurmctld_controller_type=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmctld")

        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self._slurmctld.on.slurm_config_available: self._on_check_status_and_write_config,
            self._slurmctld.on.scontrol_reconfigure: self._on_scontrol_reconfigure,
            self._slurmctld.on.restart_slurmctld: self._on_restart_slurmctld,
            self._slurmctld.on.munge_key_available: self._on_write_munge_key,
            self._slurmctld_peer.on.slurmctld_peer_available: self._on_slurmctld_peer_available,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install(self.config["snapstore-channel"])
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm snap successfully installed")

    def _on_upgrade(self, event):
        slurm_config = dict(self._check_status())
        snapstore_channel = self.config["snapstore-channel"]
        self._slurm_manager.upgrade(slurm_config, snapstore_channel)

    def _on_write_munge_key(self, event):
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmctld.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_slurmctld_peer_available(self, event):
        if self.framework.model.unit.is_leader():
            if self._slurmctld.is_joined:
                slurmctld_info = self._slurmctld_peer.get_slurmctld_info()
                if slurmctld_info:
                    self._slurmctld.set_slurmctld_info_on_app_relation_data(
                        slurmctld_info
                    )
                    return
            event.defer()
            return

    def _on_check_status_and_write_config(self, event):
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return

        self._slurm_manager.render_slurm_configs(dict(slurm_config))
        self.unit.status = ActiveStatus("slurmctld available")

    def _on_restart_slurmctld(self, event):
        self._slurm_manager.restart_slurm_component()

    def _on_scontrol_reconfigure(self, event):
        self._slurm_manager.slurm_cmd("scontrol", "reconfigure")

    def _check_status(self):
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmctld.get_stored_slurm_config()

        slurmctld_joined = self._slurmctld.is_joined

        if not slurmctld_joined:
            self.unit.status = BlockedStatus(
                "Relations needed: slurm-configurator"
            )
            return None

        elif not (munge_key_available and slurm_installed and slurm_config):
            self.unit.status = WaitingStatus(
                "Waiting on: configuration"
            )
            return None

        return slurm_config

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
예제 #10
0
class SlurmConfiguratorCharm(CharmBase):
    """Facilitate slurm configuration operations."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init charm, _stored defaults, interfaces and observe events."""
        super().__init__(*args)

        self._stored.set_default(
            munge_key=str(),
            override_slurm_conf=None,
            slurm_installed=False,
            slurmd_restarted=False,
            slurmctld_available=False,
            slurmdbd_available=False,
            slurmd_available=False,
            slurmrestd_available=False,
        )

        self._elasticsearch = Elasticsearch(self, "elasticsearch")
        self._grafana = GrafanaSource(self, "grafana-source")
        self._influxdb = InfluxDB(self, "influxdb-api")
        self._nhc = Nhc(self, "nhc")
        self._slurmrestd = Slurmrestd(self, "slurmrestd")
        self._slurm_manager = SlurmManager(self, "slurmd")
        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmd = Slurmd(self, "slurmd")
        self._prolog_epilog = PrologEpilog(self, "prolog-epilog")

        # #### Charm lifecycle events #### #
        event_handler_bindings = {
            # #### Juju lifecycle events #### #
            self.on.install:
            self._on_install,
            self.on.config_changed:
            self._on_check_status_and_write_config,
            self.on.upgrade_charm:
            self._on_upgrade,
            # ######## Addons lifecycle events ######## #
            self._elasticsearch.on.elasticsearch_available:
            self._on_check_status_and_write_config,
            self._elasticsearch.on.elasticsearch_unavailable:
            self._on_check_status_and_write_config,
            self._grafana.on.grafana_available:
            self._on_grafana_available,
            self._influxdb.on.influxdb_available:
            self._on_influxdb_available,
            self._influxdb.on.influxdb_unavailable:
            self._on_check_status_and_write_config,
            self._nhc.on.nhc_bin_available:
            self._on_check_status_and_write_config,
            # ######## Slurm component lifecycle events ######## #
            self._slurmctld.on.slurmctld_available:
            self._on_check_status_and_write_config,
            self._slurmctld.on.slurmctld_unavailable:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_available:
            self._on_check_status_and_write_config,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_available:
            self._on_check_status_and_write_config,
            self._slurmd.on.slurmd_unavailable:
            self._on_check_status_and_write_config,
            self._slurmrestd.on.slurmrestd_available:
            self._on_slurmrestd_available,
            self._slurmrestd.on.slurmrestd_unavailable:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_available:
            self._on_check_status_and_write_config,
            self._prolog_epilog.on.prolog_epilog_unavailable:
            self._on_check_status_and_write_config,
            # Actions
            self.on.scontrol_reconfigure_action:
            self._on_scontrol_reconfigure,
            self.on.get_slurm_conf_action:
            self._on_get_slurm_conf,
            self.on.set_slurm_conf_action:
            self._on_set_slurm_conf,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_scontrol_reconfigure(self, event):
        """Run 'scontrol reconfigure' on slurmctld."""
        self._slurmctld.scontrol_reconfigure()

    def _on_get_slurm_conf(self, event):
        """Return the slurm.conf."""
        # Determine if we have an override config.
        override_slurm_conf = self._stored.override_slurm_conf
        if override_slurm_conf:
            slurm_conf = override_slurm_conf
        else:
            slurm_conf = self._slurm_manager.get_slurm_conf()

        # Return the slurm.conf as the result of the action.
        event.set_results({"slurm.conf": slurm_conf})

    def _on_set_slurm_conf(self, event):
        """Set the override slurm.conf."""
        self._stored.override_slurm_conf = event.params["slurm-conf"]

    def _on_install(self, event):
        """Install the slurm snap and capture the munge key."""
        self._slurm_manager.install(self.config["snapstore-channel"])
        self._stored.munge_key = self._slurm_manager.get_munge_key()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm installed")

    def _on_upgrade(self, event):
        """Upgrade the charm."""
        slurm_config = \
            self._stored.override_slurm_conf or self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config, defering upgrade.")
            event.defer()
            return

        self._slurm_manager.upgrade(slurm_config,
                                    self.config["snapstore-channel"])

    def _on_grafana_available(self, event):
        """Create the grafana-source if we are the leader and have influxdb."""
        leader = self._is_leader()
        influxdb_info = self._get_influxdb_info()
        grafana = self._grafana

        if leader and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

    def _on_influxdb_available(self, event):
        """Create the grafana-source if we have all the things."""
        grafana = self._grafana
        influxdb_info = self._get_influxdb_info()
        leader = self._is_leader()

        if leader and grafana.is_joined and influxdb_info:
            grafana.set_grafana_source_info(influxdb_info)

        self._on_check_status_and_write_config(event)

    def _on_slurmrestd_available(self, event):
        """Set slurm_config on the relation when slurmrestd available."""
        if not self._check_status():
            event.defer()
            return

        # Generate the slurm_config
        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config, )
            self._slurmrestd.restart_slurmrestd()

    def _on_check_status_and_write_config(self, event):
        """Check that we have what we need before we proceed."""
        if not self._check_status():
            event.defer()
            return

        # Generate the slurm_config
        slurm_config = self._assemble_slurm_config()

        if not slurm_config:
            self.unit.status = BlockedStatus(
                "Cannot generate slurm_config - defering event.")
            event.defer()
            return

        self._slurmctld.set_slurm_config_on_app_relation_data(slurm_config, )
        self._slurmctld.restart_slurmctld()

        self._slurmd.set_slurm_config_on_app_relation_data(slurm_config, )

        if self._stored.slurmrestd_available:
            self._slurmrestd.set_slurm_config_on_app_relation_data(
                slurm_config, )
            self._slurmrestd.restart_slurmrestd()

        self._slurm_manager.render_slurm_configs(slurm_config)

        if not self._stored.slurmd_restarted:
            self._slurm_manager.restart_slurm_component()
            self._stored.slurmd_restarted = True

        self._slurmctld.scontrol_reconfigure()

    def _assemble_slurm_config(self):
        """Assemble and return the slurm config."""
        slurmctld_info = self._slurmctld.get_slurmctld_info()
        slurmdbd_info = self._slurmdbd.get_slurmdbd_info()
        slurmd_info = self._slurmd.get_slurmd_info()

        if not (slurmd_info and slurmctld_info and slurmdbd_info):
            return {}

        addons_info = self._assemble_addons()
        partitions_info = self._assemble_partitions(slurmd_info)

        logger.debug(addons_info)
        logger.debug(partitions_info)
        logger.debug(slurmctld_info)
        logger.debug(slurmdbd_info)

        return {
            "partitions": partitions_info,
            **slurmctld_info,
            **slurmdbd_info,
            **addons_info,
            **self.config,
        }

    def _assemble_partitions(self, slurmd_info):
        """Make any needed modifications to partition data."""
        slurmd_info_tmp = copy.deepcopy(slurmd_info)
        default_partition_from_config = self.config.get("default_partition")

        for partition in slurmd_info:
            # Deep copy the partition to a tmp var so we can modify it as
            # needed whilst not modifying the object we are iterating over.
            partition_tmp = copy.deepcopy(partition)
            # Extract the partition_name from the partition.
            partition_name = partition["partition_name"]

            # Check that the default_partition isn't defined in the charm
            # config.
            # If the user hasn't provided a default partition, then we infer
            # the partition_default by defaulting to the "configurator"
            # partition.
            if not default_partition_from_config:
                if partition["partition_name"] == "configurator":
                    partition_tmp["partition_default"] = "YES"
            else:
                if default_partition_from_config == partition_name:
                    partition_tmp["partition_default"] = "YES"

            slurmd_info_tmp.remove(partition)
            slurmd_info_tmp.append(partition_tmp)

        return slurmd_info_tmp

    def _assemble_addons(self):
        """Assemble any addon components."""
        acct_gather = self._get_influxdb_info()
        elasticsearch_ingress = self._elasticsearch.get_elasticsearch_ingress()
        nhc_info = self._nhc.get_nhc_info()
        prolog_epilog = self._prolog_epilog.get_prolog_epilog()

        ctxt = dict()

        if prolog_epilog:
            ctxt["prolog_epilog"] = prolog_epilog

        if acct_gather:
            ctxt["acct_gather"] = acct_gather
            acct_gather_custom = self.config.get("acct_gather_custom")
            if acct_gather_custom:
                ctxt["acct_gather"]["custom"] = acct_gather_custom

        if nhc_info:
            ctxt["nhc"] = {
                "nhc_bin": nhc_info["nhc_bin"],
                "health_check_interval": nhc_info["health_check_interval"],
                "health_check_node_state": nhc_info["health_check_node_state"],
            }

        if elasticsearch_ingress:
            ctxt["elasticsearch_address"] = elasticsearch_ingress

        return ctxt

    def _check_status(self):
        """Check that the core components we need exist."""
        slurm_component_statuses = {
            "slurmctld": {
                "available": self._stored.slurmctld_available,
                "joined": self._slurmctld.is_joined,
            },
            "slurmd": {
                "available": self._stored.slurmd_available,
                "joined": self._slurmd.is_joined,
            },
            "slurmdbd": {
                "available": self._stored.slurmdbd_available,
                "joined": self._slurmdbd.is_joined,
            },
        }

        relations_needed = []
        waiting_on = []

        msg = str()

        for slurm_component in slurm_component_statuses.keys():
            if not slurm_component_statuses[slurm_component]["joined"]:
                relations_needed.append(slurm_component)
            elif not slurm_component_statuses[slurm_component]["available"]:
                waiting_on.append(slurm_component)

        relations_needed_len = len(relations_needed)
        waiting_on_len = len(waiting_on)

        if relations_needed_len > 0:
            msg += f"Needed relations: {','.join(relations_needed)} "

        if waiting_on_len > 0:
            msg += f"Waiting on: {','.join(waiting_on)}"

        # Using what we have gathered about the status of each slurm component,
        # determine the application status.
        if relations_needed_len > 0:
            self.unit.status = BlockedStatus(msg)
        elif waiting_on_len > 0:
            self.unit.status = WaitingStatus(msg)
        else:
            self.unit.status = ActiveStatus("slurm-configurator available")
            return True
        return False

    def _get_influxdb_info(self):
        """Return influxdb info."""
        return self._influxdb.get_influxdb_info()

    def _is_leader(self):
        return self.model.unit.is_leader()

    def get_munge_key(self):
        """Return the slurmdbd_info from stored state."""
        return self._stored.munge_key

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def set_slurmctld_available(self, slurmctld_available):
        """Set slurmctld_available."""
        self._stored.slurmctld_available = slurmctld_available

    def set_slurmdbd_available(self, slurmdbd_available):
        """Set slurmdbd_available."""
        self._stored.slurmdbd_available = slurmdbd_available

    def set_slurmd_available(self, slurmd_available):
        """Set slurmd_available."""
        self._stored.slurmd_available = slurmd_available

    def set_slurmrestd_available(self, slurmrestd_available):
        """Set slurmrestd_available."""
        self._stored.slurmrestd_available = slurmrestd_available
예제 #11
0
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(munge_key=str())
        self._stored.set_default(db_info=dict())
        self._stored.set_default(slurm_installed=False)

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmdbd")

        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")

        self._db = MySQLClient(self, "db")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurmdbd_unavailable,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event."""
        self._slurm_manager.upgrade()

    def _on_leader_elected(self, event):
        self._slurmdbd_peer._on_relation_changed(event)

    def _on_slurmdbd_unavailable(self, event):
        self._check_status()

    def _check_status(self) -> bool:
        """Check that we have the things we need."""
        db_info = self._stored.db_info
        munge_key = self._stored.munge_key
        slurm_installed = self._stored.slurm_installed
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        deps = [
            slurmdbd_info,
            db_info,
            slurm_installed,
            munge_key,
        ]

        if not all(deps):
            if not db_info:
                self.unit.status = BlockedStatus("Need relation to MySQL.")
            elif not munge_key:
                self.unit.status = BlockedStatus(
                    "Need relation to slurm-configurator.")
            return False
        return True

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        slurmdbd_config = {
            'munge_key': self._stored.munge_key,
            **self.model.config,
            **slurmdbd_info,
            **db_info,
        }
        if self.model.unit.is_leader():
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data(
                slurmdbd_info)
        self._slurm_manager.render_config_and_restart(slurmdbd_config)
        self.unit.status = ActiveStatus("Slurmdbd Available")

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def set_munge_key(self, munge_key):
        """Set the munge key in the stored state."""
        self._stored.munge_key = munge_key

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info
예제 #12
0
class SlurmdbdCharm(CharmBase):
    """Slurmdbd Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Set the default class attributes."""
        super().__init__(*args)

        self._stored.set_default(munge_key=str())
        self._stored.set_default(db_info=dict())
        self._stored.set_default(slurm_installed=False)

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmdbd")

        self._slurmdbd = Slurmdbd(self, "slurmdbd")
        self._slurmdbd_peer = SlurmdbdPeer(self, "slurmdbd-peer")

        self._db = MySQLClient(self, "db")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.config_changed: self._write_config_and_restart_slurmdbd,
            self._db.on.database_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd_peer.on.slurmdbd_peer_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_available:
            self._write_config_and_restart_slurmdbd,
            self._slurmdbd.on.slurmdbd_unavailable:
            self._on_slurmdbd_unavailable,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        self._slurm_manager.install()
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("Slurm Installed")

    def _on_upgrade(self, event):
        """Handle upgrade charm event."""
        self._slurm_manager.upgrade()

    def _on_leader_elected(self, event):
        self._slurmdbd_peer._on_relation_changed(event)

    def _on_slurmdbd_unavailable(self, event):
        self._check_status()

    def _check_status(self) -> bool:
        """Check that we have the things we need."""
        db_info = self._stored.db_info
        munge_key = self._stored.munge_key
        slurm_installed = self._stored.slurm_installed
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        deps = [
            slurmdbd_info,
            db_info,
            slurm_installed,
            munge_key,
        ]

        if not all(deps):
            if not db_info:
                self.unit.status = BlockedStatus("Need relation to MySQL.")
            elif not munge_key:
                self.unit.status = BlockedStatus(
                    "Need relation to slurm-configurator.")
            return False
        return True

    def _write_config_and_restart_slurmdbd(self, event):
        """Check for prereqs before writing config/restart of slurmdbd."""
        # Ensure all pre-conditions are met with _check_statu(), if not
        # defer the event.
        if not self._check_status():
            event.defer()
            return

        db_info = self._stored.db_info
        slurmdbd_info = self._slurmdbd_peer.get_slurmdbd_info()

        slurmdbd_config = {
            'munge_key': self._stored.munge_key,
            **self.model.config,
            **slurmdbd_info,
            **db_info,
        }

        self._slurm_manager.render_config_and_restart(slurmdbd_config)
        logger.debug("rendering config and restarting")
        # Only the leader can set relation data on the application.
        # Enforce that no one other then the leader trys to set
        # application relation data.
        if self.model.unit.is_leader():
            self._slurmdbd.set_slurmdbd_info_on_app_relation_data({
                # Juju, and subsequently the operator framework do not
                # emit relation-changed events if data hasn't actually
                # changed on the other side of the relation. Even if we set
                # the data multiple times, it doesn't mean anything unless
                # the data being set is different then what already exists
                # in the relation data.
                #
                # We use 'slurmdbd_info_id' to ensure the slurmdbd_info
                # is unique each time it is set on the application relation
                # data. This is needed so that that related applications
                # (namely slurm-configurator) will observe a
                # relation-changed event.
                #
                # This event (_write_config_and_restart_slurmdbd) may be
                # invoked multiple times once _check_status() returns True
                # (aka pre-conditions are met that account for the deffered
                # invocations.)
                # This means that the same slurmdbd_info data may be set on
                # application data multiple times and slurmdbd may be
                # reconfigured and restarted while slurmctld and the rest
                # of the stack are trying to come up and create the clustr.
                #
                # We need slurm-configurator to emit the relation-changed
                # event for the slurmdbd relation every time data is set,
                # not just when data has changed.
                # slurm-configurator need to re-emit its chain
                # of observed events to ensure all services end up getting
                # reconfigured *and* restarted *after* slurmdbd, for each
                # time that slurmdbd gets reconfigured and restarted.
                #
                # For this reason, 'slurmdbd_info_id' only
                # matters in the context of making sure the application
                # relation data actually changes so that relation-changed
                # event is observed on the other side.
                'slurmdbd_info_id':
                str(uuid.uuid4()),
                **slurmdbd_info
            })
        self.unit.status = ActiveStatus("Slurmdbd Available")

    def get_port(self):
        """Return the port from slurm-ops-manager."""
        return self._slurm_manager.port

    def get_hostname(self):
        """Return the hostname from slurm-ops-manager."""
        return self._slurm_manager.hostname

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def set_munge_key(self, munge_key):
        """Set the munge key in the stored state."""
        self._stored.munge_key = munge_key

    def set_db_info(self, db_info):
        """Set the db_info in the stored state."""
        self._stored.db_info = db_info