class RabbitMQOperatorPeers(Object):
    """RabbitMQ Operator peer relation interface.

    Shares the operator password, the operator-user-created marker and
    the Erlang cookie between units through application relation data,
    emitting events as the data becomes available.
    """

    on = RabbitMQOperatorPeersEvents()
    state = StoredState()

    # Application relation-data bucket keys.
    # NOTE(review): "******" looks like a redacted literal rather than a
    # real key name -- confirm the intended key upstream.
    OPERATOR_PASSWORD = "******"
    OPERATOR_USER_CREATED = "operator_user_created"
    ERLANG_COOKIE = "erlang_cookie"

    def __init__(self, charm, relation_name):
        """Observe lifecycle events of the named peer relation."""
        super().__init__(charm, relation_name)
        self.relation_name = relation_name
        self.framework.observe(charm.on[relation_name].relation_created,
                               self.on_created)
        self.framework.observe(charm.on[relation_name].relation_changed,
                               self.on_changed)
        self.framework.observe(charm.on[relation_name].relation_broken,
                               self.on_broken)

    @property
    def peers_rel(self):
        """The peer relation object, or None if not yet created."""
        return self.framework.model.get_relation(self.relation_name)

    def on_created(self, event):
        """Emit ``connected`` once the peer relation exists."""
        logging.debug("RabbitMQOperatorPeers on_created")
        self.on.connected.emit()

    def on_broken(self, event):
        """Emit the gone-away event when the peer relation is broken."""
        logging.debug("RabbitMQOperatorPeers on_broken")
        self.on.gonewaway.emit()

    def on_changed(self, event):
        """Emit ``ready`` once both shared secrets are present."""
        logging.debug("RabbitMQOperatorPeers on_changed")
        if self.operator_password and self.erlang_cookie:
            self.on.ready.emit()

    def set_operator_password(self, password: str):
        """Store the operator password in application relation data."""
        logging.debug("Setting operator password")
        self.peers_rel.data[self.peers_rel.app][
            self.OPERATOR_PASSWORD] = password

    def set_operator_user_created(self, user: str):
        """Record that the operator user has been created."""
        logging.debug("Setting operator user created")
        self.peers_rel.data[self.peers_rel.app][
            self.OPERATOR_USER_CREATED] = user

    def set_erlang_cookie(self, cookie: str):
        """Set Erlang cookie for RabbitMQ clustering."""
        logging.debug("Setting erlang cookie")
        self.peers_rel.data[self.peers_rel.app][self.ERLANG_COOKIE] = cookie

    def store_password(self, username: str, password: str):
        """Store username and password."""
        logging.debug(f"Storing password for {username}")
        self.peers_rel.data[self.peers_rel.app][username] = password

    def retrieve_password(self, username: str) -> str:
        """Retrieve persisted password for provided username.

        Returns None when the relation does not exist yet or no password
        has been stored for the user.  (Previously a missing key was
        stringified and returned as the literal string "None".)
        """
        if not self.peers_rel:
            return None
        password = self.peers_rel.data[self.peers_rel.app].get(username)
        return str(password) if password is not None else None

    @property
    def operator_password(self) -> str:
        """Operator password from app relation data, or None."""
        if not self.peers_rel:
            return None
        return self.peers_rel.data[self.peers_rel.app].get(
            self.OPERATOR_PASSWORD)

    @property
    def operator_user_created(self) -> str:
        """Operator-user-created marker from app relation data, or None."""
        if not self.peers_rel:
            return None
        return self.peers_rel.data[self.peers_rel.app].get(
            self.OPERATOR_USER_CREATED)

    @property
    def erlang_cookie(self) -> str:
        """Erlang cookie from app relation data, or None."""
        if not self.peers_rel:
            return None
        return self.peers_rel.data[self.peers_rel.app].get(self.ERLANG_COOKIE)
class AmfCharm(CharmBase):
    """AMF charm events class definition."""

    state = StoredState()

    def __init__(self, *args) -> None:
        """AMF charm constructor."""
        super().__init__(*args)

        # Internal state initialization
        self.state.set_default(pod_spec=None)

        self.image = OCIImageResource(self, "image")

        # Registering regular events
        self.framework.observe(self.on.config_changed, self.configure_pod)

        # Registering required relation changed events
        self.framework.observe(self.on.nrf_relation_changed,
                               self._on_nrf_relation_changed)

        # Registering required relation broken events
        self.framework.observe(self.on.nrf_relation_broken,
                               self._on_nrf_relation_broken)

        # -- initialize states --
        self.state.set_default(nrf_host=None)

    def publish_amf_info(self, _=None) -> None:
        """Publish the AMF hostname on every "amf" relation (leader only)."""
        if not self.unit.is_leader():
            return
        # Subscript the relations mapping directly (not __getitem__) and
        # write to each relation object without re-fetching it through
        # get_relation -- they are the same relation instances.
        for relation in self.model.relations["amf"]:
            relation.data[self.model.app]["hostname"] = self.model.app.name

    def _on_nrf_relation_changed(self, event: EventBase) -> None:
        """Read information about the NRF relation.

        Args:
            event (EventBase): NRF relation event.
        """
        if event.app not in event.relation.data:
            return
        nrf_host = event.relation.data[event.app].get("hostname")
        # Only reconfigure when the hostname actually changed.
        if nrf_host and self.state.nrf_host != nrf_host:
            self.state.nrf_host = nrf_host
            self.configure_pod()

    def _on_nrf_relation_broken(self, _=None) -> None:
        """Clear stored NRF data when the relation goes away."""
        self.state.nrf_host = None
        self.configure_pod()

    def _missing_relations(self) -> str:
        """Check whether any required relations are missing.

        Returns:
            str: comma-separated names of missing relations.
        """
        data_status = {"nrf": self.state.nrf_host}
        missing_relations = [k for k, v in data_status.items() if not v]
        return ", ".join(missing_relations)

    @property
    def relation_state(self) -> Dict[str, Any]:
        """Collect relation state configuration for pod spec assembly.

        Returns:
            Dict[str, Any]: relation state information.
        """
        relation_state = {"nrf_host": self.state.nrf_host}
        return relation_state

    def configure_pod(self, _=None) -> None:
        """Assemble the pod spec and apply it, if possible."""
        missing = self._missing_relations()
        if missing:
            status = "Waiting for {0} relation{1}"
            self.unit.status = BlockedStatus(
                status.format(missing, "s" if "," in missing else ""))
            return

        if not self.unit.is_leader():
            self.unit.status = ActiveStatus("ready")
            return

        self.unit.status = MaintenanceStatus("Assembling pod spec")

        # Fetch image information
        try:
            self.unit.status = MaintenanceStatus("Fetching image information")
            image_info = self.image.fetch()
        except OCIImageResourceError:
            self.unit.status = BlockedStatus(
                "Error fetching image information")
            return

        try:
            pod_spec = make_pod_spec(
                image_info,
                self.model.config,
                self.model.app.name,
                self.relation_state,
            )
        except ValueError as exc:
            logger.exception("Config/Relation data validation error")
            self.unit.status = BlockedStatus(str(exc))
            return

        # Only push a new spec when it actually changed.
        if self.state.pod_spec != pod_spec:
            self.model.pod.set_spec(pod_spec)
            self.state.pod_spec = pod_spec

        self.unit.status = ActiveStatus("ready")
        self.publish_amf_info()
class RabbitMQAMQPRequires(Object):
    """Requires side of the AMQP interface.

    Requests a user/vhost from the RabbitMQ server application and
    emits events as servers appear and credentials become available.
    """

    on = RabbitMQAMQPServerEvents()
    _stored = StoredState()

    def __init__(self, charm, relation_name):
        """Observe joined/changed/broken on the AMQP relation."""
        super().__init__(charm, relation_name)
        self.charm = charm
        self.relation_name = relation_name
        self.framework.observe(
            self.charm.on[relation_name].relation_joined,
            self._on_amqp_relation_joined
        )
        self.framework.observe(
            self.charm.on[relation_name].relation_changed,
            self._on_amqp_relation_changed
        )
        self.framework.observe(
            self.charm.on[relation_name].relation_broken,
            self._on_amqp_relation_broken
        )

    @property
    def _amqp_rel(self):
        """The AMQP relation."""
        return self.framework.model.get_relation(self.relation_name)

    def _on_amqp_relation_joined(self, event):
        """AMQP relation joined."""
        logging.debug("RabbitMQAMQPRequires on_joined")
        self.event = event
        self.on.has_amqp_servers.relation_event = event
        self.on.has_amqp_servers.emit()
        # TODO Move to charm code once the emit has this event attached
        self.request_access(event, self.charm.username, self.charm.vhost)

    def _on_amqp_relation_changed(self, event):
        """AMQP relation changed."""
        logging.debug("RabbitMQAMQPRequires on_changed")
        self.event = event
        self.request_access(event, self.charm.username, self.charm.vhost)
        # Servers are ready once they have published a password.
        if self.password(event):
            self.on.ready_amqp_servers.emit()

    def _on_amqp_relation_broken(self, event):
        """AMQP relation broken."""
        # TODO clear data on the relation
        # Fixed: previously logged the misleading message "on_departed".
        logging.debug("RabbitMQAMQPRequires on_broken")

    def password(self, event):
        """Return the AMQP password from the server side of the relation."""
        return event.relation.data[self._amqp_rel.app].get("password")

    def request_access(self, event, username, vhost):
        """Request access to the AMQP server.

        :param event: The current event
        :type EventsBase
        :param username: The requested username
        :type username: str
        :param vhost: The requested vhost
        :type vhost: str
        :returns: None
        :rtype: None
        """
        logging.debug("Requesting AMQP user and vhost")
        event.relation.data[self.charm.app]['username'] = username
        event.relation.data[self.charm.app]['vhost'] = vhost
class PrometheusCharm(CharmBase):
    """A Juju Charm for Prometheus.

    Builds a Kubernetes pod spec from charm config and relation data
    (alertmanager, grafana-source) and applies it on the leader unit.
    """
    stored = StoredState()

    def __init__(self, *args):
        logger.debug('Initializing Charm')

        super().__init__(*args)

        # Map of alertmanager relation id -> alerting config blob.
        self.stored.set_default(alertmanagers=dict())

        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on.stop, self._on_stop)
        self.framework.observe(self.on['alertmanager'].relation_changed,
                               self.on_alertmanager_changed)
        self.framework.observe(self.on['alertmanager'].relation_departed,
                               self.on_alertmanager_departed)
        self.framework.observe(self.on['grafana-source'].relation_changed,
                               self.on_grafana_changed)

    def _on_config_changed(self, _):
        """Set a new Juju pod specification
        """
        self.configure_pod()

    def _on_stop(self, _):
        """Mark unit is inactive
        """
        self.unit.status = MaintenanceStatus('Pod is terminating.')

    def on_grafana_changed(self, event):
        """Provide Grafana with data source information
        """
        event.relation.data[self.unit]['port'] = str(
            self.model.config['advertised-port'])
        event.relation.data[self.unit]['source-type'] = 'prometheus'

    def on_alertmanager_changed(self, event):
        """Set an alertmanager configuration
        """
        if not self.unit.is_leader():
            logger.debug('{} is not leader. '
                         'Not handling alertmanager change.'.format(
                             self.unit.name))
            return

        if event.unit is None:
            # NOTE(review): pop() without a default raises KeyError if the
            # relation id was never recorded -- confirm this cannot happen.
            self.stored.alertmanagers.pop(event.relation.id)
            logger.warning('Got null event unit on alertmanager changed')
            return

        alerting_config = event.relation.data[event.unit].get(
            'alerting_config', {})
        logger.debug('Received alerting config: {}'.format(alerting_config))

        if not alerting_config:
            logger.warning(
                'Got empty alerting config for relation id {}'.format(
                    event.relation.id))
            return

        self.stored.alertmanagers.update({event.relation.id: alerting_config})
        self.configure_pod()

    def on_alertmanager_departed(self, event):
        """Remove an alertmanager configuration
        """
        if not self.unit.is_leader():
            logger.debug('{} is not leader. '
                         'Not handling alertmanager departed.'.format(
                             self.unit.name))
            return

        self.stored.alertmanagers.pop(event.relation.id)
        self.configure_pod()

    def _cli_args(self):
        """Construct command line arguments for Prometheus
        """
        config = self.model.config
        args = [
            '--config.file=/etc/prometheus/prometheus.yml',
            '--storage.tsdb.path=/prometheus',
            '--web.enable-lifecycle',
            '--web.console.templates=/usr/share/prometheus/consoles',
            '--web.console.libraries=/usr/share/prometheus/console_libraries'
        ]

        # get log level
        allowed_log_levels = ['debug', 'info', 'warn', 'error', 'fatal']
        if config.get('log-level'):
            log_level = config['log-level'].lower()
        else:
            log_level = 'info'

        # If log level is invalid set it to debug
        if log_level not in allowed_log_levels:
            # NOTE(review): uses the root `logging` module while the rest of
            # the class uses the module `logger` -- likely unintentional.
            logging.error('Invalid loglevel: {0} given, {1} allowed. '
                          'defaulting to DEBUG loglevel.'.format(
                              log_level, '/'.join(allowed_log_levels)))
            log_level = 'debug'

        # set log level
        args.append('--log.level={0}'.format(log_level))

        # Expose Prometheus Adminstration API only if requested
        if config.get('web-enable-admin-api'):
            args.append('--web.enable-admin-api')

        # User specified Prometheus web page title
        if config.get('web-page-title'):
            # TODO: Validate and sanitize input
            args.append('--web.page-title="{0}"'.format(
                config['web-page-title']))

        # Enable time series database compression
        if config.get('tsdb-wal-compression'):
            args.append('--storage.tsdb.wal-compression')

        # Set time series retention time
        if config.get('tsdb-retention-time') and self._is_valid_timespec(
                config['tsdb-retention-time']):
            args.append('--storage.tsdb.retention.time={}'.format(
                config['tsdb-retention-time']))

        # Set maximum number of connections to prometheus server
        if config.get('web-max-connections'):
            args.append('--web.max-connections={}'.format(
                config['web-max-connections']))

        # Set maximum number of pending alerts
        if config.get('alertmanager-notification-queue-capacity'):
            args.append('--alertmanager.notification-queue-capacity={}'.format(
                config['alertmanager-notification-queue-capacity']))

        # Set timeout for alerts
        if config.get('alertmanager-timeout') and self._is_valid_timespec(
                config['alertmanager-timeout']):
            args.append('--alertmanager.timeout={}'.format(
                config['alertmanager-timeout']))

        logger.debug("CLI args: {0}".format(' '.join(args)))

        return args

    def _is_valid_timespec(self, timeval):
        """Is a time interval unit and value valid
        """
        if not timeval:
            return False

        # Split "<int><unit>", e.g. "15d" -> ("15", "d").
        time, unit = timeval[:-1], timeval[-1]

        if unit not in ['y', 'w', 'd', 'h', 'm', 's']:
            logger.error('Invalid unit {} in time spec'.format(unit))
            return False

        try:
            int(time)
        except ValueError:
            logger.error('Can not convert time {} to integer'.format(time))
            return False

        if not int(time) > 0:
            logger.error('Expected positive time spec but got {}'.format(time))
            return False

        return True

    def _are_valid_labels(self, json_data):
        """Are Prometheus external labels valid
        """
        if not json_data:
            return False

        try:
            labels = json.loads(json_data)
        except (ValueError, TypeError):
            logger.error(
                'Can not parse external labels : {}'.format(json_data))
            return False

        if not isinstance(labels, dict):
            logger.error(
                'Expected label dictionary but got : {}'.format(labels))
            return False

        # All keys and values must be strings.
        for key, value in labels.items():
            if not isinstance(key, str) or not isinstance(value, str):
                logger.error('External label keys/values must be strings')
                return False

        return True

    def _external_labels(self):
        """Extract external labels for Prometheus from configuration
        """
        config = self.model.config
        labels = {}

        if config.get('external-labels') and self._are_valid_labels(
                config['external-labels']):
            labels = json.loads(config['external-labels'])

        return labels

    def _prometheus_global_config(self):
        """Construct Prometheus global configuration
        """
        config = self.model.config
        global_config = {}

        labels = self._external_labels()
        if labels:
            global_config['external_labels'] = labels

        if config.get('scrape-interval') and self._is_valid_timespec(
                config['scrape-interval']):
            global_config['scrape_interval'] = config['scrape-interval']

        if config.get('scrape-timeout') and self._is_valid_timespec(
                config['scrape-timeout']):
            global_config['scrape_timeout'] = config['scrape-timeout']

        if config.get('evaluation-interval') and self._is_valid_timespec(
                config['evaluation-interval']):
            global_config['evaluation_interval'] = config[
                'evaluation-interval']

        return global_config

    def _alerting_config(self):
        """Construct Prometheus alerting configuration
        """
        alerting_config = ''

        if len(self.stored.alertmanagers) < 1:
            logger.debug('No alertmanagers available')
            return alerting_config

        # Only a single alertmanager is supported; extras are ignored.
        if len(self.stored.alertmanagers) > 1:
            logger.warning('More than one altermanager found. Using first!')

        manager = list(self.stored.alertmanagers.keys())[0]

        alerting_config = self.stored.alertmanagers.get(manager, '')

        return alerting_config

    def _prometheus_config(self):
        """Construct Prometheus configuration
        """
        config = self.model.config

        scrape_config = {
            'global': self._prometheus_global_config(),
            'scrape_configs': []
        }

        alerting_config = self._alerting_config()
        if alerting_config:
            scrape_config['alerting'] = alerting_config

        # By default only monitor prometheus server itself
        default_config = {
            'job_name': 'prometheus',
            'scrape_interval': '5s',
            'scrape_timeout': '5s',
            'metrics_path': '/metrics',
            'honor_timestamps': True,
            'scheme': 'http',
            'static_configs': [{
                'targets': ['localhost:{}'.format(config['advertised-port'])]
            }]
        }
        scrape_config['scrape_configs'].append(default_config)

        # If monitoring of k8s is requested gather all scraping configuration for k8s
        if config.get('monitor-k8s'):
            with open('config/prometheus-k8s.yml') as yaml_file:
                k8s_scrape_configs = yaml.safe_load(yaml_file).get(
                    'scrape_configs', [])
            for k8s_config in k8s_scrape_configs:
                scrape_config['scrape_configs'].append(k8s_config)

        logger.debug('Prometheus config : {}'.format(scrape_config))

        return yaml.dump(scrape_config)

    def _build_pod_spec(self):
        """Construct a Juju pod specification for Prometheus
        """
        logger.debug('Building Pod Spec')
        config = self.model.config
        spec = {
            'containers': [{
                'name': self.app.name,
                'imageDetails': {
                    'imagePath': config['prometheus-image-path'],
                    'username': config.get('prometheus-image-username', ''),
                    'password': config.get('prometheus-image-password', '')
                },
                'args': self._cli_args(),
                'readinessProbe': {
                    'httpGet': {
                        'path': '/-/ready',
                        'port': config['advertised-port']
                    },
                    'initialDelaySeconds': 10,
                    'timeoutSeconds': 30
                },
                'livenessProbe': {
                    'httpGet': {
                        'path': '/-/healthy',
                        'port': config['advertised-port']
                    },
                    'initialDelaySeconds': 30,
                    'timeoutSeconds': 30
                },
                'ports': [{
                    'containerPort': config['advertised-port'],
                    'name': 'prometheus-http',
                    'protocol': 'TCP'
                }],
                'files': [{
                    'name': 'prometheus-config',
                    'mountPath': '/etc/prometheus',
                    'files': {
                        'prometheus.yml': self._prometheus_config()
                    }
                }]
            }]
        }

        return spec

    def _check_config(self):
        """Identify missing but required items in configuation

        :returns: list of missing configuration items (configuration keys)
        """
        logger.debug('Checking Config')
        config = self.model.config
        missing = []

        if not config.get('prometheus-image-path'):
            missing.append('prometheus-image-path')

        # A private registry username requires a password too.
        if config.get('prometheus-image-username') \
                and not config.get('prometheus-image-password'):
            missing.append('prometheus-image-password')

        return missing

    def configure_pod(self):
        """Setup a new Prometheus pod specification
        """
        logger.debug('Configuring Pod')
        missing_config = self._check_config()
        if missing_config:
            logger.error('Incomplete Configuration : {}. '
                         'Application will be blocked.'.format(missing_config))
            self.unit.status = \
                BlockedStatus('Missing configuration: {}'.format(missing_config))
            return

        # Only the leader sets the pod spec.
        if not self.unit.is_leader():
            self.unit.status = ActiveStatus('Prometheus unit is ready')
            return

        self.unit.status = MaintenanceStatus('Setting pod spec.')
        pod_spec = self._build_pod_spec()

        self.model.pod.set_spec(pod_spec)
        self.app.status = ActiveStatus('Prometheus Application is ready')
        self.unit.status = ActiveStatus('Prometheus leader unit is ready')
class SlurmdRequires(Object):
    """Requires side of the slurmd relation.

    Tracks slurmd availability, assembles node/partition data from
    related slurmd units, and pushes the rendered slurm_config onto
    application relation data.
    """

    on = SlurmdRequiresEvents()
    _state = StoredState()

    def __init__(self, charm, relation_name):
        """Set self._relation_name and self.charm."""
        super().__init__(charm, relation_name)
        self._charm = charm
        self._relation_name = relation_name

        # Our own ingress-address, captured at relation-created time.
        self._state.set_default(ingress_address=None)

        self.framework.observe(
            self._charm.on[self._relation_name].relation_created,
            self._on_relation_created)
        self.framework.observe(
            self._charm.on[self._relation_name].relation_changed,
            self._on_relation_changed)
        self.framework.observe(
            self._charm.on[self._relation_name].relation_broken,
            self._on_relation_broken)
        self.framework.observe(
            self._charm.on[self._relation_name].relation_departed,
            self._on_relation_departed)

    def _on_relation_created(self, event):
        # Record this unit's ingress-address for later config assembly.
        unit_data = event.relation.data[self.model.unit]
        self._state.ingress_address = unit_data['ingress-address']

    def _on_relation_changed(self, event):
        """Check for slurmdbd and slurmd, write config, set relation data."""
        logger.debug('_on_relation_changed(): entering')
        if len(self.framework.model.relations['slurmd']) > 0:
            if not self._charm.is_slurmd_available():
                self._charm.set_slurmd_available(True)
            self.on.slurmd_available.emit()
        else:
            self._charm.unit.status = BlockedStatus("Need > 0 units of slurmd")
            event.defer()
            return

    def _on_relation_departed(self, event):
        """Account for relation departed activity."""
        # If no active slurmd units remain, flag slurmd unavailable.
        relations = len(_get_slurmd_active_units())
        logger.debug(f"number of slurmd relations: {relations}")
        if relations < 1:
            self._charm.set_slurmd_available(False)
            self.on.slurmd_departed.emit()

    def _on_relation_broken(self, event):
        """Account for relation broken activity."""
        pass

    def _get_partitions(self, node_data):
        """Parse the node_data and return the hosts -> partition mapping."""
        part_dict = collections.defaultdict(dict)
        for node in node_data:
            # Group hostnames under their partition name.
            part_dict[node['partition_name']].setdefault('hosts', [])
            part_dict[node['partition_name']]['hosts'].append(node['hostname'])
            # partition_default arrives as the string "true"/other.
            part_dict[node['partition_name']]['partition_default'] = \
                True if node['partition_default'] == "true" else False
            if node.get('partition_config'):
                part_dict[node['partition_name']]['partition_config'] = \
                    node['partition_config']
        return dict(part_dict)

    def _get_slurmd_node_data(self):
        """Return the node info for units of applications on the relation."""
        nodes_info = list()
        relations = self.framework.model.relations['slurmd']

        slurmd_active_units = _get_slurmd_active_units()

        for relation in relations:
            app = relation.app
            for unit in relation.units:
                # Only include units known to be active.
                if unit.name in slurmd_active_units:
                    unit_data = relation.data[unit]
                    app_data = relation.data[app]
                    ctxt = {
                        'ingress_address': unit_data['ingress-address'],
                        'hostname': unit_data['hostname'],
                        'inventory': unit_data['inventory'],
                        'partition_name': app_data['partition_name'],
                        'partition_default': app_data['partition_default'],
                    }
                    # Related slurmd units don't specify custom
                    # partition_config by default.
                    # Only get partition_config if it exists on in the
                    # related unit's unit data.
                    if app_data.get('partition_config'):
                        ctxt['partition_config'] = \
                            app_data['partition_config']
                    nodes_info.append(ctxt)
        return nodes_info

    def set_slurm_config_on_app_relation_data(
        self,
        relation,
        slurm_config,
    ):
        """Set the slurm_config to the app data on the relation.

        Setting data on the relation forces the units of related
        applications to observe the relation-changed event so they can
        acquire and render the updated slurm_config.
        """
        relations = self._charm.framework.model.relations[relation]
        for relation in relations:
            relation.data[self.model.app]['slurm_config'] = json.dumps(
                slurm_config)

    def get_slurm_config(self):
        """Assemble and return the slurm_config.

        Returns None when no slurmd node data or partitions are
        available yet.
        """
        slurmctld_ingress_address = self._state.ingress_address
        slurmctld_hostname = socket.gethostname().split(".")[0]
        slurmdbd_info = dict(self._charm.get_slurmdbd_info())

        slurmd_node_data = self._get_slurmd_node_data()
        partitions = self._get_partitions(slurmd_node_data)

        if slurmd_node_data and partitions:
            return {
                'nodes': slurmd_node_data,
                'partitions': partitions,
                'slurmdbd_port': slurmdbd_info['port'],
                'slurmdbd_hostname': slurmdbd_info['hostname'],
                'slurmdbd_ingress_address': slurmdbd_info['ingress_address'],
                'active_controller_hostname': slurmctld_hostname,
                'active_controller_ingress_address': slurmctld_ingress_address,
                'active_controller_port': "6817",
                'munge_key': self._charm.get_munge_key(),
                **self.model.config,
            }
        else:
            return None
class CephISCSIGatewayPeers(Object):
    """Peer interface for ceph-iscsi gateway units.

    Shares the admin password over application relation data and each
    unit's readiness flag and FQDN over unit relation data.
    """

    on = CephISCSIGatewayPeerEvents()
    state = StoredState()

    # Relation data keys.
    PASSWORD_KEY = 'admin_password'
    READY_KEY = 'gateway_ready'
    FQDN_KEY = 'gateway_fqdn'

    def __init__(self, charm, relation_name):
        """Observe relation-changed on the peer relation."""
        super().__init__(charm, relation_name)
        self.relation_name = relation_name
        self.this_unit = self.framework.model.unit
        self.framework.observe(charm.on[relation_name].relation_changed,
                               self.on_changed)

    def on_changed(self, event):
        """Emit has_peers, and ready_peers once peers report ready."""
        logging.info("CephISCSIGatewayPeers on_changed")
        self.on.has_peers.emit()
        if self.ready_peer_details:
            self.on.ready_peers.emit()

    def set_admin_password(self, password):
        """Store the admin password in application relation data."""
        logging.info("Setting admin password")
        self.peer_rel.data[self.peer_rel.app][self.PASSWORD_KEY] = password

    def announce_ready(self):
        """Mark this unit ready and publish its FQDN to peers."""
        logging.info("announcing ready")
        self.peer_rel.data[self.this_unit][self.READY_KEY] = 'True'
        self.peer_rel.data[self.this_unit][self.FQDN_KEY] = self.fqdn

    @property
    def ready_peer_details(self):
        """Map of unit name -> {'fqdn', 'ip'} for this unit and ready peers."""
        peers = {
            self.framework.model.unit.name: {
                'fqdn': self.fqdn,
                'ip': self.cluster_bind_address
            }
        }
        for u in self.peer_rel.units:
            if self.peer_rel.data[u].get(self.READY_KEY) == 'True':
                peers[u.name] = {
                    'fqdn': self.peer_rel.data[u][self.FQDN_KEY],
                    'ip': self.peer_rel.data[u]['ingress-address']
                }
        return peers

    @property
    def fqdn(self):
        """This host's fully qualified domain name."""
        return socket.getfqdn()

    @property
    def is_joined(self):
        """Whether the peer relation exists yet."""
        return self.peer_rel is not None

    @property
    def peer_rel(self):
        """The peer relation object, or None if not yet created."""
        return self.framework.model.get_relation(self.relation_name)

    @property
    def peer_binding(self):
        """The network binding of the peer relation."""
        return self.framework.model.get_binding(self.peer_rel)

    @property
    def cluster_bind_address(self):
        """This unit's bind address on the peer relation network."""
        return str(self.peer_binding.network.bind_address)

    @property
    def admin_password(self):
        # Temporary workaround for app relation data access:
        # https://github.com/canonical/operator/issues/148
        # return self.peer_rel.data[self.peer_rel.app].get(self.PASSWORD_KEY)
        return 'hardcodedpassword'

    @property
    def peer_addresses(self):
        """Sorted ingress addresses of this unit and all peers."""
        addresses = [self.cluster_bind_address]
        for u in self.peer_rel.units:
            addresses.append(self.peer_rel.data[u]['ingress-address'])
        return sorted(addresses)
"""Operator Charm main library.""" # Load modules from lib directory import logging import setuppath # noqa:F401 from ops.charm import CharmBase from ops.framework import StoredState from ops.main import main from ops.model import ActiveStatus, MaintenanceStatus class ${class}(CharmBase): """Class reprisenting this Operator charm.""" state = StoredState() def __init__(self, *args): """Initialize charm and configure states and events to observe.""" super().__init__(*args) # -- standard hook observation self.framework.observe(self.on.install, self.on_install) self.framework.observe(self.on.start, self.on_start) self.framework.observe(self.on.config_changed, self.on_config_changed) # -- initialize states -- self.state.set_default(installed=False) self.state.set_default(configured=False) self.state.set_default(started=False) def on_install(self, event): """Handle install state."""
class SlurmctldCharm(CharmBase):
    """Slurmctld lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        # slurm_installed was previously only assigned in _on_install but
        # read by _on_write_munge_key/_check_status; give it a default so
        # early events cannot raise AttributeError.
        self._stored.set_default(
            munge_key_available=False,
            slurm_installed=False,
            slurmctld_controller_type=str(),
        )

        self._nrpe = Nrpe(self, "nrpe-external-master")

        self._slurm_manager = SlurmManager(self, "slurmctld")

        self._slurmctld = Slurmctld(self, "slurmctld")
        self._slurmctld_peer = SlurmctldPeer(self, "slurmctld-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self._slurmctld.on.slurm_config_available:
            self._on_check_status_and_write_config,
            self._slurmctld.on.scontrol_reconfigure:
            self._on_scontrol_reconfigure,
            self._slurmctld.on.restart_slurmctld:
            self._on_restart_slurmctld,
            self._slurmctld.on.munge_key_available:
            self._on_write_munge_key,
            self._slurmctld_peer.on.slurmctld_peer_available:
            self._on_slurmctld_peer_available,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Install the slurm snap and flag installation complete."""
        self._slurm_manager.install(self.config["snapstore-channel"])
        self._stored.slurm_installed = True
        self.unit.status = ActiveStatus("slurm snap successfully installed")

    def _on_upgrade(self, event):
        """Upgrade the slurm snap using the current slurm config.

        NOTE(review): this handler is not registered in
        event_handler_bindings -- confirm whether upgrade_charm should
        be wired to it.
        """
        # Guard: _check_status() returns None when prerequisites are
        # missing; previously dict(None) would raise TypeError.
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return
        snapstore_channel = self.config["snapstore-channel"]
        self._slurm_manager.upgrade(dict(slurm_config), snapstore_channel)

    def _on_write_munge_key(self, event):
        """Configure the munge key once slurm is installed, else defer."""
        if not self._stored.slurm_installed:
            event.defer()
            return
        munge_key = self._slurmctld.get_stored_munge_key()
        self._slurm_manager.configure_munge_key(munge_key)
        self._slurm_manager.restart_munged()
        self._stored.munge_key_available = True

    def _on_slurmctld_peer_available(self, event):
        """Publish slurmctld info on app relation data (leader only)."""
        if self.framework.model.unit.is_leader():
            if self._slurmctld.is_joined:
                slurmctld_info = self._slurmctld_peer.get_slurmctld_info()
                if slurmctld_info:
                    self._slurmctld.set_slurmctld_info_on_app_relation_data(
                        slurmctld_info
                    )
                    return
            event.defer()
            return

    def _on_check_status_and_write_config(self, event):
        """Render slurm configs when all prerequisites are satisfied."""
        slurm_config = self._check_status()
        if not slurm_config:
            event.defer()
            return
        self._slurm_manager.render_slurm_configs(dict(slurm_config))
        self.unit.status = ActiveStatus("slurmctld available")

    def _on_restart_slurmctld(self, event):
        """Restart the slurmctld component."""
        self._slurm_manager.restart_slurm_component()

    def _on_scontrol_reconfigure(self, event):
        """Ask slurm to re-read its configuration."""
        self._slurm_manager.slurm_cmd("scontrol", "reconfigure")

    def _check_status(self):
        """Return the stored slurm config if ready, else None.

        Sets Blocked/Waiting unit status to explain what is missing.
        """
        munge_key_available = self._stored.munge_key_available
        slurm_installed = self._stored.slurm_installed
        slurm_config = self._slurmctld.get_stored_slurm_config()
        slurmctld_joined = self._slurmctld.is_joined
        if not slurmctld_joined:
            self.unit.status = BlockedStatus(
                "Relations needed: slurm-configurator"
            )
            return None
        elif not (munge_key_available and slurm_installed and slurm_config):
            self.unit.status = WaitingStatus(
                "Waiting on: configuration"
            )
            return None
        return slurm_config

    def get_slurm_component(self):
        """Return the slurm component."""
        return self._slurm_manager.slurm_component

    def get_hostname(self):
        """Return the hostname."""
        return self._slurm_manager.hostname

    def get_port(self):
        """Return the port."""
        return self._slurm_manager.port
class ServingActivatorCharm(CharmBase):
    """Charm that deploys the Knative Serving activator pod."""

    _stored = StoredState()

    def __init__(self, *args):
        super().__init__(*args)
        # Non-leader units do nothing; no handlers are registered.
        if not self.unit.is_leader():
            self.unit.status = WaitingStatus("Waiting for leadership")
            return
        # self.image = OCIImageResource(self, 'knative-activator-image')
        self.framework.observe(self.on.install, self._on_start)
        # self.framework.observe(self.on.config_changed, self._on_config_changed)

        # --- initialize states ---
        # self._stored.set_default(config_hash=self._config_hash())
        self._stored.set_default(started=False)

        # -- base values --
        self._stored.set_default(namespace=os.environ["JUJU_MODEL_NAME"])

    def _on_start(self, event):
        """Occurs upon install, start, upgrade, and possibly config changed."""
        if self._stored.started:
            return
        self.unit.status = MaintenanceStatus("Installing Knative Activator...")

        # Image is pinned by digest; the OCI-resource fetch path is
        # currently disabled (see commented-out code).
        # try:
        #image_info = self.image.fetch()
        image_info = "gcr.io/knative-releases/knative.dev/serving/cmd/activator@sha256:1e3db4f2eeed42d3ef03f41cc3d07c333edab92af3653a530d6d5f370da96ab6"
        # except OCIImageResourceError:
        #     logging.exception('An error occured while fetching the image info')
        #     self.unit.status = BlockedStatus("Error fetching image information")
        #     return

        self.model.pod.set_spec(
            {
                'version': 3,
                'containers': [{
                    'name': 'activator',
                    'image': image_info,
                    # 'imageDetails': image_info,
                    'imagePullPolicy': 'Always',
                    'ports': [{
                        'containerPort': 9090,
                        'name': 'metrics'
                    }, {
                        'containerPort': 8008,
                        'name': 'profiling'
                    }, {
                        'containerPort': 8012,
                        'name': 'http1'
                    }, {
                        'containerPort': 8013,
                        'name': 'h2c'
                    },
                    ],
                    'envConfig': {
                        'GOGC': '500',
                        # Downward-API style field references.
                        'POD_NAME': {
                            'field': {
                                'path': "metadata.name"
                            }
                        },
                        'POD_IP': {
                            'field': {
                                'path': "status.podIP"
                            }
                        },
                        'SYSTEM_NAMESPACE': {
                            'field': {
                                'path': "metadata.namespace"
                            }
                        },
                        'CONFIG_LOGGING_NAME': 'config-logging',
                        'CONFIG_OBSERVABILITY_NAME': 'config-observability',
                        'METRICS_DOMAIN': 'knative.dev/internal/serving',
                    },
                    'kubernetes': {
                        'securityContext': {
                            'privileged': False,
                            'readOnlyRootFilesystem': True,
                            'runAsNonRoot': True,
                            'capabilities': {
                                'drop': ['ALL']
                            }
                        },
                        'readinessProbe': {
                            'httpGet': {
                                'port': 8012,
                                'httpHeaders': [{
                                    'name': 'k-kubelet-probe',
                                    'value': 'activator',
                                }],
                            },
                            'failureThreshold': 12
                        },
                        'livenessProbe': {
                            'initialDelaySeconds': 15,
                            'failureThreshold': 12,
                            'httpGet': {
                                'port': 8012,
                                'httpHeaders': [{
                                    'name': 'k-kubelet-probe',
                                    'value': 'activator',
                                }],
                            }
                        }
                    },
                }],
            },
            k8s_resources={
                'kubernetesResources': {
                    'services': [
                        {
                            # Need to create a 2nd service because of bug
                            # lp:https://bugs.launchpad.net/juju/+bug/1902000
                            'name': 'activator-service',
                            'spec': {
                                'ports': [
                                    {
                                        'name': 'http-metrics',
                                        'port': 9090,
                                        'targetPort': 9090,
                                    },
                                    {
                                        'name': 'http-profiling',
                                        'port': 8008,
                                        'targetPort': 8008,
                                    },
                                    {
                                        'name': 'http',
                                        'port': 80,
                                        'targetPort': 8012,
                                    },
                                    {
                                        'name': 'http2',
                                        'port': 81,
                                        'targetPort': 8013,
                                    }
                                ],
                                'selector': {'app': 'activator'},
                            }
                        }
                    ],
                }
            }
        )
        self.unit.status = ActiveStatus("Ready")
class CephISCSIGatewayCharmBase(ops_openstack.OSBaseCharm):
    """Base charm for deploying a Ceph iSCSI gateway.

    Coordinates package installation (via OSBaseCharm), requests Ceph pools
    and permissions from the ceph-client relation, synchronises peer units,
    renders configuration with restart-on-change semantics, handles TLS
    certificate material and creates iSCSI targets through gwcli.
    """

    state = StoredState()

    PACKAGES = ['ceph-iscsi', 'tcmu-runner', 'ceph-common']

    # cephx capabilities requested for the 'ceph-iscsi' client key.
    CEPH_CAPABILITIES = [
        "osd", "allow *",
        "mon", "allow *",
        "mgr", "allow r"]

    # Config file -> services to restart when that file's contents change.
    RESTART_MAP = {
        '/etc/ceph/ceph.conf': ['rbd-target-api', 'rbd-target-gw'],
        '/etc/ceph/iscsi-gateway.cfg': ['rbd-target-api'],
        '/etc/ceph/ceph.client.ceph-iscsi.keyring': ['rbd-target-api']}

    DEFAULT_TARGET = "iqn.2003-01.com.ubuntu.iscsi-gw:iscsi-igw"
    REQUIRED_RELATIONS = ['ceph-client', 'cluster']

    def __init__(self, framework, key):
        super().__init__(framework, key)
        logging.info("Using {} class".format(self.release))
        self.state.set_default(target_created=False)
        self.state.set_default(enable_tls=False)
        self.state.set_default(additional_trusted_ips=[])
        self.ceph_client = interface_ceph_client.CephClientRequires(
            self,
            'ceph-client')
        self.peers = interface_ceph_iscsi_peer.CephISCSIGatewayPeers(
            self,
            'cluster')
        self.tls = interface_tls_certificates.TlsRequires(self, "certificates")
        self.adapters = CephISCSIGatewayAdapters(
            (self.ceph_client, self.peers, self.tls),
            self)
        self.framework.observe(self.on.ceph_client_relation_joined, self)
        self.framework.observe(self.ceph_client.on.pools_available, self)
        self.framework.observe(self.peers.on.has_peers, self)
        self.framework.observe(self.peers.on.ready_peers, self)
        self.framework.observe(self.on.create_target_action, self)
        self.framework.observe(self.on.add_trusted_ip_action, self)
        self.framework.observe(self.on.certificates_relation_joined, self)
        self.framework.observe(self.on.certificates_relation_changed, self)
        self.framework.observe(self.on.config_changed, self)
        self.framework.observe(self.on.upgrade_charm, self)

    def on_add_trusted_ip_action(self, event):
        """Record additional IPs trusted to access the gateway API.

        BUGFIX: previously the list returned by ``split()`` was *appended*
        as a single nested element; ``extend`` keeps the stored list flat.
        """
        self.state.additional_trusted_ips.extend(
            event.params['ips'].split(' '))
        logging.info(self.state.additional_trusted_ips)

    def on_create_target_action(self, event):
        """Action handler: create an iSCSI target, disk and client auth."""
        gw_client = gwcli_client.GatewayClient()
        target = event.params.get('iqn', self.DEFAULT_TARGET)
        gateway_units = event.params.get(
            'gateway-units',
            [u for u in self.peers.ready_peer_details.keys()])
        gw_client.create_target(target)
        for gw_unit, gw_config in self.peers.ready_peer_details.items():
            added_gateways = []
            if gw_unit in gateway_units:
                gw_client.add_gateway_to_target(
                    target,
                    gw_config['ip'],
                    gw_config['fqdn'])
                added_gateways.append(gw_unit)
        gw_client.create_pool(
            self.model.config['rbd-pool'],
            event.params['image-name'],
            event.params['image-size'])
        gw_client.add_client_to_target(
            target,
            event.params['client-initiatorname'])
        gw_client.add_client_auth(
            target,
            event.params['client-initiatorname'],
            event.params['client-username'],
            event.params['client-password'])
        gw_client.add_disk_to_client(
            target,
            event.params['client-initiatorname'],
            self.model.config['rbd-pool'],
            event.params['image-name'])
        event.set_results({'iqn': target})

    def setup_default_target(self):
        """Create the default target and register all ready peer gateways."""
        gw_client = gwcli_client.GatewayClient()
        gw_client.create_target(self.DEFAULT_TARGET)
        for gw_unit, gw_config in self.peers.ready_peer_details.items():
            gw_client.add_gateway_to_target(
                self.DEFAULT_TARGET,
                gw_config['ip'],
                gw_config['fqdn'])
        self.state.target_created = True

    def on_ready_peers(self, event):
        """Leader-only hook once peers report ready; default target setup."""
        if not self.unit.is_leader():
            logging.info("Leader should do setup")
            return
        if not self.state.is_started:
            logging.info("Cannot perform setup yet, not started")
            event.defer()
            return
        if self.state.target_created:
            logging.info("Initial target setup already complete")
            return
        else:
            # This appears to race and sometime runs before the
            # peer is 100% ready. There is probably little value
            # in this anyway so may just remove it.
            # self.setup_default_target()
            return

    def on_has_peers(self, event):
        """Generate and share an admin password once peers exist (leader)."""
        logging.info("Unit has peers")
        if self.unit.is_leader() and not self.peers.admin_password:
            logging.info("Setting admin password")
            alphabet = string.ascii_letters + string.digits
            password = ''.join(secrets.choice(alphabet) for i in range(8))
            self.peers.set_admin_password(password)

    def on_ceph_client_relation_joined(self, event):
        """Request the replicated pool, key permissions and OSD settings."""
        logging.info("Requesting replicated pool")
        self.ceph_client.create_replicated_pool(self.model.config['rbd-pool'])
        logging.info("Requesting permissions")
        self.ceph_client.request_ceph_permissions(
            'ceph-iscsi',
            self.CEPH_CAPABILITIES)
        self.ceph_client.request_osd_settings({
            'osd heartbeat grace': 20,
            'osd heartbeat interval': 5})

    def on_config_changed(self, event):
        """Re-render config and refresh Ceph requests once started."""
        if self.state.is_started:
            self.on_pools_available(event)
            self.on_ceph_client_relation_joined(event)

    def on_upgrade_charm(self, event):
        """Same behaviour as config-changed after an upgrade."""
        if self.state.is_started:
            self.on_pools_available(event)
            self.on_ceph_client_relation_joined(event)

    def on_pools_available(self, event):
        """Render configuration files, restarting affected services."""
        logging.info("on_pools_available")
        if not self.peers.admin_password:
            logging.info("Defering setup")
            event.defer()
            return

        def daemon_reload_and_restart(service_name):
            # rbd-target-api ships unit changes; reload before restart.
            subprocess.check_call(['systemctl', 'daemon-reload'])
            subprocess.check_call(['systemctl', 'restart', service_name])

        rfuncs = {'rbd-target-api': daemon_reload_and_restart}

        @ch_host.restart_on_change(self.RESTART_MAP, restart_functions=rfuncs)
        def render_configs():
            for config_file in self.RESTART_MAP.keys():
                ch_templating.render(
                    os.path.basename(config_file),
                    config_file,
                    self.adapters)
        logging.info("Rendering config")
        render_configs()
        logging.info("Setting started state")
        self.peers.announce_ready()
        self.state.is_started = True
        self.update_status()
        logging.info("on_pools_available: status updated")

    def on_certificates_relation_joined(self, event):
        """Request an application certificate covering all bind addresses."""
        addresses = set()
        for binding_name in ['public', 'cluster']:
            binding = self.model.get_binding(binding_name)
            addresses.add(binding.network.ingress_address)
            addresses.add(binding.network.bind_address)
        sans = [str(s) for s in addresses]
        sans.append(socket.gethostname())
        self.tls.request_application_cert(socket.getfqdn(), sans)

    def on_certificates_relation_changed(self, event):
        """Install certificates, keys and CA material once available."""
        app_certs = self.tls.application_certs
        if not all([self.tls.root_ca_cert, app_certs]):
            return
        if self.tls.chain:
            # Append chain file so that clients that trust the root CA will
            # trust certs signed by an intermediate in the chain
            ca_cert_data = self.tls.root_ca_cert + os.linesep + self.tls.chain
        else:
            # BUGFIX: ca_cert_data was previously left unbound when no chain
            # was supplied, raising NameError below.
            ca_cert_data = self.tls.root_ca_cert
        pem_data = app_certs['cert'] + os.linesep + app_certs['key']
        tls_files = {
            '/etc/ceph/iscsi-gateway.crt': app_certs['cert'],
            '/etc/ceph/iscsi-gateway.key': app_certs['key'],
            '/etc/ceph/iscsi-gateway.pem': pem_data,
            '/usr/local/share/ca-certificates/vault_ca_cert.crt':
                ca_cert_data}
        for tls_file, tls_data in tls_files.items():
            with open(tls_file, 'w') as f:
                f.write(tls_data)
        subprocess.check_call(['update-ca-certificates'])
        cert_out = subprocess.check_output(
            ('openssl x509 -inform pem -in /etc/ceph/iscsi-gateway.pem '
             '-pubkey -noout').split())
        with open('/etc/ceph/iscsi-gateway-pub.key', 'w') as f:
            f.write(cert_out.decode('UTF-8'))
        self.state.enable_tls = True
        self.on_pools_available(event)
class Slurmdbd(Object):
    """Slurmdbd.

    Requires-side interface: receives the munge key from slurm-configurator
    and publishes slurmdbd_info back over application relation data.
    """

    _stored = StoredState()
    on = SlurmdbdEvents()

    def __init__(self, charm, relation_name):
        """Observe relation lifecycle events."""
        super().__init__(charm, relation_name)

        self._charm = charm
        self._relation_name = relation_name

        # Munge key is cached locally once received from the remote app.
        self._stored.set_default(
            munge_key=None,
        )

        self.framework.observe(
            self._charm.on[self._relation_name].relation_joined,
            self._on_relation_joined,
        )

        self.framework.observe(
            self._charm.on[self._relation_name].relation_broken,
            self._on_relation_broken,
        )

    def _on_relation_joined(self, event):
        """Handle the relation-joined event.

        Get the munge_key from slurm-configurator and save it to the charm
        stored state.  Defers the event if the remote app data or the key
        itself has not been published yet.
        """
        # Since we are in relation-joined (with the app on the other side)
        # we can almost guarantee that the app object will exist in
        # the event, but check for it just in case.
        event_app_data = event.relation.data.get(event.app)
        if not event_app_data:
            event.defer()
            return

        # slurm-configurator sets the munge_key on the relation-created event
        # which happens before relation-joined. We can almost guarantee that
        # the munge key will exist at this point, but check for it just incase.
        munge_key = event_app_data.get("munge_key")
        if not munge_key:
            event.defer()
            return

        # Store the munge_key in the interface's stored state object and emit
        # munge_key_available.
        self._store_munge_key(munge_key)
        self.on.munge_key_available.emit()

    def _on_relation_broken(self, event):
        # Clear published info so the remote side sees the teardown.
        self.set_slurmdbd_info_on_app_relation_data("")
        self.on.slurmdbd_unavailable.emit()

    def set_slurmdbd_info_on_app_relation_data(self, slurmdbd_info):
        """Set slurmdbd_info.

        An empty string clears the key; anything else is JSON-encoded.
        """
        relations = self.framework.model.relations["slurmdbd"]
        # Iterate over each of the relations setting the relation data.
        for relation in relations:
            if slurmdbd_info != "":
                relation.data[self.model.app]["slurmdbd_info"] = json.dumps(
                    slurmdbd_info)
            else:
                relation.data[self.model.app]["slurmdbd_info"] = ""

    def _store_munge_key(self, munge_key):
        """Set the munge key in the stored state."""
        self._stored.munge_key = munge_key

    def get_munge_key(self):
        """Retrieve the munge key from the stored state."""
        return self._stored.munge_key
class CephISCSIGatewayCharmOcto(CephISCSIGatewayCharmBase):
    """Ceph iSCSI gateway charm variant for the Octopus release."""

    state = StoredState()
    release = 'octopus'
class CephISCSIGatewayCharmJewel(CephISCSIGatewayCharmBase):
    """Ceph iSCSI gateway charm variant for the Jewel release."""

    state = StoredState()
    release = 'jewel'
class HaCluster(Object):
    """Interface to the hacluster (pacemaker/corosync) relation.

    Installs the MSSQL HA components, provisions the pacemaker SQL login and
    publishes the availability-group resource definitions to hacluster.
    """

    state = StoredState()

    PACEMAKER_LOGIN_NAME = 'MSSQLPacemaker'
    PACEMAKER_LOGIN_CREDS_FILE = '/var/opt/mssql/secrets/passwd'
    APT_PACKAGES = ['fence-agents', 'resource-agents', 'mssql-server-ha']
    UNIT_ACTIVE_STATUS = ActiveStatus('Unit is ready and clustered')

    def __init__(self, charm, relation_name):
        super().__init__(charm, relation_name)
        self.state.set_default(pacemaker_login_ready=False,
                               ha_cluster_ready=False)
        self.relation_name = relation_name
        self.app = self.model.app
        self.unit = self.model.unit
        self.cluster = charm.cluster
        self.framework.observe(charm.on[relation_name].relation_joined,
                               self.on_joined)
        self.framework.observe(charm.on[relation_name].relation_changed,
                               self.on_changed)
        self.framework.observe(charm.cluster.on.created_ag,
                               self.on_created_ag)

    def on_joined(self, event):
        """Install HA packages and publish AG resource definitions.

        Deferred until the availability group is ready, since the resource
        parameters reference the AG name.
        """
        if not self.cluster.is_ag_ready:
            logger.warning('The availability group is not ready. Defering '
                           'hacluster on_joined until AG is ready.')
            event.defer()
            return
        logger.info('Installing Microsoft SQL Server HA components')
        # retry_on_error wraps apt_install to ride out transient apt failures.
        retry_on_error()(apt_install)(packages=self.APT_PACKAGES, fatal=True)
        self.setup_pacemaker_mssql_login()
        rel_data = {
            'resources': {
                'ag_cluster': 'ocf:mssql:ag'
            },
            'resource_params': {
                'ag_cluster':
                    'params ag_name="{ag_name}" '
                    'meta failure-timeout=60s '
                    'op start timeout=60s '
                    'op stop timeout=60s '
                    'op promote timeout=60s '
                    'op demote timeout=10s '
                    'op monitor timeout=60s interval=10s '
                    'op monitor timeout=60s interval=11s role="Master" '
                    'op monitor timeout=60s interval=12s role="Slave" '
                    'op notify timeout=60s'.format(ag_name=self.cluster.AG_NAME)
            },
            'ms': {
                'ms-ag_cluster':
                    'ag_cluster meta '
                    'master-max="1" master-node-max="1" '
                    'clone-max="3" clone-node-max="1" notify="true"'
            }
        }
        # Add the VIP resource group, then tie the VIP to the AG master.
        update_hacluster_vip('mssql', rel_data)
        group_name = VIP_GROUP_NAME.format(service='mssql')
        rel_data.update({
            'colocations': {
                'vip_on_master':
                    'inf: {} ms-ag_cluster:Master'.format(group_name)
            },
            'orders': {
                'ag_first':
                    'inf: ms-ag_cluster:promote {}:start'.format(group_name)
            }
        })
        rel = self.model.get_relation(event.relation.name, event.relation.id)
        # hacluster expects each top-level key JSON-encoded under json_<key>.
        for k, v in rel_data.items():
            rel.data[self.unit]['json_{}'.format(k)] = json.dumps(
                v, **JSON_ENCODE_OPTIONS)

    def on_changed(self, event):
        """Mark the unit clustered once hacluster reports 'clustered'."""
        # NOTE(review): event.relation.data.get(event.unit) may be None for
        # app-only changes — confirm .get('clustered') can't hit NoneType.
        rel_data = event.relation.data.get(event.unit)
        if rel_data.get('clustered'):
            logger.info('The hacluster relation is ready')
            self.unit.status = self.UNIT_ACTIVE_STATUS
            self.state.ha_cluster_ready = True

    def on_created_ag(self, _):
        """Grant the pacemaker login rights on the new availability group."""
        self.setup_pacemaker_mssql_login()
        self.cluster.mssql_db_client().exec_t_sql("""
            GRANT ALTER, CONTROL, VIEW DEFINITION
                ON AVAILABILITY GROUP::[{ag_name}] TO [{login_name}]
            GRANT VIEW SERVER STATE TO [{login_name}]
            """.format(ag_name=self.cluster.AG_NAME,
                       login_name=self.PACEMAKER_LOGIN_NAME))

    def setup_pacemaker_mssql_login(self):
        """Create the sysadmin login pacemaker uses, and persist its creds.

        Idempotent via the pacemaker_login_ready stored flag.  The creds
        file is root-owned and mode 0400 as required by mssql-server-ha.
        """
        if self.state.pacemaker_login_ready:
            logger.info('The pacemaker login is already configured.')
            return
        login_password = host.pwgen(32)
        self.cluster.mssql_db_client().create_login(
            name=self.PACEMAKER_LOGIN_NAME,
            password=login_password,
            server_roles=['sysadmin'])
        with open(self.PACEMAKER_LOGIN_CREDS_FILE, 'w') as f:
            f.write('{}\n{}\n'.format(self.PACEMAKER_LOGIN_NAME,
                                      login_password))
        os.chown(self.PACEMAKER_LOGIN_CREDS_FILE, 0, 0)
        os.chmod(self.PACEMAKER_LOGIN_CREDS_FILE, 0o400)
        self.state.pacemaker_login_ready = True

    @property
    def is_ha_cluster_ready(self):
        # True once hacluster has reported the unit as clustered.
        return self.state.ha_cluster_ready

    @property
    def bind_address(self):
        # The VIP fronting the cluster.
        return self.model.config['vip']
class SlurmDBDCharm(CharmBase):
    """This charm demonstrates the 'requires' side of the relationship by
    extending CharmBase with an event object that observes
    the relation-changed hook event.
    """

    _stored = StoredState()

    def __init__(self, *args):
        super().__init__(*args)

        # db_info holds MySQL connection details received over the relation.
        self._stored.set_default(db_info=dict())

        self.framework.observe(self.on.start, self._on_start)
        self.framework.observe(self.on.install, self._on_install)

        self.db_info = MySQLClient(self, "db")
        self.framework.observe(self.db_info.on.db_info_available,
                               self._on_db_info_available)

        self.slurm_ops = SlurmSnapOps(self, "slurm-config")
        self.framework.observe(self.slurm_ops.on.configure_slurm,
                               self._on_configure_slurm)
        self.framework.observe(self.slurm_ops.on.slurm_snap_installed,
                               self._on_slurm_snap_installed)

    def _on_install(self, event):
        pass

    def _on_start(self, event):
        pass

    def _on_slurm_snap_installed(self, event):
        pass

    def _on_db_info_available(self, event):
        """Store the db_info in the StoredState for later use."""
        db_info = {
            'user': event.db_info.user,
            'password': event.db_info.password,
            'host': event.db_info.host,
            'port': event.db_info.port,
            'database': event.db_info.database,
        }
        self._stored.db_info = db_info
        # BUGFIX: was self.slurm_config, an attribute that is never set
        # (the interface is created as self.slurm_ops above) and would
        # raise AttributeError at runtime.
        self.slurm_ops.on.configure_slurm.emit()

    def _on_configure_slurm(self, event):
        """Render the slurmdbd.yaml and set the snap.mode."""
        hostname = socket.gethostname().split(".")[0]
        # BUGFIX: was self.slurm_config — see _on_db_info_available.
        self.slurm_ops.render_slurm_config(
            f"{os.getcwd()}/slurmdbd.yaml.tmpl",
            # "/var/snap/slurm/common/etc/slurm-configurator/slurmdbd.yaml",
            "/home/ubuntu/slurmdbd.yaml",
            context={**{"hostname": hostname}, **self._stored.db_info})
class MongoconsumerCharm(CharmBase):
    """Test/consumer charm that requests MongoDB databases and exercises them.

    Records lifecycle events in stored state (when 'record_events' is set)
    so integration tests can assert on the sequence.
    """

    _stored = StoredState()

    def __init__(self, *args):
        super().__init__(*args)
        self.mongo_consumer = MongoConsumer(self, 'database', self.consumes)
        self.image = OCIImageResource(self, "busybox-image")
        self.framework.observe(self.on.config_changed, self.on_config_changed)
        self.framework.observe(self.mongo_consumer.on.available,
                               self.on_db_available)
        self.framework.observe(self.mongo_consumer.on.invalid,
                               self.on_provider_invalid)
        self.framework.observe(self.mongo_consumer.on.broken,
                               self.on_provider_broken)
        self._stored.set_default(events=[])
        # Number of databases this charm will request from the provider.
        self._stored.set_default(num_dbs=2)
        self._stored.set_default(requested_dbs=0)

    def on_stop(self, _):
        """Mark terminating unit as inactive"""
        if self.model.config['record_events']:
            # NOTE(review): records "config_chagned" (sic) rather than a
            # stop-specific label — looks like a copy/paste slip, but tests
            # may match the literal; confirm before changing.
            self._stored.events.append("config_chagned")
        self.unit.status = MaintenanceStatus('Pod is terminating.')

    def on_config_changed(self, _):
        if self.model.config['record_events']:
            # NOTE(review): "config_chagned" typo preserved intentionally.
            self._stored.events.append("config_chagned")
        if not self.unit.is_leader():
            self.unit.status = ActiveStatus()
            return
        self.configure_pod()

    def on_db_available(self, event):
        """Request databases until num_dbs is reached, then test them."""
        if self.model.config['record_events']:
            self._stored.events.append("db_available")
        logger.debug("Got Databases: " + str(self.mongo_consumer.databases()))
        if self._stored.requested_dbs < self._stored.num_dbs:
            num_dbs = self._stored.num_dbs - self._stored.requested_dbs
            logger.debug("Requesting additional {} databases".format(num_dbs))
            for i in range(num_dbs):
                self.mongo_consumer.new_database()
                self._stored.requested_dbs += 1
        else:
            self.test_databases()

    def on_provider_invalid(self, _):
        if self.model.config['record_events']:
            self._stored.events.append("provider_invalid")
        logger.debug("Failed to get a valid provider")

    def on_provider_broken(self, _):
        logger.debug("Database provider relation broken")

    def test_databases(self):
        """Smoke-test each provided database: insert a doc and read it back."""
        for id in self.mongo_consumer.provider_ids():
            creds = self.mongo_consumer.credentials(id)
            uri = creds['replica_set_uri']
            client = pymongo.MongoClient(uri)
            for dbname in self.mongo_consumer.databases(id):
                post = {"test": "A test post"}
                logger.debug("writing {} to {}".format(post, dbname))
                db = client[dbname]
                tbl = db["test"]
                tbl.insert_one(post)
                posts = list(tbl.find())
                logger.debug("read {} from {}".format(posts, dbname))

    def configure_pod(self):
        """Build and submit the pod spec (leader only)."""
        logger.debug(str(sorted(os.environ.items())))
        # Fetch image information
        try:
            self.unit.status = WaitingStatus("Fetching image information")
            image_info = self.image.fetch()
        except OCIImageResourceError:
            self.unit.status = BlockedStatus(
                "Error fetching image information")
            return

        # Build Pod spec
        self.unit.status = WaitingStatus("Assembling pod spec")
        pod_spec = {
            "version": 3,
            "containers": [{
                "name": self.app.name,
                "imageDetails": image_info,
                # Keep the busybox container alive so it can be exec'd into.
                "command": ["sh"],
                "args": ["-c", "while true; do date; sleep 60;done"],
                "imagePullPolicy": "Always",
                "ports": [{
                    "name": self.app.name,
                    "containerPort": 80,
                    "protocol": "TCP"
                }]
            }]
        }
        if self.unit.is_leader():
            self.model.pod.set_spec(pod_spec)
        self.unit.status = ActiveStatus()

    @property
    def consumes(self):
        # Provider requirements are supplied as JSON in charm config.
        return json.loads(self.model.config['consumes'])
class ZookeeperCharm(CharmBase):
    """Kubernetes charm for Zookeeper: renders a pod spec template and scales
    the ensemble with peer-relation membership when ha-mode is enabled."""

    on = ZookeeperCharmEvents()
    state = StoredState()

    def __init__(self, framework, key):
        super().__init__(framework, key)
        self.framework.observe(self.on.start, self)
        # self.framework.observe(self.on.stop, self)
        self.framework.observe(self.on.update_status, self)
        self.framework.observe(self.on.upgrade_charm, self)
        self.framework.observe(self.on.config_changed, self)
        self.framework.observe(self.on.cluster_relation_changed,
                               self.on_cluster_modified)
        self.framework.observe(self.on.zookeeper_relation_joined,
                               self.expose_relation_data)
        # Current ensemble size; recalculated by getUnits().
        self._unit = 1
        self._zookeeperuri = ""
        self._pod = K8sPod(self.framework.model.app.name)
        self.cluster = ZookeeperCluster(self, 'cluster')
        self.client = ZookeeperClient(self, 'zookeeper',
                                      self.model.config['client-port'])
        self.state.set_default(isStarted=False)
        self.framework.observe(self.on.leader_elected, self)

    def on_start(self, event):
        """Leader submits the initial pod spec, then triggers config-changed."""
        logging.info('START')
        if (self.model.pod._backend.is_leader()):
            # if not self.model.config['ha-mode']:
            # self.model.unit.status = MaintenanceStatus('Starting pod')
            podSpec = self.makePodSpec()
            self.model.pod.set_spec(podSpec)
            self.state.podSpec = podSpec
            self.on.config_changed.emit()

    def expose_relation_data(self, event):
        """Publish host/port details on the zookeeper client relation."""
        logging.info('Data Exposed')
        # Resolve our ingress address to an FQDN for clients.
        fqdn = socket.getnameinfo((str(self.cluster.ingress_address), 0),
                                  socket.NI_NAMEREQD)[0]
        logging.info(fqdn)
        self.client.set_host(fqdn)
        self.client.set_port(self.model.config['client-port'])
        self.client.set_rest_port(self.model.config['client-port'])
        self.client.expose_zookeeper()
        self.on.config_changed.emit()

    def on_upgrade_charm(self, event):
        logging.info('UPGRADE')
        self.on.config_changed.emit()

    def on_leader_elected(self, event):
        logging.info('LEADER ELECTED')
        self.on.config_changed.emit()

    def getUnits(self):
        """Recompute self._unit from peer membership (1 unless ha-mode)."""
        logging.info('get_units')
        peer_relation = self.model.get_relation('cluster')
        # NOTE(review): 'units' local is assigned but never used.
        units = self._unit
        if peer_relation is not None:
            logging.info(peer_relation)
            if not self.model.config['ha-mode']:
                self._unit = 1
            else:
                # Peers plus this unit itself.
                self._unit = len(peer_relation.units) + 1
        self.on.update_status.emit()

    def on_cluster_modified(self, event):
        logging.info('on_cluster_modified')
        self.on.config_changed.emit()

    def on_update_status(self, event):
        """Set unit status depending on pod readiness and leadership."""
        logging.info('UPDATE STATUS')
        if self._pod.is_ready:
            logging.info('Pod is ready')
            self.state.isStarted = True
            if (self.model.pod._backend.is_leader()):
                self.model.unit.status = ActiveStatus('ready')
            else:
                self.model.unit.status = ActiveStatus('ready Not a Leader')

    def on_config_changed(self, event):
        """Re-render the pod spec; only apply it when it actually changed."""
        logging.info('CONFIG CHANGED')
        if self._pod.is_ready:
            if (self.model.pod._backend.is_leader()):
                self.getUnits()
                podSpec = self.makePodSpec()
                if self.state.podSpec != podSpec:
                    self.model.pod.set_spec(podSpec)
                    self.state.podSpec = podSpec
            self.on.update_status.emit()

    def on_new_client(self, event):
        """Serve new client relations once started; defer until then."""
        logging.info('NEW CLIENT')
        if not self.state.isStarted:
            logging.info('NEW CLIENT DEFERRED')
            return event.defer()
        logging.info('NEW CLIENT SERVING')
        if (self.model.pod._backend.is_leader()):
            self.client.expose_zookeeper()

    def makePodSpec(self):
        """Render templates/spec_template.yaml with config values."""
        logging.info('MAKING POD SPEC')
        with open("templates/spec_template.yaml") as spec_file:
            podSpecTemplate = spec_file.read()
        dockerImage = self.model.config['image']
        logging.info(self._unit)
        data = {
            "name": self.model.app.name,
            "zookeeper-units": int(self._unit),
            "docker_image_path": dockerImage,
            "server-port": self.model.config['server-port'],
            "client-port": self.model.config['client-port'],
            "leader-election-port":
                int(self.model.config['leader-election-port']),
        }
        logging.info(data)
        podSpec = podSpecTemplate % data
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted input; template is charm-local here.
        podSpec = yaml.load(podSpec)
        return podSpec
class SimpleHAProxyCharm(CharmBase):
    """SSH-proxy charm: runs actions on a remote VM over SSH, sharing one
    SSH keypair across peer units via the proxypeer relation."""

    state = StoredState()
    on = ProxyClusterEvents()

    def __init__(self, framework, key):
        super().__init__(framework, key)

        # An example of setting charm state
        # that's persistent across events
        self.state.set_default(is_started=False)

        self.peers = ProxyCluster(self, "proxypeer")

        if not self.state.is_started:
            self.state.is_started = True

        # Register all of the events we want to observe
        for event in (
            # Charm events
            self.on.config_changed,
            self.on.install,
            self.on.start,
            self.on.upgrade_charm,
            # Charm actions (primitives)
            self.on.touch_action,
            # OSM actions (primitives)
            self.on.start_action,
            self.on.stop_action,
            self.on.restart_action,
            self.on.reboot_action,
            self.on.upgrade_action,
            # SSH Proxy actions (primitives)
            self.on.generate_ssh_key_action,
            self.on.get_ssh_public_key_action,
            self.on.run_action,
            self.on.verify_ssh_credentials_action,
        ):
            self.framework.observe(event, self)

        self.framework.observe(self.on.proxypeer_relation_changed, self)

    def get_ssh_proxy(self):
        """Get the SSHProxy instance"""
        proxy = SSHProxy(
            hostname=self.model.config["ssh-hostname"],
            username=self.model.config["ssh-username"],
            password=self.model.config["ssh-password"],
        )
        return proxy

    def on_proxypeer_relation_changed(self, event):
        # Once the leader has shared keys, install them locally; until
        # then keep deferring the event.
        if self.peers.is_cluster_initialized:
            pubkey = self.peers.ssh_public_key
            privkey = self.peers.ssh_private_key
            SSHProxy.write_ssh_keys(public=pubkey, private=privkey)
            self.on_config_changed(event)
        else:
            event.defer()

    def on_config_changed(self, event):
        """Handle changes in configuration"""
        unit = self.model.unit

        # Unit should go into a waiting state until verify_ssh_credentials
        # is successful
        unit.status = WaitingStatus("Waiting for SSH credentials")
        proxy = self.get_ssh_proxy()

        verified = proxy.verify_credentials()
        if verified:
            unit.status = ActiveStatus()
        else:
            unit.status = BlockedStatus("Invalid SSH credentials.")

    def on_install(self, event):
        pass

    def on_start(self, event):
        """Called when the charm is being installed"""
        if not self.peers.is_joined:
            event.defer()
            return

        unit = self.model.unit

        if not SSHProxy.has_ssh_key():
            unit.status = MaintenanceStatus("Generating SSH keys...")
            pubkey = None
            privkey = None
            if self.is_leader:
                if self.peers.is_cluster_initialized:
                    # Another leader already generated keys; reuse them.
                    SSHProxy.write_ssh_keys(
                        public=self.peers.ssh_public_key,
                        private=self.peers.ssh_private_key,
                    )
                else:
                    # Generate fresh keys and broadcast them to peers.
                    SSHProxy.generate_ssh_key()
                    self.on.ssh_keys_initialized.emit(
                        SSHProxy.get_ssh_public_key(),
                        SSHProxy.get_ssh_private_key())
                unit.status = ActiveStatus()
            else:
                unit.status = WaitingStatus(
                    "Waiting for leader to populate the keys")

    def on_touch_action(self, event):
        """Touch a file."""
        if self.is_leader:
            filename = event.params["filename"]
            proxy = self.get_ssh_proxy()
            stdout, stderr = proxy.run("touch {}".format(filename))
            event.set_results({"output": stdout})
        else:
            event.fail("Unit is not leader")
            return

    def on_upgrade_charm(self, event):
        """Upgrade the charm."""
        unit = self.model.unit

        # Mark the unit as under Maintenance.
        unit.status = MaintenanceStatus("Upgrading charm")
        self.on_install(event)

        # When maintenance is done, return to an Active state
        unit.status = ActiveStatus()

    ###############
    # OSM methods #
    ###############
    def on_start_action(self, event):
        """Start the VNF service on the VM."""
        pass

    def on_stop_action(self, event):
        """Stop the VNF service on the VM."""
        pass

    def on_restart_action(self, event):
        """Restart the VNF service on the VM."""
        pass

    def on_reboot_action(self, event):
        """Reboot the VM."""
        if self.is_leader:
            proxy = self.get_ssh_proxy()
            stdout, stderr = proxy.run("sudo reboot")
            if len(stderr):
                event.fail(stderr)
        else:
            event.fail("Unit is not leader")
            return

    def on_upgrade_action(self, event):
        """Upgrade the VNF service on the VM."""
        pass

    #####################
    # SSH Proxy methods #
    #####################
    def on_generate_ssh_key_action(self, event):
        """Generate a new SSH keypair for this unit."""
        if self.is_leader:
            if not SSHProxy.generate_ssh_key():
                event.fail("Unable to generate ssh key")
        else:
            event.fail("Unit is not leader")
            return

    def on_get_ssh_public_key_action(self, event):
        """Get the SSH public key for this unit."""
        if self.is_leader:
            pubkey = SSHProxy.get_ssh_public_key()
            event.set_results({"pubkey": SSHProxy.get_ssh_public_key()})
        else:
            event.fail("Unit is not leader")
            return

    def on_run_action(self, event):
        """Run an arbitrary command on the remote host."""
        if self.is_leader:
            cmd = event.params["command"]
            proxy = self.get_ssh_proxy()
            stdout, stderr = proxy.run(cmd)
            event.set_results({"output": stdout})
            if len(stderr):
                event.fail(stderr)
        else:
            event.fail("Unit is not leader")
            return

    def on_verify_ssh_credentials_action(self, event):
        """Verify the SSH credentials for this unit."""
        if self.is_leader:
            proxy = self.get_ssh_proxy()
            verified = proxy.verify_credentials()
            if verified:
                print("Verified!")
                event.set_results({"verified": True})
            else:
                print("Verification failed!")
                event.set_results({"verified": False})
        else:
            event.fail("Unit is not leader")
            return

    @property
    def is_leader(self):
        # update the framework to include self.unit.is_leader()
        return self.model.unit.is_leader()
class HelloJujuCharm(CharmBase):
    """Main 'Hello, Juju' charm class

    Installs a Flask application from a git repo, runs it under gunicorn
    via a templated systemd unit, and wires it to PostgreSQL through the
    pgsql 'db' relation.
    """

    _stored = StoredState()

    def __init__(self, *args):
        super().__init__(*args)
        self.framework.observe(self.on.install, self._on_install)
        self.framework.observe(self.on.start, self._on_start)
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self._stored.set_default(repo="", port="", conn_str="")
        # Initialise the PostgreSQL Client for the "db" relation
        self.db = pgsql.PostgreSQLClient(self, "db")
        self.framework.observe(self.db.on.database_relation_joined,
                               self._on_database_relation_joined)
        self.framework.observe(self.db.on.master_changed,
                               self._on_database_master_changed)

    def _on_install(self, _):
        """Install prerequisites for the application"""
        # Install some Python packages using apt
        self.unit.status = MaintenanceStatus("installing pip and virtualenv")
        self._install_apt_packages(["python3-pip", "python3-virtualenv"])
        # Clone application code and install dependencies, setup initial db
        self._setup_application()
        # Template out the systemd service file
        self._render_systemd_unit()

    def _on_start(self, _):
        """Start the workload"""
        check_call(["open-port", f"{self._stored.port}/TCP"])
        # Enable and start the "hello-juju" systemd unit
        systemd.service_resume("hello-juju")
        self.unit.status = ActiveStatus()

    def _on_config_changed(self, _):
        """Handle changes to the application configuration"""
        restart = False
        # Check if the application repo has been changed
        if self.config["application-repo"] != self._stored.repo:
            logger.info("application repo changed, installing")
            self._stored.repo = self.config["application-repo"]
            self._setup_application()
            restart = True

        if self.config["port"] != self._stored.port:
            logger.info("port config changed, configuring")
            # Close the existing application port
            check_call(["close-port", f"{self._stored.port}/TCP"])
            # Reconfigure the systemd unit to specify the new port
            self._stored.port = self.config["port"]
            self._render_systemd_unit()
            # Ensure the correct port is opened for the application
            check_call(["open-port", f"{self._stored.port}/TCP"])
            restart = True

        if restart:
            logger.info("restarting hello-juju application")
            systemd.service_restart("hello-juju")

        self.unit.status = ActiveStatus()

    def _on_database_relation_joined(self, event):
        """Handle the event where this application is joined with a database"""
        if self.unit.is_leader():
            # Ask the database to create a database with this app's name
            event.database = self.app.name

    def _on_database_master_changed(self, event):
        """Handler the case where a new PostgreSQL DB master is available"""
        if event.database != self.app.name:
            # Leader has not yet set the database name/requirements.
            return

        # event.master will be none if the master database is unavailable,
        # or a pgsql.ConnectingString instance
        if event.master:
            self.unit.status = MaintenanceStatus(
                "configuring database settings")
            # Store the connection uri in state
            # Replace the first part of the URL with pg8000 equivalent
            self._stored.conn_str = event.master.uri.replace(
                "postgresql://", "postgresql+pg8000://")
            # Render the settings file with the database connection details
            self._render_settings_file()
            # Ensure the database tables are created in the master
            self._create_database_tables()
            # Restart the service
            systemd.service_restart("hello-juju")
            # Set back to active status
            self.unit.status = ActiveStatus()
        else:
            # Defer this event until the master is available
            event.defer()
            return

    def _setup_application(self):
        """Clone a Flask application into place and setup it's dependencies"""
        self.unit.status = MaintenanceStatus("fetching application code")
        # Delete the application directory if it exists already
        if Path(APP_PATH).is_dir():
            shutil.rmtree("/srv/app")
        # If this is the first time, set the repo in the stored state
        if not self._stored.repo:
            self._stored.repo = self.config["application-repo"]
        # Fetch the code using git
        Repo.clone_from(self._stored.repo, APP_PATH)
        # Install application dependencies
        check_output(["python3", "-m", "virtualenv", f"{VENV_ROOT}"])
        check_output([f"{VENV_ROOT}/bin/pip3", "install", "gunicorn"])
        check_output([
            f"{VENV_ROOT}/bin/pip3", "install", "-r",
            f"{APP_PATH}/requirements.txt", "--force"
        ])
        # If a connection string exists (and relation is defined) then
        # render the settings file for the new app with the connection details
        if self._stored.conn_str:
            self._render_settings_file()
            # Create required database tables
            self._create_database_tables()

    def _install_apt_packages(self, packages: list):
        """Simple wrapper around 'apt-get install -y"""
        try:
            apt.update()
            apt.add_package(packages)
        except apt.PackageNotFoundError:
            logger.error(
                "a specified package not found in package cache or on system")
            self.unit.status = BlockedStatus("Failed to install packages")
        except apt.PackageError:
            logger.error("could not install package")
            self.unit.status = BlockedStatus("Failed to install packages")

    def _render_systemd_unit(self):
        """Render the systemd unit for Gunicorn to a file"""
        # Open the template systemd unit file
        with open("templates/hello-juju.service.j2", "r") as t:
            template = Template(t.read())
        # If this is the first time, set the port in the stored state
        if not self._stored.port:
            self._stored.port = self.config["port"]
        # Render the template files with the correct values
        rendered = template.render(port=self._stored.port,
                                   project_root=APP_PATH,
                                   user="******",
                                   group="www-data")
        # Write the rendered file out to disk
        with open(UNIT_PATH, "w+") as t:
            t.write(rendered)
        # Ensure correct permissions are set on the service
        os.chmod(UNIT_PATH, 0o755)
        # Reload systemd units
        systemd.daemon_reload()

    def _render_settings_file(self):
        """Render the application settings file with database connection
        details"""
        # Open the template settings files
        with open("templates/settings.py.j2", "r") as t:
            template = Template(t.read())
        # Render the template file with the correct values
        rendered = template.render(conn_str=self._stored.conn_str)
        # Write the rendered file out to disk
        with open(f"{APP_PATH}/settings.py", "w+") as t:
            t.write(rendered)
        # Ensure correct permissions are set on the file
        os.chmod(f"{APP_PATH}/settings.py", 0o644)
        # Get the uid/gid for the www-data user
        u = passwd.user_exists("www-data")
        # Set the correct ownership for the settings file
        os.chown(f"{APP_PATH}/settings.py", uid=u.pw_uid, gid=u.pw_gid)

    def _create_database_tables(self):
        """Initialise the database and populate with initial tables
        required"""
        self.unit.status = MaintenanceStatus("creating database tables")
        # Call the application's `init.py` file to instantiate the database
        # tables
        check_call([
            "sudo", "-u", "www-data", f"{VENV_ROOT}/bin/python3",
            f"{APP_PATH}/init.py"
        ])
class HaproxyInstanceManager(Object):
    """Manage the lifecycle of a haproxy instance on this machine.

    Installs the haproxy package, points the package's environment file at a
    charm-managed config file, and exposes start/stop/reconfigure operations.
    """

    _stored = StoredState()
    # Maintainer-provided environment file consumed by the haproxy service.
    HAPROXY_ENV_FILE = Path('/etc/default/haproxy')

    def __init__(self, charm, key, tcp_backend_manager, bind_addresses=None):
        super().__init__(charm, key)
        self.tcp_backend_manager = tcp_backend_manager
        self.tcp_pool_adapter = TCPLoadBalancerPoolAdapter(
            self.tcp_backend_manager.pools,
            bind_addresses,
        )
        self._stored.set_default(is_started=False)
        # Charm-specific config, kept separate from the package's default file.
        self.haproxy_conf_file = Path(
            f'/etc/haproxy/juju-{self.model.app.name}.cfg')

    @property
    def is_started(self):
        """Whether this manager has started the haproxy service."""
        return self._stored.is_started

    def install(self):
        """Install haproxy and hook its env file to our rendered config."""
        self._install_haproxy()
        self._update_haproxy_env_file()

    def _install_haproxy(self):
        logger.info('Installing the haproxy package')
        subprocess.check_call(['apt', 'update'])
        subprocess.check_call(['apt', 'install', '-yq', 'haproxy'])

    def _update_haproxy_env_file(self):
        """Update the maintainer-provided environment file.

        This is done to include the config rendered by us in addition to
        the default config provided by the package.
        """
        ctxt = {'haproxy_app_config': self.haproxy_conf_file}
        env = Environment(loader=FileSystemLoader('templates'))
        template = env.get_template('haproxy.env.j2')
        rendered_content = template.render(ctxt)
        self.HAPROXY_ENV_FILE.write_text(rendered_content)
        # Ensure the referenced charm config file exists (empty for now).
        self.haproxy_conf_file.write_text('')

    def start(self):
        """Start the haproxy service (idempotent)."""
        if not self._stored.is_started:
            logger.info('Starting the haproxy service')
            self._run_start()
            self._stored.is_started = True

    def _run_start(self):
        subprocess.check_call(['systemctl', 'start', 'haproxy'])

    def stop(self):
        """Stop the haproxy service (idempotent).

        Bug fix: the guard was inverted (it only acted when the service was
        NOT started), and the flag was cleared via a non-existent
        ``self.state`` attribute instead of ``self._stored``.
        """
        if self._stored.is_started:
            logger.info('Stopping the haproxy service')
            subprocess.check_call(['systemctl', 'stop', 'haproxy'])
            self._stored.is_started = False

    def uninstall(self):
        """Purge the haproxy package from the system."""
        logger.info('Uninstalling the haproxy service')
        subprocess.check_call(['apt', 'purge', '-yq', 'haproxy'])

    def reconfigure(self):
        """Re-render the haproxy config file and restart the service."""
        logger.info('Reconfiguring the haproxy service')
        self._do_reconfigure()
        self._run_restart()

    def _run_restart(self):
        logger.info('Restarting the haproxy service')
        subprocess.check_call(['systemctl', 'restart', 'haproxy'])

    def _do_reconfigure(self):
        """Render the haproxy config from the current TCP pool state."""
        logger.info('Rendering the haproxy config file')
        env = Environment(loader=FileSystemLoader('templates'))
        template = env.get_template('haproxy.conf.j2')
        listen_sections = self.tcp_pool_adapter.listen_sections
        rendered_content = template.render(
            {'listen_sections': listen_sections})
        self.haproxy_conf_file.write_text(rendered_content)
class FileBeatCharm(CharmBase):
    """Install, configure and manage filebeat on a machine unit."""

    FILEBEAT_CONFIG = '/etc/filebeat/filebeat.yml'
    KUBE_CONFIG = '/root/.kube/config'
    LOGSTASH_SSL_CERT = '/etc/ssl/certs/filebeat-logstash.crt'
    LOGSTASH_SSL_KEY = '/etc/ssl/private/filebeat-logstash.key'

    state = StoredState()

    def __init__(self, *args):
        super().__init__(*args)
        self.framework.observe(self.on.install, self.on_install)
        self.framework.observe(self.on.stop, self.on_remove)
        self.framework.observe(self.on.config_changed, self.on_config_changed)
        self.framework.observe(self.on.reinstall_action,
                               self.on_reinstall_action)
        # NOTE(review): self.beats_server is not assigned anywhere in this
        # class; presumably provided by relation wiring elsewhere — confirm.
        self.framework.observe(self.beats_server.on.server_ready,
                               self.on_beats_server_available)
        self.state.set_default(repo_sources_hash=None,
                               repo_keys_hash=None,
                               needs_reinstall=False,
                               logstash_key=None,
                               logstash_cert=None)

    def on_install(self, event):
        """Configure apt sources and install the filebeat package."""
        logger.info('Installing filebeat')
        sources = self.model.config.get('install_sources', '')
        keys = self.model.config.get('install_keys', '')
        # Bug fix: previously wrote a combined hash to an undeclared attribute
        # (repo_details_hash) that nothing read, so on_config_changed always
        # compared against None. Track the two declared hashes instead.
        self.state.repo_sources_hash = hash(sources)
        self.state.repo_keys_hash = hash(keys)
        configure_sources(update=True, sources_var=sources, keys_var=keys)
        apt_install('filebeat')
        self.unit.status = ActiveStatus("Filebeat is installed")

    def on_config_changed(self, event):
        """React to repo source/key changes, then re-render config."""
        sources = self.model.config.get('install_sources', '')
        keys = self.model.config.get('install_keys', '')
        new_repo_sources_hash = hash(sources)
        new_repo_keys_hash = hash(keys)
        if self.state.repo_sources_hash != new_repo_sources_hash:
            configure_sources(update=True, sources_var=sources, keys_var=keys)
            # Bug fix: record the new hashes so the same change is not
            # re-detected on every subsequent config-changed hook.
            self.state.repo_sources_hash = new_repo_sources_hash
            self.state.repo_keys_hash = new_repo_keys_hash
            self.state.needs_reinstall = True
            msg = "Filebeat repo changed, use reinstall action to obtain a new version."
            self.unit.status = BlockedStatus(msg)
            return
        elif self.state.repo_keys_hash != new_repo_keys_hash:
            configure_sources(update=True, sources_var=sources, keys_var=keys)
            self.state.repo_keys_hash = new_repo_keys_hash
        self.render_filebeat_template()

    def on_reinstall_action(self, event):
        """Purge and reinstall filebeat after a repo change."""
        if self.state.needs_reinstall:
            logger.info('Reinstalling filebeat')
            apt_purge('filebeat')
            apt_install('filebeat')
            self.state.needs_reinstall = False
            self.render_filebeat_template()

    def render_filebeat_template(self):
        """Create the filebeat.yaml config file.

        Renders the appropriate template for the major version of
        filebeat that is installed.
        """
        if self.model.config['kube_logs']:
            if os.path.exists(self.KUBE_CONFIG):
                msg = 'Collecting k8s metadata.'
            else:
                msg = ('kube_logs=True, but {} does not exist. '
                       'No k8s metadata will be collected.'.format(
                           self.KUBE_CONFIG))
            logger.info(msg)
        self.manage_filebeat_logstash_ssl()
        # TODO: actually render the filebeat template (not yet implemented).

    def manage_filebeat_logstash_ssl(self):
        """Manage the ssl cert/key that filebeat uses to connect to logstash.

        Create the cert/key files when both logstash_ssl options have been
        set; update when either config option changes; remove if either
        gets unset.
        """
        logstash_ssl_cert = self.model.config['logstash_ssl_cert']
        logstash_ssl_key = self.model.config['logstash_ssl_key']
        if logstash_ssl_cert and logstash_ssl_key:
            cert = base64.b64decode(logstash_ssl_cert).decode('utf8')
            key = base64.b64decode(logstash_ssl_key).decode('utf8')
            if cert != self.state.logstash_cert:
                render(template='{{ data }}', context={'data': cert},
                       target=self.LOGSTASH_SSL_CERT, perms=0o444)
                # Bug fix: remember what was written, otherwise the compare
                # above is always against None and we re-render every hook.
                self.state.logstash_cert = cert
            if key != self.state.logstash_key:
                render(template='{{ data }}', context={'data': key},
                       target=self.LOGSTASH_SSL_KEY, perms=0o400)
                self.state.logstash_key = key
        else:
            if not logstash_ssl_cert and os.path.exists(self.LOGSTASH_SSL_CERT):
                os.remove(self.LOGSTASH_SSL_CERT)
                self.state.logstash_cert = None
            if not logstash_ssl_key and os.path.exists(self.LOGSTASH_SSL_KEY):
                os.remove(self.LOGSTASH_SSL_KEY)
                self.state.logstash_key = None

    def on_beats_server_available(self, event):
        """Create the Filebeat index in Elasticsearch.

        Once elasticsearch is available, make 5 attempts to create a
        filebeat index. Set appropriate charm status so the operator
        knows when ES is configured to accept data.
        """
        # Bug fix: socket_addresses[0] picked one address dict and then
        # iterated over its keys; iterate over the address list instead.
        hosts = self.beats_server.socket_addresses
        for host in hosts:
            host_string = "{}:{}".format(host['host'], host['port'])
            max_attempts = 5
            # Bug fix: range(1, max_attempts) only made 4 attempts.
            for i in range(1, max_attempts + 1):
                if push_beat_index(elasticsearch=host_string,
                                   service='filebeat',
                                   fatal=False):
                    logger.info('Filebeat.index.pushed')
                    self.unit.status = ActiveStatus("Filebeat ready")
                    break
                else:
                    msg = "Attempt {} to push filebeat index failed (retrying)".format(i)
                    self.unit.status = WaitingStatus(msg)
                    time.sleep(i * 30)  # back off 30s for each attempt
            else:
                msg = "Failed to push filebeat index to http://{}".format(host_string)
                self.unit.status = BlockedStatus(msg)

    def on_remove(self, event):
        """Remove filebeat from the unit."""
        logger.info('Removing filebeat')
        apt_autoremove('filebeat')
        self.unit.status = MaintenanceStatus('Removing filebeat')
class MetallbSpeakerCharm(CharmBase):
    """Deploy and configure the MetalLB speaker on Kubernetes."""

    _stored = StoredState()
    NAMESPACE = os.environ["JUJU_MODEL_NAME"]
    CONTAINER_IMAGE = 'metallb/speaker:v0.9.3'

    def __init__(self, *args):
        super().__init__(*args)
        self.framework.observe(self.on.start, self.on_start)
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on.remove, self.on_remove)
        self._stored.set_default(things=[])

    def _on_config_changed(self, _):
        """Track new values of the 'thing' config option."""
        current = self.model.config["thing"]
        if current not in self._stored.things:
            logger.debug("found a new thing: %r", current)
            self._stored.things.append(current)

    def on_start(self, event):
        """Set the speaker pod spec and create the required RBAC objects.

        Leader-only: only the leader may set the pod spec.
        """
        if not self.framework.model.unit.is_leader():
            return
        # Consistency fix: use the module-level logger (was logging.info).
        logger.info('Setting the pod spec')
        self.framework.model.unit.status = MaintenanceStatus("Configuring pod")
        secret = utils._random_secret(128)
        self.framework.model.pod.set_spec(
            {
                'version': 3,
                'serviceAccount': {
                    'roles': [
                        {
                            'global': True,
                            'rules': [
                                {
                                    'apiGroups': [''],
                                    'resources': ['services', 'endpoints', 'nodes'],
                                    'verbs': ['get', 'list', 'watch'],
                                },
                                {
                                    'apiGroups': [''],
                                    'resources': ['events'],
                                    'verbs': ['create', 'patch'],
                                },
                                {
                                    'apiGroups': ['policy'],
                                    'resourceNames': ['speaker'],
                                    'resources': ['podsecuritypolicies'],
                                    'verbs': ['use'],
                                },
                            ],
                        },
                    ],
                },
                'containers': [{
                    'name': 'speaker',
                    'image': self.CONTAINER_IMAGE,
                    'imagePullPolicy': 'Always',
                    'ports': [{
                        'containerPort': 7472,
                        'protocol': 'TCP',
                        'name': 'monitoring'
                    }],
                    # Downward-API env vars consumed by the speaker binary.
                    'envConfig': {
                        'METALLB_NODE_NAME': {
                            'field': {
                                'path': 'spec.nodeName',
                                'api-version': 'v1'
                            }
                        },
                        'METALLB_HOST': {
                            'field': {
                                'path': 'status.hostIP',
                                'api-version': 'v1'
                            }
                        },
                        'METALLB_ML_BIND_ADDR': {
                            'field': {
                                'path': 'status.podIP',
                                'api-version': 'v1'
                            }
                        },
                        'METALLB_ML_LABELS': "app=metallb,component=speaker",
                        'METALLB_ML_NAMESPACE': {
                            'field': {
                                'path': 'metadata.namespace',
                                'api-version': 'v1'
                            }
                        },
                        'METALLB_ML_SECRET_KEY': {
                            'secret': {
                                'name': 'memberlist',
                                'key': 'secretkey'
                            }
                        }
                    },
                    # TODO: add constraint fields once they exist in pod_spec
                    # bug: https://bugs.launchpad.net/juju/+bug/1893123
                    # 'resources': {
                    #     'limits': {'cpu': '100m', 'memory': '100Mi'}
                    # },
                    'kubernetes': {
                        'securityContext': {
                            'allowPrivilegeEscalation': False,
                            'readOnlyRootFilesystem': True,
                            'capabilities': {
                                'add': ['NET_ADMIN', 'NET_RAW', 'SYS_ADMIN'],
                                'drop': ['ALL']
                            },
                        },
                        # fields do not exist in pod_spec
                        # 'TerminationGracePeriodSeconds': 2,
                    },
                }],
                'kubernetesResources': {
                    'secrets': [{
                        'name': 'memberlist',
                        'type': 'Opaque',
                        'data': {
                            'secretkey':
                                b64encode(secret.encode('utf-8')).decode('utf-8')
                        }
                    }]
                },
                'service': {
                    'annotations': {
                        'prometheus.io/port': '7472',
                        'prometheus.io/scrape': 'true'
                    }
                },
            },
        )
        if not self._create_rbac_resources():
            return
        self.framework.model.unit.status = ActiveStatus("Ready")

    def _create_rbac_resources(self) -> bool:
        """Create PSP, roles and role bindings required by the speaker.

        Returns True on success; on any failure, blocks the unit and
        returns False. (Refactor: this check was previously copy-pasted
        after each of the five API calls; typo "occured" also fixed.)
        """
        ns = self.NAMESPACE
        labels = {'app': 'metallb'}
        steps = [
            lambda: utils.create_pod_security_policy_with_k8s_api(namespace=ns),
            lambda: utils.create_namespaced_role_with_api(
                name='config-watcher', namespace=ns, labels=labels,
                resources=['configmaps'], verbs=['get', 'list', 'watch']),
            lambda: utils.create_namespaced_role_with_api(
                name='pod-lister', namespace=ns, labels=labels,
                resources=['pods'], verbs=['list']),
            lambda: utils.bind_role_with_api(
                name='config-watcher', namespace=ns, labels=labels,
                subject_name='speaker'),
            lambda: utils.bind_role_with_api(
                name='pod-lister', namespace=ns, labels=labels,
                subject_name='speaker'),
        ]
        for step in steps:
            if not step():
                self.framework.model.unit.status = BlockedStatus(
                    "An error occurred during init. Please check the logs.")
                return False
        return True

    def on_remove(self, event):
        """Leader-only remove hook (cleanup handled by K8s GC)."""
        if not self.framework.model.unit.is_leader():
            return
class OpenfaasCharm(CharmBase):
    """Deploy the OpenFaaS gateway, auth plugin and provider on Kubernetes."""

    _stored = StoredState()

    def __init__(self, *args):
        super().__init__(*args)
        self._stored.set_default(namespace=os.environ["JUJU_MODEL_NAME"])
        self._stored.set_default(nats_ip="")
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on["nats-address"].relation_joined,
                               self._on_nats_relation_joined)
        self.framework.observe(self.on["nats-address"].relation_changed,
                               self._on_nats_relation_changed)

    def _handle_nats_ip(self, event):
        """Record the NATS unit's IP from relation data and reconfigure.

        Refactor: the joined/changed handlers were byte-identical copies;
        both now delegate here. Also uses ``is None`` (was ``== None``).
        """
        ip = event.relation.data[event.unit].get("ip")
        if ip is None:
            return
        self._stored.nats_ip = ip
        logger.info("OF - nats says: {}".format(ip))
        self._on_config_changed()

    def _on_nats_relation_changed(self, event):
        self._handle_nats_ip(event)

    def _on_nats_relation_joined(self, event):
        self._handle_nats_ip(event)

    def _on_config_changed(self, _=None):
        """Build and apply the pod spec once a NATS address is known."""
        logger.info("OpenFaaS config_change")
        nats_ip = self._stored.nats_ip
        if nats_ip == "":
            self.unit.status = BlockedStatus("OpenFaaS needs a NATS relation")
            return
        self.unit.status = MaintenanceStatus('Setting pod spec.')
        logger.info("OpenFaaS building pod spec with nats_ip {}".format(nats_ip))
        pod_spec = self._build_pod_spec()
        self.model.pod.set_spec(pod_spec)
        self.unit.status = ActiveStatus("OpenFaaS pod ready.")

    def _build_pod_spec(self):
        """Construct the v3 pod spec for the three OpenFaaS containers.

        Raises nothing on a malformed rbac_rules.yaml — the YAML error is
        printed and the subsequent rules lookup will fail. TODO: CRD
        creation (functions/profiles) was scaffolded here but is disabled.
        """
        namespace = self._stored.namespace
        rules = []
        try:
            rules = yaml.load(open(Path('files/rbac_rules.yaml'), "r"),
                              Loader=yaml.FullLoader)
        except yaml.YAMLError as exc:
            print("Error in configuration file:", exc)
        username = self.model.config["admin_username"]
        password = self.model.config["admin_password"]
        # Shared mount exposing the basic-auth secret to every container.
        vol_config = [
            {
                "name": "auth",
                "mountPath": "/var/secrets",
                "secret": {"name": "basic-auth"}
            },
        ]
        spec = {
            "version": 3,
            "kubernetesResources": {
                'secrets': [{
                    'name': 'basic-auth',
                    'type': 'Opaque',
                    'data': {
                        'basic-auth-user':
                            b64encode(username.encode('utf-8')).decode('utf-8'),
                        'basic-auth-password':
                            b64encode(password.encode('utf-8')).decode('utf-8'),
                    }
                }],
            },
            'serviceAccount': {
                'roles': [{
                    'global': True,
                    'rules': rules["rules"],
                }],
            },
            "containers": [
                {
                    "name": self.app.name + "-gateway",
                    "imageDetails": {"imagePath": "openfaas/gateway:0.20.2"},
                    "ports": [{"containerPort": 8080, "protocol": "TCP",
                               "name": "gateway"}],
                    # Bug fix: "direct_functions" appeared twice in this dict
                    # (both "false"); the duplicate key is removed.
                    "envConfig": {
                        "faas_nats_address": self._stored.nats_ip,
                        "faas_nats_port": "4222",
                        "functions_provider_url": "http://127.0.0.1:8081/",
                        "direct_functions": "false",
                        "basic_auth": "true",
                        "faas_nats_channel": "faas-request",
                        "secret_mount_path": "/var/secrets",
                        # NOTE(review): hard-coded prometheus host/port —
                        # presumably a dev placeholder; confirm.
                        "faas_prometheus_host": "192.168.0.35",
                        "faas_prometheus_port": "9090",
                        "auth_pass_body": "false",
                        "auth_proxy_url": "http://127.0.0.1:8083/validate",
                        "scale_from_zero": "false",
                    },
                    "volumeConfig": vol_config,
                },
                {
                    "name": self.app.name + "-auth-plugin",
                    "imageDetails": {
                        "imagePath": "openfaas/basic-auth-plugin:0.20.2"},
                    "ports": [{"containerPort": 8083, "protocol": "TCP",
                               "name": "auth"}],
                    "envConfig": {
                        "basic_auth": "true",
                        "secret_mount_path": "/var/secrets",
                        "port": "8083",
                    },
                    "volumeConfig": vol_config,
                },
                {
                    "name": self.app.name + "-provider",
                    "imageDetails": {
                        "imagePath": "ghcr.io/openfaas/faas-netes:0.12.9"},
                    "ports": [{"containerPort": 8081, "protocol": "TCP",
                               "name": "provider"}],
                    "command": ["./faas-netes", "-operator=true"],
                    "envConfig": {
                        "port": "8081",
                        "operator": "true",
                        "basic_auth": "true",
                        "function_namespace": namespace,
                        "cluster_role": "true",
                        "profiles_namespace": namespace,
                    },
                    "volumeConfig": vol_config,
                }
            ]
        }
        return spec
class MetalLBControllerCharm(CharmBase):
    """MetalLB Controller Charm."""

    _stored = StoredState()

    def __init__(self, *args):
        """Charm initialization for events observation."""
        super().__init__(*args)
        if not self.unit.is_leader():
            self.unit.status = WaitingStatus("Waiting for leadership")
            return
        self.image = OCIImageResource(self, 'metallb-controller-image')
        self.framework.observe(self.on.install, self._on_start)
        self.framework.observe(self.on.start, self._on_start)
        self.framework.observe(self.on.leader_elected, self._on_start)
        self.framework.observe(self.on.upgrade_charm, self._on_upgrade)
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on.remove, self._on_remove)
        # -- initialize states --
        self._stored.set_default(k8s_objects_created=False)
        self._stored.set_default(started=False)
        self._stored.set_default(config_hash=self._config_hash())
        # -- base values --
        self._stored.set_default(namespace=os.environ["JUJU_MODEL_NAME"])

    def _config_hash(self):
        """Hash of the config values that require a pod respec on change."""
        data = json.dumps({
            'iprange': self.model.config['iprange'],
        }, sort_keys=True)
        return md5(data.encode('utf8')).hexdigest()

    def _on_start(self, event):
        """Occurs upon install, start, upgrade, and possibly config changed."""
        if self._stored.started:
            return
        self.unit.status = MaintenanceStatus("Fetching image information")
        try:
            image_info = self.image.fetch()
        except OCIImageResourceError:
            logging.exception('An error occured while fetching the image info')
            self.unit.status = BlockedStatus(
                "Error fetching image information")
            return
        if not self._stored.k8s_objects_created:
            self.unit.status = MaintenanceStatus("Creating supplementary "
                                                 "Kubernetes objects")
            utils.create_k8s_objects(self._stored.namespace)
            self._stored.k8s_objects_created = True
        self.unit.status = MaintenanceStatus("Configuring pod")
        self.set_pod_spec(image_info)
        self.unit.status = ActiveStatus()
        self._stored.started = True

    def _on_upgrade(self, event):
        """Occurs when new charm code or image info is available."""
        self._stored.started = False
        self._on_start(event)

    def _on_config_changed(self, event):
        """Re-run start logic when respec-relevant config changed."""
        if self.model.config['protocol'] != 'layer2':
            self.unit.status = BlockedStatus(
                'Invalid protocol; '
                'only "layer2" currently supported')
            return
        current_config_hash = self._config_hash()
        if current_config_hash != self._stored.config_hash:
            self._stored.started = False
            self._stored.config_hash = current_config_hash
            self._on_start(event)

    def _on_remove(self, event):
        """Remove of artifacts created by the K8s API."""
        self.unit.status = MaintenanceStatus("Removing supplementary "
                                             "Kubernetes objects")
        utils.remove_k8s_objects(self._stored.namespace)
        self.unit.status = MaintenanceStatus("Removing pod")
        self._stored.started = False
        self._stored.k8s_objects_created = False

    def set_pod_spec(self, image_info):
        """Set pod spec."""
        iprange = self.model.config["iprange"].split(",")
        # Idiom fix: the loop variable was named `range`, shadowing the
        # builtin for the rest of the method.
        cm = "address-pools:\n- name: default\n protocol: layer2\n addresses:\n"
        for addr_range in iprange:
            cm += " - " + addr_range + "\n"
        self.model.pod.set_spec(
            {
                'version': 3,
                'serviceAccount': {
                    'roles': [{
                        'global': True,
                        'rules': [
                            {
                                'apiGroups': [''],
                                'resources': ['services'],
                                'verbs': ['get', 'list', 'watch', 'update'],
                            },
                            {
                                'apiGroups': [''],
                                'resources': ['services/status'],
                                'verbs': ['update'],
                            },
                            {
                                'apiGroups': [''],
                                'resources': ['events'],
                                'verbs': ['create', 'patch'],
                            },
                            {
                                'apiGroups': ['policy'],
                                'resourceNames': ['controller'],
                                'resources': ['podsecuritypolicies'],
                                'verbs': ['use'],
                            },
                        ],
                    }],
                },
                'containers': [{
                    'name': 'controller',
                    'imageDetails': image_info,
                    'imagePullPolicy': 'Always',
                    'ports': [{
                        'containerPort': 7472,
                        'protocol': 'TCP',
                        'name': 'monitoring'
                    }],
                    # TODO: add constraint fields once they exist in pod_spec
                    # bug: https://bugs.launchpad.net/juju/+bug/1893123
                    # 'resources': {
                    #     'limits': {'cpu': '100m', 'memory': '100Mi'}
                    # },
                    'kubernetes': {
                        'securityContext': {
                            'privileged': False,
                            'runAsNonRoot': True,
                            'runAsUser': 65534,
                            'readOnlyRootFilesystem': True,
                            'capabilities': {
                                'drop': ['ALL']
                            }
                        },
                        # fields do not exist in pod_spec
                        # 'TerminationGracePeriodSeconds': 0,
                    },
                }],
                'service': {
                    'annotations': {
                        'prometheus.io/port': '7472',
                        'prometheus.io/scrape': 'true'
                    }
                },
                'configMaps': {
                    'config': {
                        'config': cm
                    }
                }
            },
        )
class MongoDBCharm(CharmBase):
    """A Juju Charm to deploy MongoDB on Kubernetes

    This charm has the following features:
    - Add one more MongoDB units
    - Reconfigure replica set anytime number of MongoDB units changes
    - Provides a database relation for any MongoDB client
    """

    state = StoredState()

    def __init__(self, *args):
        super().__init__(*args)
        self.state.set_default(pod_spec=None)
        self.state.set_default(mongodb_initialized=False)
        self.state.set_default(replica_set_hosts=None)
        self.port = MONGODB_PORT
        self.image = OCIImageResource(self, "mongodb-image")
        # Register all of the events we want to observe
        self.framework.observe(self.on.config_changed, self.configure_pod)
        self.framework.observe(self.on.upgrade_charm, self.configure_pod)
        self.framework.observe(self.on.start, self.on_start)
        self.framework.observe(self.on.stop, self.on_stop)
        self.framework.observe(self.on.update_status, self.on_update_status)
        self.framework.observe(self.on["database"].relation_changed,
                               self.on_database_relation_changed)
        self.framework.observe(self.on[PEER].relation_changed,
                               self.reconfigure)
        self.framework.observe(self.on[PEER].relation_departed,
                               self.reconfigure)
        logger.debug("MongoDBCharm initialized!")

    ##############################################
    #               CHARM HOOKS HANDLERS         #
    ##############################################

    # Handles config-changed and upgrade-charm events
    def configure_pod(self, event):
        """Configure MongoDB Pod specification

        A new MongoDB pod specification is set only if it is different
        from the current specification.
        """
        # Continue only if the unit is the leader
        if not self.unit.is_leader():
            self.on_update_status(event)
            return
        logger.debug("Running configuring_pod")
        # Fetch image information
        try:
            self.unit.status = WaitingStatus("Fetching image information")
            image_info = self.image.fetch()
        except OCIImageResourceError:
            self.unit.status = BlockedStatus(
                "Error fetching image information")
            return
        # Build Pod spec
        self.unit.status = WaitingStatus("Assembling pod spec")
        builder = PodSpecBuilder(
            name=self.model.app.name,
            replica_set_name=self.replica_set_name,
            port=self.port,
            image_info=image_info,
        )
        pod_spec = builder.make_pod_spec()
        # Update pod spec if the generated one is different
        # from the one previously applied
        if self.state.pod_spec != pod_spec:
            self.model.pod.set_spec(pod_spec)
            self.state.pod_spec = pod_spec
        self.on_update_status(event)
        logger.debug("Running configuring_pod finished")

    # Handles start event
    def on_start(self, event):
        """Initialize MongoDB

        This event handler is deferred if initialization of MongoDB
        replica set fails. By doing so it is guaranteed that another
        attempt at initialization will be made.
        """
        logger.debug("Running on_start")
        if not self.unit.is_leader():
            return
        if not self.mongo.is_ready():
            self.unit.status = WaitingStatus("Waiting for MongoDB Service")
            logger.debug("Waiting for MongoDB Service")
            event.defer()
            # Bug fix: previously fell through and attempted replica set
            # initialization against a service that was not ready yet.
            return
        if not self.state.mongodb_initialized:
            logger.debug("Initializing MongoDB")
            self.unit.status = WaitingStatus("Initializing MongoDB")
            try:
                self.mongo.initialize_replica_set(self.cluster_hosts)
                self.state.mongodb_initialized = True
                self.state.replica_set_hosts = self.cluster_hosts
                logger.debug("MongoDB Initialized")
            except Exception as e:
                # Broad catch is deliberate: any init failure defers the
                # event for a retry rather than erroring the hook.
                logger.info("Deferring on_start since : error={}".format(e))
                event.defer()
        self.on_update_status(event)
        logger.debug("Running on_start finished")

    # Handles stop event
    def on_stop(self, _):
        """Mark terminating unit as inactive
        """
        self.unit.status = MaintenanceStatus('Pod is terminating.')

    # Handles update-status event
    def on_update_status(self, event):
        """Set status for all units

        Status may be
        - MongoDB API server not reachable (service is not ready),
        - MongoDB Replication set is not Initialized
        - Unit is active
        """
        if not self.unit.is_leader():
            self.unit.status = ActiveStatus()
            return
        if not self.mongo.is_ready():
            status_message = "service not ready yet"
            self.unit.status = WaitingStatus(status_message)
            return
        if not self.state.mongodb_initialized:
            status_message = "mongodb not initialized"
            self.unit.status = WaitingStatus(status_message)
            return
        self.unit.status = ActiveStatus()

    ##############################################
    #        PEER RELATION HOOK HANDLERS         #
    ##############################################

    # Handles relation-changed and relation-departed events
    def reconfigure(self, event):
        """Reconfigure replica set

        The number of replicas in the MongoDB replica set is updated.
        """
        logger.debug("Running reconfigure")
        if (self.unit.is_leader() and
                self.need_replica_set_reconfiguration()):
            try:
                self.mongo.reconfigure_replica_set(self.cluster_hosts)
            except Exception as e:
                # Any reconfiguration failure defers the event for a retry.
                logger.info(
                    "Deferring relation event since : error={}".format(e))
                event.defer()
        self.on_update_status(event)
        logger.debug("Running reconfigure finished")

    ##############################################
    #               RELATIONS                    #
    ##############################################

    # handles client relation for MongoDB
    def on_database_relation_changed(self, event):
        """Connect to database client

        Any MongoDB client is provided with the following information
        - Is MongoDB in a replicated or unitary state
        - Replica set URI
        - Standalone URI

        Using this information a client can establish a database
        connection with MongoDB, for instances using the pymongo
        Python Module.
        """
        event.relation.data[self.unit]['replicated'] = str(self.is_joined)
        event.relation.data[
            self.unit]['replica_set_name'] = self.replica_set_name
        event.relation.data[self.unit]['standalone_uri'] = "{}".format(
            self.standalone_uri)
        event.relation.data[self.unit]['replica_set_uri'] = "{}".format(
            self.replica_set_uri)

    ##############################################
    #               PROPERTIES                   #
    ##############################################

    @property
    def mongo(self):
        """Return a MongoDB API client

        A pymongo client is returned.
        """
        return Mongo(standalone_uri=self.standalone_uri,
                     replica_set_uri="{}?replicaSet={}".format(
                         self.replica_set_uri, self.replica_set_name))

    @property
    def replica_set_uri(self):
        """Construct a replica set URI
        """
        # Idiom: join host:port pairs instead of manual index bookkeeping.
        hosts = ",".join(
            "{}:{}".format(host, self.port) for host in self.cluster_hosts)
        return "mongodb://" + hosts + "/"

    @property
    def standalone_uri(self):
        """Construct a standalone URI
        """
        return "mongodb://{}:{}/".format(self.model.app.name, self.port)

    @property
    def replica_set_name(self):
        """Find the replica set name
        """
        return self.model.config["replica_set_name"]

    @property
    def num_peers(self):
        """Find number of deployed MongoDB units
        """
        peer_relation = self.framework.model.get_relation(PEER)
        return len(peer_relation.units) + 1 if self.is_joined else 1

    @property
    def is_joined(self):
        """Does MongoDB charm have peers
        """
        peer_relation = self.framework.model.get_relation(PEER)
        return peer_relation is not None

    def _get_unit_hostname(self, _id: int) -> str:
        """Construct a DNS name for a MongoDB unit
        """
        return "{}-{}.{}-endpoints".format(self.model.app.name, _id,
                                           self.model.app.name)

    @property
    def cluster_hosts(self) -> list:
        """Find all hostnames for MongoDB units

        Bug fix: the ``self`` parameter was spuriously annotated ``int``.
        """
        return [self._get_unit_hostname(i) for i in range(self.num_peers)]

    def need_replica_set_reconfiguration(self):
        """Does MongoDB replica set need reconfiguration
        """
        return self.cluster_hosts != self.state.replica_set_hosts
class PrometheusCharm(CharmBase):
    """A Juju Charm for Prometheus
    """

    _stored = StoredState()

    def __init__(self, *args):
        logger.debug('Initializing Charm')
        super().__init__(*args)
        self._stored.set_default(alertmanagers=[])
        self._stored.set_default(alertmanager_port='9093')
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on.stop, self._on_stop)
        self.framework.observe(self.on['alertmanager'].relation_changed,
                               self._on_alertmanager_changed)
        self.framework.observe(self.on['alertmanager'].relation_broken,
                               self._on_alertmanager_broken)
        self.framework.observe(self.on['grafana-source'].relation_changed,
                               self._on_grafana_changed)

    def _on_config_changed(self, _):
        """Set a new Juju pod specification
        """
        self._configure_pod()

    def _on_stop(self, _):
        """Mark unit is inactive
        """
        self.unit.status = MaintenanceStatus('Pod is terminating.')

    def _on_grafana_changed(self, event):
        """Provide Grafana with data source information
        """
        event.relation.data[self.unit]['port'] = str(self.model.config['port'])
        event.relation.data[self.unit]['source-type'] = 'prometheus'

    def _on_alertmanager_changed(self, event):
        """Set an alertmanager configuration
        """
        if not self.unit.is_leader():
            return
        addrs = json.loads(event.relation.data[event.app].get('addrs', '[]'))
        port = event.relation.data[event.app]['port']
        self._stored.alertmanager_port = port
        self._stored.alertmanagers = addrs
        self._configure_pod()

    def _on_alertmanager_broken(self, event):
        """Remove all alertmanager configuration
        """
        if not self.unit.is_leader():
            return
        self._stored.alertmanagers.clear()
        self._configure_pod()

    def _cli_args(self):
        """Construct command line arguments for Prometheus
        """
        config = self.model.config
        args = [
            '--config.file=/etc/prometheus/prometheus.yml',
            '--storage.tsdb.path=/var/lib/prometheus',
            '--web.enable-lifecycle',
            '--web.console.templates=/usr/share/prometheus/consoles',
            '--web.console.libraries=/usr/share/prometheus/console_libraries'
        ]
        # get log level
        allowed_log_levels = ['debug', 'info', 'warn', 'error', 'fatal']
        if config.get('log-level'):
            log_level = config['log-level'].lower()
        else:
            log_level = 'info'
        # If log level is invalid set it to debug
        if log_level not in allowed_log_levels:
            # Consistency fix: use the module-level logger (was logging.error).
            logger.error('Invalid loglevel: {0} given, {1} allowed. '
                         'defaulting to DEBUG loglevel.'.format(
                             log_level, '/'.join(allowed_log_levels)))
            log_level = 'debug'
        # set log level
        args.append('--log.level={0}'.format(log_level))
        # Enable time series database compression
        if config.get('tsdb-wal-compression'):
            args.append('--storage.tsdb.wal-compression')
        # Set time series retention time
        if config.get('tsdb-retention-time') and self._is_valid_timespec(
                config['tsdb-retention-time']):
            args.append('--storage.tsdb.retention.time={}'.format(
                config['tsdb-retention-time']))
        return args

    def _is_valid_timespec(self, timeval):
        """Is a time interval unit and value valid
        """
        if not timeval:
            return False
        time, unit = timeval[:-1], timeval[-1]
        if unit not in ['y', 'w', 'd', 'h', 'm', 's']:
            logger.error('Invalid unit {} in time spec'.format(unit))
            return False
        try:
            int(time)
        except ValueError:
            logger.error('Can not convert time {} to integer'.format(time))
            return False
        if not int(time) > 0:
            logger.error('Expected positive time spec but got {}'.format(time))
            return False
        return True

    def _are_valid_labels(self, json_data):
        """Are Prometheus external labels valid
        """
        if not json_data:
            return False
        try:
            labels = json.loads(json_data)
        except (ValueError, TypeError):
            logger.error(
                'Can not parse external labels : {}'.format(json_data))
            return False
        if not isinstance(labels, dict):
            logger.error(
                'Expected label dictionary but got : {}'.format(labels))
            return False
        for key, value in labels.items():
            if not isinstance(key, str) or not isinstance(value, str):
                logger.error('External label keys/values must be strings')
                return False
        return True

    def _external_labels(self):
        """Extract external labels for Prometheus from configuration
        """
        config = self.model.config
        labels = {}
        if config.get('external-labels') and self._are_valid_labels(
                config['external-labels']):
            labels = json.loads(config['external-labels'])
        return labels

    def _prometheus_global_config(self):
        """Construct Prometheus global configuration
        """
        config = self.model.config
        global_config = {}
        labels = self._external_labels()
        if labels:
            global_config['external_labels'] = labels
        if config.get('scrape-interval') and self._is_valid_timespec(
                config['scrape-interval']):
            global_config['scrape_interval'] = config['scrape-interval']
        if config.get('scrape-timeout') and self._is_valid_timespec(
                config['scrape-timeout']):
            global_config['scrape_timeout'] = config['scrape-timeout']
        if config.get('evaluation-interval') and self._is_valid_timespec(
                config['evaluation-interval']):
            global_config['evaluation_interval'] = config[
                'evaluation-interval']
        return global_config

    def _alerting_config(self):
        """Construct Prometheus alerting configuration

        Returns '' (falsy) when no alertmanagers are related, otherwise
        the 'alerting' dict for prometheus.yml.
        """
        alerting_config = ''
        if len(self._stored.alertmanagers) < 1:
            logger.debug('No alertmanagers available')
            return alerting_config
        targets = []
        for manager in self._stored.alertmanagers:
            port = self._stored.alertmanager_port
            targets.append("{}:{}".format(manager, port))
        manager_config = {'static_configs': [{'targets': targets}]}
        alerting_config = {'alertmanagers': [manager_config]}
        return alerting_config

    def _prometheus_config(self):
        """Construct Prometheus configuration
        """
        config = self.model.config
        scrape_config = {
            'global': self._prometheus_global_config(),
            'scrape_configs': []
        }
        alerting_config = self._alerting_config()
        if alerting_config:
            scrape_config['alerting'] = alerting_config
        # By default only monitor prometheus server itself
        default_config = {
            'job_name': 'prometheus',
            'scrape_interval': '5s',
            'scrape_timeout': '5s',
            'metrics_path': '/metrics',
            'honor_timestamps': True,
            'scheme': 'http',
            'static_configs': [{
                'targets': ['localhost:{}'.format(config['port'])]
            }]
        }
        scrape_config['scrape_configs'].append(default_config)
        logger.debug('Prometheus config : {}'.format(scrape_config))
        return yaml.dump(scrape_config)

    def _build_pod_spec(self):
        """Construct a Juju pod specification for Prometheus
        """
        logger.debug('Building Pod Spec')
        config = self.model.config
        spec = {
            'version': 3,
            'containers': [{
                'name': self.app.name,
                'imageDetails': {
                    'imagePath': config['prometheus-image-path'],
                    'username': config.get('prometheus-image-username', ''),
                    'password': config.get('prometheus-image-password', '')
                },
                'args': self._cli_args(),
                'kubernetes': {
                    'readinessProbe': {
                        'httpGet': {
                            'path': '/-/ready',
                            'port': config['port']
                        },
                        'initialDelaySeconds': 10,
                        'timeoutSeconds': 30
                    },
                    'livenessProbe': {
                        'httpGet': {
                            'path': '/-/healthy',
                            'port': config['port']
                        },
                        'initialDelaySeconds': 30,
                        'timeoutSeconds': 30
                    }
                },
                'ports': [{
                    'containerPort': config['port'],
                    'name': 'prometheus-http',
                    'protocol': 'TCP'
                }],
                'volumeConfig': [{
                    'name': 'prometheus-config',
                    'mountPath': '/etc/prometheus',
                    'files': [{
                        'path': 'prometheus.yml',
                        'content': self._prometheus_config()
                    }]
                }]
            }]
        }
        return spec

    def _check_config(self):
        """Identify missing but required items in configuration

        :returns: list of missing configuration items (configuration keys)
        """
        logger.debug('Checking Config')
        config = self.model.config
        missing = []
        if not config.get('prometheus-image-path'):
            missing.append('prometheus-image-path')
        if config.get('prometheus-image-username') \
                and not config.get('prometheus-image-password'):
            missing.append('prometheus-image-password')
        return missing

    def _configure_pod(self):
        """Setup a new Prometheus pod specification
        """
        logger.debug('Configuring Pod')
        missing_config = self._check_config()
        if missing_config:
            logger.error('Incomplete Configuration : {}. '
                         'Application will be blocked.'.format(missing_config))
            self.unit.status = \
                BlockedStatus('Missing configuration: {}'.format(missing_config))
            return
        if not self.unit.is_leader():
            self.unit.status = ActiveStatus()
            return
        self.unit.status = MaintenanceStatus('Setting pod spec.')
        pod_spec = self._build_pod_spec()
        self.model.pod.set_spec(pod_spec)
        self.app.status = ActiveStatus()
        self.unit.status = ActiveStatus()
class SlurmdCharm(CharmBase):
    """Operator charm responsible for facilitating slurmd lifecycle events."""

    _stored = StoredState()

    def __init__(self, *args):
        """Wire up interfaces, default stored state and event observers."""
        super().__init__(*args)

        self.config = self.model.config
        self.slurm_manager = SlurmManager(self, 'slurmd')
        self.slurmd = SlurmdProvides(self, "slurmd")

        # Persisted charm state; initialised only on first run.
        self._stored.set_default(
            slurm_installed=False,
            slurm_config_available=False,
            slurm_config=dict(),
        )

        # Bind each lifecycle/relation event to its handler.
        observe = self.framework.observe
        observe(self.on.install, self._on_install)
        observe(self.on.config_changed, self._on_config_changed)
        observe(self.on.upgrade_charm, self._on_upgrade)
        observe(self.slurmd.on.slurmctld_available,
                self._on_render_config_and_restart)
        observe(self.slurmd.on.slurmctld_unavailable,
                self._on_render_config_and_restart)

    def _on_install(self, event):
        """Install the slurm scheduler as snap or tar file."""
        self.slurm_manager.install()
        self.unit.status = ActiveStatus("Slurm Installed")
        self._stored.slurm_installed = True

    def _on_upgrade(self, event):
        """Upgrade charm event handler."""
        # StoredState values are framework wrappers; copy to a plain dict.
        self.slurm_manager.upgrade(dict(self._stored.slurm_config),
                                   resource=False)

    def _on_config_changed(self, event):
        """Push current config onto the application relation data."""
        self.slurmd.force_set_config_on_app_relation_data()

    def _on_render_config_and_restart(self, event):
        """Retrieve slurm_config from controller and write slurm.conf."""
        ready = (self._stored.slurm_installed
                 and self._stored.slurm_config_available)
        if not ready:
            # Not installed or no config yet: block and retry later.
            self.unit.status = BlockedStatus(
                "Blocked need relation to slurmctld.")
            event.defer()
            return

        # cast StoredState -> python dict
        self.slurm_manager.render_config_and_restart(
            dict(self._stored.slurm_config))
        self.unit.status = ActiveStatus("Slurmd Available")

    def is_slurm_installed(self):
        """Return true/false based on whether or not slurm is installed."""
        return self._stored.slurm_installed

    def set_slurm_config_available(self, config_available):
        """Set slurm_config_available in local stored state."""
        self._stored.slurm_config_available = config_available

    def set_slurm_config(self, slurm_config):
        """Set the slurm_config in local stored state."""
        self._stored.slurm_config = slurm_config
class CephClientRequires(Object):
    """Requires side of the ceph-client interface.

    Builds Ceph broker requests (pool creation, key permissions), sends
    them over the relation, and tracks completion via the per-unit broker
    responses published by the ceph-mon/ceph-proxy units.
    """

    on = CephClientEvents()
    state = StoredState()

    def __init__(self, charm, relation_name):
        super().__init__(charm, relation_name)
        self.name = relation_name
        self.this_unit = self.model.unit
        self.relation_name = relation_name
        # pools_available flips True once a broker request completes;
        # broker_req caches the JSON of the last request we sent.
        self.state.set_default(
            pools_available=False,
            broker_req={})
        self.framework.observe(
            charm.on[relation_name].relation_joined,
            self.on_joined)
        self.framework.observe(
            charm.on[relation_name].relation_changed,
            self.on_changed)

    def on_joined(self, event):
        """Emit broker_available once the relation exists."""
        relation = self.model.get_relation(self.relation_name)
        if relation:
            logging.info("emiting broker_available")
            self.on.broker_available.emit()

    def request_osd_settings(self, settings):
        """Publish requested OSD settings on our unit's relation data.

        :param settings: OSD settings to request.
        :type settings: dict
        """
        relation = self.model.get_relation(self.relation_name)
        # sort_keys yields a stable serialisation so unchanged settings do
        # not trigger spurious relation-changed events on the remote side.
        relation.data[self.model.unit]['osd-settings'] = json.dumps(
            settings,
            sort_keys=True)

    @property
    def pools_available(self):
        # True once an outstanding broker request has been confirmed done.
        return self.state.pools_available

    def mon_hosts(self, mon_ips):
        """List of all monitor host public addresses"""
        hosts = []
        for ceph_addrs in mon_ips:
            # NOTE(jamespage): This looks odd but deals with
            #                  use with ceph-proxy which
            #                  presents all monitors in
            #                  a single space delimited field.
            for addr in ceph_addrs.split(' '):
                hosts.append(ch_ip.format_ipv6_addr(addr) or addr)
        hosts.sort()
        return hosts

    def get_relation_data(self):
        """Collect auth data and monitor addresses from related units.

        :returns: empty dict until some unit has provided both 'key' and
                  'auth'; then a dict with 'key', 'auth' and 'mon_hosts'.
        :rtype: dict
        """
        data = {}
        mon_ips = []
        for relation in self.framework.model.relations[self.relation_name]:
            for unit in relation.units:
                _data = {
                    'key': relation.data[unit].get('key'),
                    'auth': relation.data[unit].get('auth')}
                mon_ip = relation.data[unit].get('ceph-public-address')
                if mon_ip:
                    mon_ips.append(mon_ip)
                # Last unit with a complete key/auth pair wins.
                if all(_data.values()):
                    data = _data
        if data:
            data['mon_hosts'] = self.mon_hosts(mon_ips)
        return data

    def existing_request_complete(self):
        """Return True if our previously sent broker request completed."""
        rq = self.get_existing_request()
        if rq and self.is_request_complete(rq,
                                           self.model.relations[self.name]):
            return True
        return False

    def on_changed(self, event):
        """Emit pools_available once the broker confirms our request."""
        logging.info("ceph client on_changed")
        relation_data = self.get_relation_data()
        if relation_data:
            if self.existing_request_complete():
                logging.info("emiting pools available")
                self.state.pools_available = True
                self.on.pools_available.emit()
            else:
                logging.info("incomplete request. broker_req not found")

    def get_broker_rsp_key(self):
        """Return the unit-specific databag key the broker answers under."""
        return 'broker-rsp-{}'.format(self.this_unit.name.replace('/', '-'))

    def get_existing_request(self):
        """Rebuild a CephBrokerRq from the JSON cached in local state.

        :returns: request carrying any previously stored ops (empty if
                  nothing stored or the cached JSON is undecodable).
        :rtype: ch_ceph.CephBrokerRq
        """
        logging.info("get_existing_request")
        # json.dumps of the CephBrokerRq()
        rq = ch_ceph.CephBrokerRq()
        if self.state.broker_req:
            try:
                j = json.loads(self.state.broker_req)
                logging.info("Json request: {}".format(self.state.broker_req))
                rq.set_ops(j['ops'])
            except ValueError as err:
                # Best-effort: fall back to an empty request on bad JSON.
                logging.info("Unable to decode broker_req: {}. Error {}"
                             "".format(self.state.broker_req, err))
        return rq

    def create_replicated_pool(self, name, replicas=3, weight=None,
                               pg_num=None, group=None, namespace=None):
        """
        Request pool setup

        @param name: Name of pool to create
        @param replicas: Number of replicas for supporting pools
        @param weight: The percentage of data the pool makes up
        @param pg_num: If not provided, this value will be calculated by
                       the broker based on how many OSDs are in the cluster
                       at the time of creation. Note that, if provided,
                       this value will be capped at the current available
                       maximum.
        @param group: Group to add pool to.
        @param namespace: A group can optionally have a namespace defined
                          that will be used to further restrict pool access.
        """
        logging.info("create_replicated_pool")
        relations = self.framework.model.relations[self.name]
        logging.info("create_replicated_pool: {}".format(relations))
        if not relations:
            # No relation yet to carry the request.
            return
        # Extend the cached request rather than replacing it so earlier
        # ops (other pools, permissions) are preserved.
        rq = self.get_existing_request()
        logging.info("Adding create_replicated_pool request")
        rq.add_op_create_replicated_pool(name=name,
                                         replica_count=replicas,
                                         pg_num=pg_num,
                                         weight=weight,
                                         group=group,
                                         namespace=namespace)
        logging.info("Storing request")
        self.state.broker_req = rq.request
        logging.info("Calling send_request_if_needed")
        # ch_ceph.send_request_if_needed(rq, relation=self.name)
        self.send_request_if_needed(rq, relations)

    def request_ceph_permissions(self, client_name, permissions):
        """Request extra permissions for our Ceph client key.

        :param client_name: Ceph client whose key to update.
        :type client_name: str
        :param permissions: Permissions to request for the key.
        :type permissions: list
        """
        logging.info("request_ceph_permissions")
        relations = self.framework.model.relations[self.name]
        if not relations:
            return
        rq = self.get_existing_request()
        rq.add_op({'op': 'set-key-permissions',
                   'permissions': permissions,
                   'client': client_name})
        self.state.broker_req = rq.request
        # ch_ceph.send_request_if_needed(rq, relation=self.name)
        self.send_request_if_needed(rq, relations)

    def get_previous_request(self, relation):
        """Get the previous request.

        :param relation: Relation to check for existing request.
        :type relation: ops.model.Relation,
        :returns: The previous ceph request.
        :rtype: ch_ceph.CephBrokerRq
        """
        request = None
        # What we last wrote into our side of this relation's databag.
        broker_req = relation.data[self.this_unit].get('broker_req')
        if broker_req:
            request_data = json.loads(broker_req)
            request = ch_ceph.CephBrokerRq(
                api_version=request_data['api-version'],
                request_id=request_data['request-id'])
            request.set_ops(request_data['ops'])
        return request

    def get_request_states(self, request, relations):
        """Get the existing requests and their states.

        :param request: A CephBrokerRq object
        :type request: ch_ceph.CephBrokerRq
        :param relations: List of relations to check for existing request.
        :type relations: [ops.model.Relation, ...]
        :returns: Per-relation map of {'sent': bool, 'complete': bool}.
        :rtype: dict
        """
        complete = []
        requests = {}
        for relation in relations:
            complete = False
            previous_request = self.get_previous_request(relation)
            # Only an identical (functionally equivalent) request counts
            # as already sent on this relation.
            if request == previous_request:
                sent = True
                complete = self.is_request_complete_for_relation(
                    previous_request,
                    relation)
            else:
                sent = False
                complete = False
            rid = "{}:{}".format(relation.name, relation.id)
            requests[rid] = {
                'sent': sent,
                'complete': complete,
            }
        return requests

    def is_request_complete_for_relation(self, request, relation):
        """Check if a given request has been completed on the given relation

        :param request: A CephBrokerRq object
        :type request: ch_ceph.CephBrokerRq
        :param relation: A relation to check for an existing request.
        :type relation: ops.model.Relation
        :returns: Whether request is complete.
        :rtype: bool
        """
        broker_key = self.get_broker_rsp_key()
        for unit in relation.units:
            if relation.data[unit].get(broker_key):
                rsp = ch_ceph.CephBrokerRsp(relation.data[unit][broker_key])
                if rsp.request_id == request.request_id:
                    # exit_code 0 (falsy) signals broker success.
                    if not rsp.exit_code:
                        return True
            else:
                # The unit has answered someone, just not us yet.
                if relation.data[unit].get('broker_rsp'):
                    logging.info('No response for this unit yet')
        return False

    def is_request_complete(self, request, relations):
        """Check a functionally equivalent request has already been completed

        Returns True if a similar request has been completed

        :param request: A CephBrokerRq object
        :type request: ch_ceph.CephBrokerRq
        :param relations: List of relations to check for existing request.
        :type relations: [ops.model.Relation, ...]
        :returns: Whether request is complete.
        :rtype: bool
        """
        states = self.get_request_states(request, relations)
        for rid in states.keys():
            if not states[rid]['complete']:
                return False
        return True

    def is_request_sent(self, request, relations):
        """Check if a functionally equivalent request has already been sent

        Returns True if a similar request has been sent

        :param request: A CephBrokerRq object
        :type request: ch_ceph.CephBrokerRq
        :param relations: List of relations to check for existing request.
        :type relations: [ops.model.Relation, ...]
        :returns: Whether equivalent request has been sent.
        :rtype: bool
        """
        states = self.get_request_states(request, relations)
        for rid in states.keys():
            if not states[rid]['sent']:
                return False
        return True

    def send_request_if_needed(self, request, relations):
        """Send request if an equivalent request has not already been sent

        :param request: A CephBrokerRq object
        :type request: ch_ceph.CephBrokerRq
        :param relations: List of relations to check for existing request.
        :type relations: [ops.model.Relation, ...]
        """
        if self.is_request_sent(request, relations):
            logging.debug('Request already sent, not sending new request')
        else:
            for relation in relations:
                logging.debug('Sending request {}'.format(request.request_id))
                relation.data[self.this_unit]['broker_req'] = request.request
class RabbitMQAMQPProvides(Object):
    """Provides side of the AMQP interface.

    Watches the amqp relation, and once a client has published a username
    and vhost, provisions the vhost/user in RabbitMQ and hands back the
    password and hostname.
    """

    on = RabbitMQAMQPClientEvents()
    _stored = StoredState()

    def __init__(self, charm, relation_name):
        super().__init__(charm, relation_name)
        self.charm = charm
        self.relation_name = relation_name
        self.framework.observe(
            self.charm.on[relation_name].relation_joined,
            self._on_amqp_relation_joined
        )
        self.framework.observe(
            self.charm.on[relation_name].relation_changed,
            self._on_amqp_relation_changed
        )
        self.framework.observe(
            self.charm.on[relation_name].relation_broken,
            self._on_amqp_relation_broken
        )

    @property
    def _amqp_rel(self):
        """This AMQP relationship."""
        # NOTE(review): with multiple related apps get_relation() raises;
        # handlers below now key off event.relation instead of this.
        return self.framework.model.get_relation(self.relation_name)

    def _on_amqp_relation_joined(self, event):
        """Handle AMQP joined."""
        logging.debug("RabbitMQAMQPProvides on_joined")
        self.on.has_amqp_clients.emit()

    def _on_amqp_relation_changed(self, event):
        """Handle AMQP changed."""
        logging.debug("RabbitMQAMQPProvides on_changed")
        # Validate data on the relation
        if self.username(event) and self.vhost(event):
            self.on.ready_amqp_clients.emit()
            # Only the leader may write application relation data.
            if self.charm.unit.is_leader():
                self.set_amqp_credentials(
                    event, self.username(event), self.vhost(event))

    def _on_amqp_relation_broken(self, event):
        """Handle AMQP broken."""
        logging.debug("RabbitMQAMQPProvides on_departed")
        # TODO clear data on the relation

    def username(self, event):
        """Return the AMQP username from the client side of the relation."""
        # Fix: read the remote application's databag of the relation that
        # fired THIS event. The previous self._amqp_rel.app lookup picked an
        # arbitrary relation and breaks with multiple related client apps.
        return event.relation.data[event.relation.app].get("username")

    def vhost(self, event):
        """Return the AMQP vhost from the client side of the relation."""
        # Same fix as username(): key by the event's own remote app.
        return event.relation.data[event.relation.app].get("vhost")

    def set_amqp_credentials(self, event, username, vhost):
        """Set AMQP Credentials.

        :param event: The current event
        :type EventsBase
        :param username: The requested username
        :type username: str
        :param vhost: The requested vhost
        :type vhost: str
        :returns: None
        :rtype: None
        """
        # TODO: Can we move this into the charm code?
        # TODO TLS Support. Existing interfaces set ssl_port and ssl_ca
        logging.debug("Setting amqp connection information.")
        try:
            if not self.charm.does_vhost_exist(vhost):
                self.charm.create_vhost(vhost)
            password = self.charm.create_user(username)
            self.charm.set_user_permissions(username, vhost)
            # Publish credentials on our application databag for the client.
            event.relation.data[self.charm.app]["password"] = password
            event.relation.data[self.charm.app]["hostname"] = \
                self.charm.hostname
        except requests.exceptions.ConnectionError as e:
            # RabbitMQ management API not up yet; retry on a later hook.
            logging.warning(
                "Rabbitmq is not ready. Defering. Errno: {}".format(e.errno))
            event.defer()
class SlurmdCharm(CharmBase):
    """Slurmd lifecycle events."""

    _stored = StoredState()
    on = SlurmdCharmEvents()

    def __init__(self, *args):
        """Init _stored attributes and interfaces, observe events."""
        super().__init__(*args)

        self._stored.set_default(
            nhc_conf=str(),
            slurm_installed=False,
            slurmctld_available=False,
            slurmctld_started=False,
            cluster_name=str())

        self._slurm_manager = SlurmManager(self, "slurmd")
        self._fluentbit = FluentbitClient(self, "fluentbit")

        # interface to slurmctld, should only have one slurmctld per slurmd app
        self._slurmd = Slurmd(self, "slurmd")
        self._slurmd_peer = SlurmdPeer(self, "slurmd-peer")

        event_handler_bindings = {
            self.on.install: self._on_install,
            self.on.upgrade_charm: self._on_upgrade,
            self.on.update_status: self._on_update_status,
            self.on.config_changed: self._on_config_changed,
            self.on.slurmctld_started: self._on_slurmctld_started,
            self.on.slurmd_start: self._on_slurmd_start,
            self.on.check_etcd: self._on_check_etcd,
            self._slurmd.on.slurmctld_available: self._on_slurmctld_available,
            self._slurmd.on.slurmctld_unavailable:
                self._on_slurmctld_unavailable,
            # fluentbit
            self.on["fluentbit"].relation_created:
                self._on_configure_fluentbit,
            # actions
            self.on.version_action: self._on_version_action,
            self.on.node_configured_action: self._on_node_configured_action,
            self.on.get_node_inventory_action:
                self._on_get_node_inventory_action,
            self.on.show_nhc_config_action: self._on_show_nhc_config,
            # infiniband actions
            self.on.get_infiniband_repo_action: self.get_infiniband_repo,
            self.on.set_infiniband_repo_action: self.set_infiniband_repo,
            self.on.install_infiniband_action: self.install_infiniband,
            self.on.uninstall_infiniband_action: self.uninstall_infiniband,
            self.on.start_infiniband_action: self.start_infiniband,
            self.on.enable_infiniband_action: self.enable_infiniband,
            self.on.stop_infiniband_action: self.stop_infiniband,
            self.on.is_active_infiniband_action: self.is_active_infiniband,
            # nvidia actions
            self.on.nvidia_repo_action: self.nvidia_repo,
            self.on.nvidia_package_action: self.nvidia_package,
            self.on.nvidia_install_action: self.nvidia_install,
        }
        for event, handler in event_handler_bindings.items():
            self.framework.observe(event, handler)

    def _on_install(self, event):
        """Perform installation operations for slurmd."""
        # Surface the charm revision from the packed "version" file.
        self.unit.set_workload_version(Path("version").read_text().strip())

        self.unit.status = WaitingStatus("Installing slurmd")

        custom_repo = self.config.get("custom-slurm-repo")
        successful_installation = self._slurm_manager.install(custom_repo)
        logger.debug(f"### slurmd installed: {successful_installation}")

        if successful_installation:
            self._stored.slurm_installed = True
        else:
            self.unit.status = BlockedStatus("Error installing slurmd")
            event.defer()

        # Recompute unit status regardless of the install outcome.
        self._check_status()

    def _on_configure_fluentbit(self, event):
        """Set up Fluentbit log forwarding."""
        self._configure_fluentbit()

    def _configure_fluentbit(self):
        """Assemble NHC + slurm log configs and push them to fluentbit."""
        logger.debug("## Configuring fluentbit")
        cfg = list()
        cfg.extend(self._slurm_manager.fluentbit_config_nhc)
        cfg.extend(self._slurm_manager.fluentbit_config_slurm)
        self._fluentbit.configure(cfg)

    def _on_upgrade(self, event):
        """Perform upgrade operations."""
        self.unit.set_workload_version(Path("version").read_text().strip())

    def _on_update_status(self, event):
        """Handle update status."""
        self._check_status()

    def _check_status(self) -> bool:
        """Check if we have all needed components.

        - partition name
        - slurm installed
        - slurmctld available and working
        - munge key configured and working

        :returns: True only when everything is in place (unit goes
                  active); otherwise sets a Waiting/Blocked status and
                  returns False.
        """
        if self._slurm_manager.needs_reboot:
            self.unit.status = BlockedStatus("Machine needs reboot")
            return False

        if not self.get_partition_name():
            self.unit.status = WaitingStatus("Waiting on charm configuration")
            return False

        if not self._stored.slurm_installed:
            self.unit.status = BlockedStatus("Error installing slurmd")
            return False

        if not self._slurmd.is_joined:
            self.unit.status = BlockedStatus("Need relations: slurmctld")
            return False

        if not self._stored.slurmctld_available:
            self.unit.status = WaitingStatus("Waiting on: slurmctld")
            return False

        if not self._slurm_manager.check_munged():
            self.unit.status = BlockedStatus("Error configuring munge key")
            return False

        if not self._stored.slurmctld_started:
            self.unit.status = WaitingStatus("Waiting slurmctld to start")
            return False

        self.unit.status = ActiveStatus("slurmd available")
        return True

    def ensure_slurmd_starts(self, max_attemps=10) -> bool:
        """Ensure slurmd is up and running.

        Stops slurmd first, then restarts it with linear backoff until it
        reports active or attempts are exhausted.
        """
        logger.debug("## Stoping slurmd")
        self._slurm_manager.slurm_systemctl('stop')

        for i in range(max_attemps):
            if self._slurm_manager.slurm_is_active():
                logger.debug("## Slurmd running")
                break
            else:
                logger.warning("## Slurmd not running, trying to start it")
                self.unit.status = WaitingStatus("Starting slurmd")
                self._slurm_manager.restart_slurm_component()
                # Linear backoff between restart attempts.
                sleep(2 + i)

        if self._slurm_manager.slurm_is_active():
            return True
        else:
            self.unit.status = BlockedStatus("Cannot start slurmd")
            return False

    def _set_slurmctld_available(self, flag: bool):
        """Change stored value for slurmctld availability."""
        self._stored.slurmctld_available = flag

    def _set_slurmctld_started(self, flag: bool):
        """Change stored value for slurmctld started."""
        self._stored.slurmctld_started = flag

    def _on_slurmctld_available(self, event):
        """Get data from slurmctld and send inventory."""
        if not self._stored.slurm_installed:
            # Cannot configure until slurmd itself is installed.
            event.defer()
            return

        logger.debug(
            '#### Slurmctld available - setting overrides for configless')
        # get slurmctld host:port from relation and override systemd services
        host = self._slurmd.slurmctld_hostname
        port = self._slurmd.slurmctld_port
        self._slurm_manager.create_configless_systemd_override(host, port)
        self._slurm_manager.daemon_reload()

        self._write_munge_key_and_restart_munge()

        self._set_slurmctld_available(True)
        self._on_set_partition_info_on_app_relation_data(event)
        self._check_status()

        # check etcd for hostnames
        self.on.check_etcd.emit()

    def _on_check_etcd(self, event):
        """Check if node is accounted for.

        Check if slurmctld accounted for this node's inventory for the
        first time, if so, emit slurmctld_started event, so the node can
        start the daemon.
        """
        host = self._slurmd.slurmctld_address
        port = self._slurmd.etcd_port
        logger.debug(f"## Connecting to etcd3 in {host}:{port}")
        client = Etcd3Client(host=host, port=port, api_path="/v3/")

        logger.debug("## Querying etcd3 for node list")
        try:
            v = client.get(key="all_nodes")
            logger.debug(f"## Got: {v}")
        except Exception as e:
            # Broad catch is deliberate: any etcd failure just retries.
            logger.error(
                f"## Unable to connect to {host} to get list of nodes: {e}")
            event.defer()
            return

        node_accounted = False
        if v:
            # assumes the etcd value is a JSON list of hostnames in v[0]
            # - TODO confirm against slurmctld's writer side.
            hostnames = json.loads(v[0])
            logger.debug(f"### etcd3 node list: {hostnames}")
            if self.hostname in hostnames:
                self.on.slurmctld_started.emit()
                node_accounted = True

        if not node_accounted:
            logger.debug("## Node not accounted for. Deferring.")
            event.defer()

    def _on_slurmctld_unavailable(self, event):
        """Stop slurmd and clear slurmctld state when the relation drops."""
        logger.debug("## Slurmctld unavailable")
        self._set_slurmctld_available(False)
        self._set_slurmctld_started(False)
        self._slurm_manager.slurm_systemctl('stop')
        self._check_status()

    def _on_slurmctld_started(self, event):
        """Set flag to True and emit slurmd_start event."""
        self._set_slurmctld_started(True)
        self.on.slurmd_start.emit()

    def _on_slurmd_start(self, event):
        """Start slurmd once all prerequisites are satisfied."""
        if not self._check_status():
            event.defer()
            return

        # only set up fluentbit if we have a relation to it
        if self._fluentbit._relation is not None:
            self._configure_fluentbit()

        # at this point, we have slurm installed, munge configured, and we
        # know slurmctld accounted for this node. It should be safe to
        # start slurmd
        if self.ensure_slurmd_starts():
            logger.debug("## slurmctld started and slurmd is running")
        else:
            event.defer()
        self._check_status()

    def _on_config_changed(self, event):
        """Handle charm configuration changes."""
        if self.model.unit.is_leader():
            logger.debug("## slurmd config changed - leader")
            self._on_set_partition_info_on_app_relation_data(event)

        nhc_conf = self.model.config.get('nhc-conf')
        if nhc_conf:
            # Only re-render NHC config when it actually changed.
            if nhc_conf != self._stored.nhc_conf:
                self._stored.nhc_conf = nhc_conf
                self._slurm_manager.render_nhc_config(nhc_conf)

    def get_partition_name(self) -> str:
        """Return the partition_name in the slurmd relation."""
        # Determine if a user-supplied partition-name config exists, if so
        # ensure the partition_name is consistent with the supplied config.
        # If no partition name has been specified then generate one.
        partition_name = self._slurmd_peer.partition_name
        partition_name_from_config = self.config.get("partition-name")
        if partition_name:
            if partition_name_from_config:
                # Slurm partition names cannot contain spaces.
                partition_name_from_config = \
                    partition_name_from_config.replace(' ', '-')
                if partition_name != partition_name_from_config:
                    self._set_partition_name(partition_name_from_config)
                    partition_name = partition_name_from_config
                else:
                    logger.debug("Partition name unchanged.")
            else:
                logger.debug("Partition name unchanged.")
        else:
            # NOTE(review): the "osd-" prefix looks like it was copied from
            # another charm - confirm the intended default prefix.
            partition_name = f"osd-{self.app.name}"
            logger.debug(f"Partition name: {partition_name}")
            self._set_partition_name(partition_name)

        return partition_name

    def _set_partition_name(self, name: str):
        """Set the partition_name in the slurmd relation."""
        if self.model.unit.is_leader():
            self._slurmd_peer.partition_name = name

    def _write_munge_key_and_restart_munge(self):
        """Install the munge key from slurmctld and restart munged."""
        logger.debug('#### slurmd charm - writting munge key')

        self._slurm_manager.configure_munge_key(
            self._slurmd.get_stored_munge_key())

        if self._slurm_manager.restart_munged():
            logger.debug("## Munge restarted succesfully")
        else:
            logger.error("## Unable to restart munge")

    def _on_version_action(self, event):
        """Return version of installed components.

        - Slurm
        - munge
        - NHC
        - infiniband
        """
        version = {}
        version['slurm'] = self._slurm_manager.slurm_version()
        version['munge'] = self._slurm_manager.munge_version()
        version['nhc'] = self._slurm_manager.nhc_version()
        version['infiniband'] = self._slurm_manager.infiniband_version()

        event.set_results(version)

    def _on_node_configured_action(self, event):
        """Remove node from DownNodes."""
        # trigger reconfig
        self._slurmd.configure_new_node()
        logger.debug('### This node is not new anymore')

    def _on_get_node_inventory_action(self, event):
        """Return node inventory."""
        inventory = self._slurmd.node_inventory
        event.set_results({'inventory': inventory})

    def get_infiniband_repo(self, event):
        """Return the currently used infiniband repository."""
        repo = self._slurm_manager.infiniband.repository
        event.set_results({'infiniband-repo': repo})

    def set_infiniband_repo(self, event):
        """Set the infiniband repository.

        The action parameter "repo" is expected base64-encoded.
        """
        repo = event.params["repo"]
        logger.debug(f"#### setting custom infiniband repo: {repo}")
        repo = base64.b64decode(repo).decode()
        self._slurm_manager.infiniband.repository = repo

    def install_infiniband(self, event):
        """Install infiniband."""
        logger.debug("#### Installing Infiniband")
        self._slurm_manager.infiniband.install()
        event.set_results({'installation': 'Successfull. Please reboot node.'})
        self.unit.status = BlockedStatus("Need reboot for Infiniband")

    def uninstall_infiniband(self, event):
        """Uninstall infiniband."""
        logger.debug("#### Uninstalling Infiniband")
        self._slurm_manager.infiniband.uninstall()

    def start_infiniband(self, event):
        """Start Infiniband systemd service."""
        logger.debug("#### Starting Infiniband service")
        self._slurm_manager.infiniband.start()

    def enable_infiniband(self, event):
        """Enable Infiniband systemd service."""
        logger.debug("#### Enabling Infiniband service")
        self._slurm_manager.infiniband.enable()

    def stop_infiniband(self, event):
        """Stop Infiniband systemd service."""
        logger.debug("#### Stoping Infiniband service")
        self._slurm_manager.infiniband.stop()

    def is_active_infiniband(self, event):
        """Check if Infiniband systemd service is active."""
        status = self._slurm_manager.infiniband.is_active()
        logger.debug(f"#### Infiniband service is-active: {status}")
        event.set_results({'infiniband-is-active': status})

    def nvidia_repo(self, event):
        """Set or get the used nvidia repository.

        Always reports the current repository; updates it first when the
        (base64-encoded) "repo" parameter is supplied.
        """
        repo = event.params.get("repo", None)
        if repo:
            self._slurm_manager.nvidia.repository = base64.b64decode(
                repo).decode()

        event.set_results(
            {'nvidia-repo': self._slurm_manager.nvidia.repository})

    def nvidia_package(self, event):
        """Set or get the used nvidia package.

        An explicitly empty string is a valid "unset" value, hence the
        extra `package == ""` test alongside the truthiness check.
        """
        package = event.params.get("package", None)
        if package or package == "":
            # user supplied a package name -> store it
            self._slurm_manager.nvidia.package = package

        event.set_results(
            {'nvidia-package': self._slurm_manager.nvidia.package})

    def nvidia_install(self, event):
        """Install nvidia drivers."""
        logger.debug("#### Installing nvidia drivers: %s",
                     self._slurm_manager.nvidia.package)
        self._slurm_manager.nvidia.install()
        event.set_results({'installation': 'Successfull. Please reboot node.'})
        self.unit.status = BlockedStatus("Need reboot for nvidia")

    def _on_show_nhc_config(self, event):
        """Show current nhc.conf."""
        nhc_conf = self._slurm_manager.get_nhc_config()
        event.set_results({"nhc.conf": nhc_conf})

    def _on_set_partition_info_on_app_relation_data(self, event):
        """Set the slurm partition info on the application relation data."""
        # Only the leader can set data on the relation.
        if self.model.unit.is_leader():
            # If the relation with slurmctld exists then set our
            # partition info on the application relation data.
            # This handler shouldn't fire if the relation isn't made,
            # but add this extra check here just incase.
            if self._slurmd.is_joined:
                partition = self._assemble_partition()
                if partition:
                    self._slurmd.set_partition_info_on_app_relation_data(
                        partition)
                else:
                    event.defer()
            else:
                event.defer()

    def _assemble_partition(self):
        """Assemble the partition info."""
        partition_name = self.get_partition_name()
        partition_config = self.config.get("partition-config")
        partition_state = self.config.get("partition-state")
        logger.debug(f"## partition_name: {partition_name}")

        return {
            "partition_name": partition_name,
            "partition_state": partition_state,
            "partition_config": partition_config,
        }

    @property
    def hostname(self) -> str:
        """Return the hostname."""
        return self._slurm_manager.hostname

    @property
    def cluster_name(self) -> str:
        """Return the cluster-name."""
        return self._stored.cluster_name

    @cluster_name.setter
    def cluster_name(self, name: str):
        """Set the cluster-name."""
        self._stored.cluster_name = name