def update_target_mounts(self):
    # If mounts is None then nothing changed since the last update, so we can
    # just return. Not the same as [] (empty list), which means no mounts.
    if self.host_data["mounts"] is None:
        return

    # Loop over all mountables we expected on this host, whether they
    # were actually seen in the results or not.
    mounted_uuids = dict([(str(m["fs_uuid"]), m) for m in self.host_data["mounts"]])
    for target_mount in ManagedTargetMount.objects.filter(host=self.host):
        # Mounted-ness
        # ============
        mounted_locally = target_mount.target.uuid in mounted_uuids

        # Recovery status
        # ===============
        if mounted_locally:
            mount_info = mounted_uuids[target_mount.target.uuid]
            recovery_status = mount_info["recovery_status"]
        else:
            recovery_status = {}

        # Updates to active_mount and alerts for monitor-only targets are
        # done here instead of in resource_locations.
        if target_mount.target.immutable_state:
            target = target_mount.target
            if mounted_locally:
                job_scheduler_notify.notify(
                    target,
                    self.started_at,
                    {"state": "mounted", "active_mount_id": target_mount.id},
                    ["mounted", "unmounted"],
                )
            elif not mounted_locally and target.active_mount == target_mount:
                log.debug("clearing active_mount, %s %s", self.started_at, self.host)
                job_scheduler_notify.notify(
                    target,
                    self.started_at,
                    {"state": "unmounted", "active_mount_id": None},
                    ["mounted", "unmounted"],
                )

        with transaction.atomic():
            if target_mount.target.active_mount is None:
                TargetRecoveryInfo.update(target_mount.target, {})
                TargetRecoveryAlert.notify(target_mount.target, False)
            elif mounted_locally:
                recovering = TargetRecoveryInfo.update(target_mount.target, recovery_status)
                TargetRecoveryAlert.notify(target_mount.target, recovering)
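# Every snippet in this section funnels state changes through
# job_scheduler_notify.notify(). A minimal sketch of the contract as inferred
# from the call sites here (names like `state_modified_at` are assumptions,
# not the real implementation): notify(obj, time, update_attrs, from_states)
# applies update_attrs only if the notification is newer than the object's
# last state change and, when from_states is given, only if obj.state is
# currently one of from_states.
def notify_sketch(obj, time, update_attrs, from_states=None):
    if from_states and obj.state not in from_states:
        return  # object is in a state we were not told to update from
    if getattr(obj, "state_modified_at", None) and obj.state_modified_at > time:
        return  # stale notification, older than the last change (see test_late_notification)
    for attr, value in update_attrs.items():
        setattr(obj, attr, value)
    obj.save()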
def save(self, *args, **kwargs):
    self.full_clean()

    # Grab a copy of the outlet pre-save so that we can determine
    # which hosts need to have their fencing reconfigured.
    try:
        old_self = PowerControlDeviceOutlet.objects.get(pk=self.pk)
    except PowerControlDeviceOutlet.DoesNotExist:
        old_self = None

    super(PowerControlDeviceOutlet, self).save(*args, **kwargs)

    # Need to force a commit here to ensure that the updated outlet
    # configuration is available to other threads (e.g. fence reconfig).
    from django.db import transaction
    transaction.commit()

    reconfigure = {'reconfigure_fencing': True}
    previous = old_self.host if old_self else None
    for host in self._hosts_for_fence_reconfiguration(self.host, previous):
        if host.pacemaker_configuration:
            job_scheduler_notify.notify(host.pacemaker_configuration, tznow(), reconfigure)
        else:
            job_log.debug("Skipping reconfiguration of non-server %s" % host)
def test_late_notification(self):
    """Test that notifications are dropped when they are older than
    the last change to an object's state."""
    self.lnet_configuration = self.assertState(self.lnet_configuration, 'lnet_up')

    awhile_ago = django.utils.timezone.now() - datetime.timedelta(seconds=120)
    job_scheduler_notify.notify(freshen(self.lnet_configuration),
                                awhile_ago,
                                {'state': 'lnet_down'},
                                ['lnet_up'])

    self.assertEqual(freshen(self.lnet_configuration).state, 'lnet_up')
def test_buffered_notification(self):
    """Test that notifications for locked items are buffered and replayed
    when the locking Job has completed."""
    self.lnet_configuration = self.assertState(self.lnet_configuration, 'lnet_up')

    # Set boot_time to something that should change.
    now = django.utils.timezone.now()
    job_scheduler_notify.notify(freshen(self.host), now, {'boot_time': now})
    self.assertEqual(freshen(self.host).boot_time, now)

    # Not much later, but later enough (fastest boot EVAR).
    later = django.utils.timezone.now()
    self.assertNotEqual(later, now)

    # This is more direct than fooling around with trying to get the
    # timing right. Contrive a locking event on the host we want to
    # notify, and the notification should be buffered.
    self.job_scheduler._lock_cache.all_by_item[self.host] = ["fake lock"]
    job_scheduler_notify.notify(freshen(self.host), later, {'boot_time': later})

    # Now remove the lock and make sure that the second notification
    # didn't get through while the lock was held.
    del self.job_scheduler._lock_cache.all_by_item[self.host]
    self.assertEqual(freshen(self.host).boot_time, now)

    # Run any job, it doesn't matter which -- we just want to ensure that
    # the notification buffer is drained after the job completes.
    self.lnet_configuration = self.set_and_assert_state(self.lnet_configuration, 'lnet_down')
    self.assertEqual(freshen(self.host).boot_time, later)

    # Just for completeness, check that the notification buffer for this
    # host was completely drained and removed.
    buffer_key = (tuple(self.host.content_type.natural_key()), self.host.pk)
    self.assertEqual([], self.job_scheduler._notification_buffer.drain_notifications_for_key(buffer_key))
    self.assertEqual([], self.job_scheduler._notification_buffer.notification_keys)
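# test_buffered_notification above exercises a buffer keyed by
# (content_type natural key, pk). A minimal sketch of such a buffer, assuming
# this data layout (the real _notification_buffer may differ):
from collections import defaultdict

class NotificationBufferSketch(object):
    def __init__(self):
        self._buffer = defaultdict(list)

    def add_notification_for_key(self, key, notification):
        # Buffered while the item is locked by a running Job
        self._buffer[key].append(notification)

    def drain_notifications_for_key(self, key):
        # Remove and return everything buffered for this item
        return self._buffer.pop(key, [])

    @property
    def notification_keys(self):
        return list(self._buffer.keys())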
def update_properties(self, properties):
    if properties is not None:
        properties = json.dumps(properties)

        # Use the job scheduler to update, but only when necessary.
        if self.host.properties != properties:
            job_scheduler_notify.notify(self.host, self.started_at, {"properties": properties})
def update(self, boot_time, client_start_time):
    """
    :return: A boolean: True if the agent should be sent a SESSION_TERMINATE_ALL,
             i.e. a fresh client run (different start time) was seen.
    """
    self.last_contact = IMLDateTime.utcnow()
    if boot_time is not None and boot_time != self._boot_time:
        if self._boot_time is not None:
            HostRebootEvent.register_event(alert_item=self._host,
                                           boot_time=boot_time,
                                           severity=logging.WARNING)
            log.warning("Server %s rebooted at %s" % (self.fqdn, boot_time))
        self._boot_time = boot_time
        job_scheduler_notify.notify(self._host, self._boot_time, {'boot_time': boot_time})

    require_reset = False
    if client_start_time is not None and client_start_time != self._client_start_time:
        if self._client_start_time is not None:
            log.warning("Agent restart on server %s at %s" % (self.fqdn, client_start_time))
            require_reset = True

        self._client_start_time = client_start_time

    if not self._healthy:
        self.update_health(True)

    return require_reset
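# A compressed illustration of update()'s decision table above (times are
# made up; "first contact" means the cached value was still None):
#
#   (old_boot, new_boot, old_start, new_start) -> (reboot event?, require_reset)
#   (None, t1, None, s1) -> (no,  False)   # first contact: just record values
#   (t1,   t2, s1,   s1) -> (yes, False)   # server rebooted
#   (t1,   t1, s1,   s2) -> (no,  True)    # agent restarted, terminate sessions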
def test_notification(self):
    """Test that state notifications cause the state of an object to change."""
    self.lnet_configuration = self.assertState(self.lnet_configuration, "lnet_up")
    now = django.utils.timezone.now()
    job_scheduler_notify.notify(freshen(self.lnet_configuration), now, {"state": "lnet_down"}, ["lnet_up"])
    self.assertEqual(freshen(self.lnet_configuration).state, "lnet_down")
def update_resource_locations(self):
    # If resource_locations is None then nothing changed since the last update,
    # so we can just return. Not the same as [] (empty list), which means no
    # resource locations.
    if self.host_data["resource_locations"] is None:
        return

    if "crm_mon_error" in self.host_data["resource_locations"]:
        # It was not possible to obtain a list from corosync: corosync may
        # well be absent if we're monitoring a non-chroma-managed,
        # monitor-only system. But if there are managed mounts then this
        # is a problem.
        crm_mon_error = self.host_data["resource_locations"]["crm_mon_error"]
        if ManagedTarget.objects.filter(immutable_state=False, managedtargetmount__host=self.host).count():
            log.error(
                "Got no resource_locations from host %s, but there are chroma-configured mounts on that server!\n"
                "crm_mon returned rc=%s,stdout=%s,stderr=%s"
                % (self.host, crm_mon_error["rc"], crm_mon_error["stdout"], crm_mon_error["stderr"])
            )
        return

    for resource_name, node_name in self.host_data["resource_locations"].items():
        try:
            target = ManagedTarget.objects.get(ha_label=resource_name)
        except ManagedTarget.DoesNotExist:
            # audit_log.warning("Resource %s on host %s is not a known target" % (resource_name, self.host))
            continue

        # Only if we're operating on a Managed* rather than a purely monitored target
        if not target.immutable_state:
            if node_name is None:
                active_mount = None
            else:
                try:
                    host = ManagedHost.objects.get(Q(nodename=node_name) | Q(fqdn=node_name))
                    try:
                        active_mount = ManagedTargetMount.objects.get(target=target, host=host)
                    except ManagedTargetMount.DoesNotExist:
                        log.warning(
                            "Resource for target '%s' is running on host '%s', but there is no such TargetMount"
                            % (target, host)
                        )
                        active_mount = None
                except ManagedHost.DoesNotExist:
                    log.warning("Resource location node '%s' does not match any Host" % node_name)
                    active_mount = None

            job_scheduler_notify.notify(
                target,
                self.started_at,
                {
                    "state": ["unmounted", "mounted"][active_mount is not None],
                    "active_mount_id": None if active_mount is None else active_mount.id,
                },
                ["mounted", "unmounted"],
            )
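# For reference, the two payload shapes update_resource_locations() above has
# to handle (illustrative values; the key names come from the code):
resource_locations_example = {"testfs-MDT0000": "node1", "testfs-OST0000": None}  # resource -> node (or None)
resource_locations_error_example = {"crm_mon_error": {"rc": 107, "stdout": "", "stderr": "connection refused"}}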
def run(self, kwargs):
    corosync_configuration = kwargs['corosync_configuration']

    self.invoke_agent_expect_result(corosync_configuration.host,
                                    "change_mcast_port",
                                    {'old_mcast_port': kwargs['old_mcast_port'],
                                     'new_mcast_port': kwargs['new_mcast_port']})

    job_scheduler_notify.notify(corosync_configuration, now(), {'mcast_port': kwargs['new_mcast_port']})
def update_client_mounts(self):
    # Client mount audit comes in via metrics due to the way the
    # ClientAudit is implemented.
    try:
        client_mounts = self.host_data["metrics"]["raw"]["lustre_client_mounts"]
    except KeyError:
        client_mounts = []

    # If lustre_client_mounts is None then nothing changed since the last
    # update, so we can just return. Not the same as [] (empty list), which
    # means no mounts.
    if client_mounts is None:
        return

    expected_fs_mounts = LustreClientMount.objects.select_related("filesystem").filter(host=self.host)
    actual_fs_mounts = [m["mountspec"].split(":/")[1] for m in client_mounts]

    # Don't bother with the rest if there's nothing to do.
    if len(expected_fs_mounts) == 0 and len(actual_fs_mounts) == 0:
        return

    for expected_mount in expected_fs_mounts:
        if expected_mount.active and expected_mount.filesystem.name not in actual_fs_mounts:
            update = dict(state="unmounted", mountpoint=None)
            job_scheduler_notify.notify(expected_mount, self.started_at, update)
            log.info("updated mount %s on %s -> inactive" % (expected_mount.mountpoint, self.host))

    for actual_mount in client_mounts:
        fsname = actual_mount["mountspec"].split(":/")[1]
        try:
            mount = [m for m in expected_fs_mounts if m.filesystem.name == fsname][0]
            log.debug("mount: %s" % mount)
            if not mount.active:
                update = dict(state="mounted", mountpoint=actual_mount["mountpoint"])
                job_scheduler_notify.notify(mount, self.started_at, update)
                log.info("updated mount %s on %s -> active" % (actual_mount["mountpoint"], self.host))
        except IndexError:
            log.info("creating new mount %s on %s" % (actual_mount["mountpoint"], self.host))
            filesystem = ManagedFilesystem.objects.get(name=fsname)
            JobSchedulerClient.create_client_mount(self.host, filesystem, actual_mount["mountpoint"])
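# A quick worked example of the mountspec split used twice above (the NID and
# fsname are made up): everything after ":/" is the filesystem name.
assert "10.0.0.1@tcp0:/testfs".split(":/")[1] == "testfs"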
def run(self, kwargs):
    corosync_configuration = kwargs["corosync_configuration"]

    self.invoke_agent_expect_result(
        corosync_configuration.host,
        "change_mcast_port",
        {"old_mcast_port": kwargs["old_mcast_port"], "new_mcast_port": kwargs["new_mcast_port"]},
    )

    job_scheduler_notify.notify(corosync_configuration, now(), {"mcast_port": kwargs["new_mcast_port"]})
def run(self, kwargs):
    corosync_configuration = kwargs["corosync_configuration"]

    config = self.invoke_agent_expect_result(corosync_configuration.host, "get_corosync_autoconfig")

    # Select the dedicated link as ring0 to carry all the traffic by default;
    # this prevents congestion on the management network.
    ring0_name, ring0_config = next(
        (name, interface_config)
        for name, interface_config in config["interfaces"].items()
        if interface_config["dedicated"]
    )
    ring1_name, ring1_config = next(
        (name, interface_config)
        for name, interface_config in config["interfaces"].items()
        if not interface_config["dedicated"]
    )

    self.invoke_agent_expect_result(
        corosync_configuration.host,
        "configure_network",
        {
            "ring0_name": ring0_name,
            "ring1_name": ring1_name,
            "ring1_ipaddr": ring1_config["ipaddr"],
            "ring1_prefix": ring1_config["prefix"],
        },
    )

    self.invoke_agent_expect_result(
        corosync_configuration.host,
        "configure_corosync",
        {
            "ring0_name": ring0_name,
            "ring1_name": ring1_name,
            "old_mcast_port": corosync_configuration.mcast_port,
            "new_mcast_port": config["mcast_port"],
        },
    )

    job_scheduler_notify.notify(
        corosync_configuration,
        now(),
        {"mcast_port": config["mcast_port"], "network_interfaces": [ring0_name, ring1_name]},
    )
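# Note the ring-selection pattern above: next() over a generator expression
# raises StopIteration if no interface matches the dedicated/non-dedicated
# test. A hedged sketch of a variant that fails softly instead (not what the
# job does -- it lets the exception propagate and fail the job):
def pick_interface(interfaces, dedicated):
    return next(
        ((name, cfg) for name, cfg in interfaces.items() if cfg["dedicated"] == dedicated),
        (None, None),  # default returned instead of raising StopIteration
    )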
def run(self, kwargs):
    corosync_configuration = kwargs['corosync_configuration']

    self.invoke_agent_expect_result(corosync_configuration.host,
                                    "configure_corosync",
                                    {'ring0_name': kwargs['ring0_name'],
                                     'ring1_name': kwargs['ring1_name'],
                                     'old_mcast_port': kwargs['old_mcast_port'],
                                     'new_mcast_port': kwargs['new_mcast_port']})

    job_scheduler_notify.notify(corosync_configuration,
                                now(),
                                {'mcast_port': kwargs['new_mcast_port'],
                                 'network_interfaces': [kwargs['ring0_name'], kwargs['ring1_name']]})
def save(self, *args, **kwargs):
    self.full_clean()

    # Grab a copy of the outlet pre-save so that we can determine
    # which hosts need to have their fencing reconfigured.
    try:
        old_self = PowerControlDeviceOutlet.objects.get(pk=self.pk)
    except PowerControlDeviceOutlet.DoesNotExist:
        old_self = None

    super(PowerControlDeviceOutlet, self).save(*args, **kwargs)

    reconfigure = {"reconfigure_fencing": True}
    previous = old_self.host if old_self else None
    for host in self._hosts_for_fence_reconfiguration(self.host, previous):
        if host.pacemaker_configuration:
            job_scheduler_notify.notify(host.pacemaker_configuration, tznow(), reconfigure)
        else:
            job_log.debug("Skipping reconfiguration of non-server %s" % host)
def update_packages(self, packages):
    if not packages:
        # packages is allowed to be None (meaning this is not the initial
        # message, or there was a problem talking to RPM or yum).
        return

    # An update is required if:
    #  * A package is installed on the storage server for which there is a
    #    more recent version available on the manager, or
    #  * A package is available on the manager, and specified in the server's
    #    profile's list of packages, but is not installed on the storage server.

    # Update the package models
    needs_update = chroma_core.models.package.update(self.host, packages)

    # Check for any non-installed packages that should be installed
    for package in self.host.server_profile.serverprofilepackage_set.all():
        try:
            package_data = packages[package.bundle.bundle_name][package.package_name]
        except KeyError:
            log.warning("Expected package %s/%s not found in report from %s"
                        % (package.bundle.bundle_name, package.package_name, self.host))
            continue
        else:
            if not package_data['installed']:
                log.info("Update available (not installed): %s/%s on %s"
                         % (package.bundle.bundle_name, package.package_name, self.host))
                needs_update = True
                break

    log.info("update_packages(%s): updates=%s" % (self.host, needs_update))
    job_scheduler_notify.notify(self.host, self.started_at, {'needs_update': needs_update})
def run(self, kwargs):
    corosync_configuration = kwargs["corosync_configuration"]

    self.invoke_agent_expect_result(
        corosync_configuration.host,
        "configure_corosync",
        {
            "ring0_name": kwargs["ring0_name"],
            "ring1_name": kwargs["ring1_name"],
            "old_mcast_port": kwargs["old_mcast_port"],
            "new_mcast_port": kwargs["new_mcast_port"],
        },
    )

    job_scheduler_notify.notify(
        corosync_configuration,
        now(),
        {
            "mcast_port": kwargs["new_mcast_port"],
            "network_interfaces": [kwargs["ring0_name"], kwargs["ring1_name"]],
        },
    )
def run(self, kwargs):
    corosync_configuration = kwargs['corosync_configuration']

    config = self.invoke_agent_expect_result(corosync_configuration.host, "get_corosync_autoconfig")

    ring0_name, ring0_config = next((name, interface_config)
                                    for name, interface_config in config['interfaces'].items()
                                    if not interface_config['dedicated'])
    ring1_name, ring1_config = next((name, interface_config)
                                    for name, interface_config in config['interfaces'].items()
                                    if interface_config['dedicated'])

    self.invoke_agent_expect_result(corosync_configuration.host,
                                    "configure_network",
                                    {'ring0_name': ring0_name,
                                     'ring1_name': ring1_name,
                                     'ring1_ipaddr': ring1_config['ipaddr'],
                                     'ring1_prefix': ring1_config['prefix']})

    self.invoke_agent_expect_result(corosync_configuration.host,
                                    "configure_corosync",
                                    {'ring0_name': ring0_name,
                                     'ring1_name': ring1_name,
                                     'old_mcast_port': corosync_configuration.mcast_port,
                                     'new_mcast_port': config['mcast_port']})

    job_scheduler_notify.notify(corosync_configuration,
                                now(),
                                {'mcast_port': config['mcast_port'],
                                 'network_interfaces': [ring0_name, ring1_name]})
def run(self, kwargs):
    corosync_configuration = kwargs["corosync_configuration"]

    # Detect local interfaces for use in corosync 'rings': network-level configuration only.
    config = self.invoke_agent_expect_result(corosync_configuration.host, "get_corosync_autoconfig")

    # Select the dedicated link as ring0 to carry all the traffic by default;
    # this prevents congestion on the management network.
    ring0_name, ring0_config = next(
        (name, interface_config)
        for name, interface_config in config["interfaces"].items()
        if interface_config["dedicated"]
    )
    ring1_name, ring1_config = next(
        (name, interface_config)
        for name, interface_config in config["interfaces"].items()
        if not interface_config["dedicated"]
    )

    # Apply the configuration of the corosync 'rings': network-level configuration only.
    self.invoke_agent_expect_result(
        corosync_configuration.host,
        "configure_network",
        {
            "ring0_name": ring0_name,
            "ring1_name": ring1_name,
            "ring1_ipaddr": ring1_config["ipaddr"],
            "ring1_prefix": ring1_config["prefix"],
        },
    )

    logging.debug("Node %s returned corosync configuration %s" % (corosync_configuration.host.fqdn, config))

    # Serialize across nodes with the same mcast_port so that we ensure commands
    # are executed in the same order.
    with peer_mcast_ports_configuration_lock[config["mcast_port"]]:
        from chroma_core.models import ManagedHost

        corosync_peers = self._corosync_peers(corosync_configuration.host.fqdn, config["mcast_port"])

        logging.debug(
            "Node %s has corosync peers %s" % (corosync_configuration.host.fqdn, ",".join(corosync_peers))
        )

        # If we are adding a node then we act on a host that is already part of the cluster;
        # otherwise we have to act on the host we are adding, because it is the first node in the cluster.
        # TODO: Harden this up a little so it tries to pick a peer that is actively communicating; that might
        # be useful when adding a new host in place of an old host. Also, if ignoring a peer, should we
        # destroy that peer's corosync configuration?
        actioning_host = corosync_configuration.host
        if corosync_peers:
            peer = ManagedHost.objects.get(fqdn=corosync_peers[0])
            if peer.state in ["managed", "packages_installed"]:
                actioning_host = peer
            else:
                logging.warning(
                    "peer corosync config ignored as host state == %s (not packages_installed or managed)"
                    % peer.state
                )

        logging.debug(
            "actioning host for %s corosync configuration stage 2: %s"
            % (corosync_configuration.host.fqdn, actioning_host.fqdn)
        )

        # Stage 1 configures pcsd on the host being added: sets the password, enables and starts it, etc.
        self.invoke_agent_expect_result(
            corosync_configuration.host,
            "configure_corosync2_stage_1",
            {
                "mcast_port": config["mcast_port"],
                "pcs_password": self._pcs_password,
                "fqdn": corosync_configuration.host.fqdn,
            },
        )

        corosync_configuration.host.corosync_ring0 = ring0_config["ipaddr"]
        corosync_configuration.host.save(update_fields=["corosync_ring0"])

        # Stage 2 configures the cluster, either by creating it or by adding a node to it.
        self.invoke_agent_expect_result(
            actioning_host,
            "configure_corosync2_stage_2",
            {
                "ring0_name": ring0_name,
                "ring1_name": ring1_name,
                "new_node_fqdn": corosync_configuration.host.corosync_ring0,
                "mcast_port": config["mcast_port"],
                "pcs_password": self._pcs_password,
                "create_cluster": actioning_host == corosync_configuration.host,
            },
        )

        logging.debug("Node %s corosync configuration complete" % corosync_configuration.host.fqdn)

        job_scheduler_notify.notify(
            corosync_configuration,
            now(),
            {"mcast_port": config["mcast_port"], "network_interfaces": [ring0_name, ring1_name]},
        )
def on_data(self, fqdn, body):
    """Process all incoming messages from the corosync agent plugin.

    Requests to have the status changed for an instance. If the current
    state determines that a host is offline, then raise that alert.

    Old messages should not be processed. The datetime is the node's
    localtime in UTC, in the standard ISO string format.
    """
    try:
        host = ManagedHost.objects.get(fqdn=fqdn)
    except ManagedHost.DoesNotExist:
        # This might happen when we are deleting a host and the queues mean a message is still sat waiting
        # to be processed. Something has spoken to us and we don't know anything about it, so really we can't
        # do anything other than drop it.
        log.warning("Corosync message from unknown host %s, the message was dropped." % fqdn)
        return

    # If corosync is not configured yet, or we don't actually have corosync, then ignore the input.
    if (not host.corosync_configuration) or host.corosync_configuration.state == 'unconfigured':
        return

    if body.get('state'):
        job_scheduler_notify.notify(host.corosync_configuration,
                                    timezone.now(),
                                    {'state': body['state']['corosync']})

        job_scheduler_notify.notify(host.pacemaker_configuration,
                                    timezone.now(),
                                    {'state': body['state']['pacemaker']})

        if body['state']['corosync'] == 'stopped':
            return
    else:
        if host.corosync_configuration.state != 'started':
            return

    if body.get('crm_info'):
        nodes = body['crm_info']['nodes']
        dt = body['crm_info']['datetime']

        options = body['crm_info'].get('options', {'stonith_enabled': None})
        stonith_enabled = options['stonith_enabled']

        try:
            dt = IMLDateTime.parse(dt)
        except ValueError:
            if dt != '':
                log.warning("Invalid date or tz string from corosync plugin: %s" % dt)
                raise

        def is_new(peer_node_identifier):
            return (peer_node_identifier not in self._host_status or
                    self._host_status[peer_node_identifier].datetime < dt)

        peers_str = "; ".join(["%s: online=%s, new=%s"
                               % (peer_node_identifier, data['online'], is_new(peer_node_identifier))
                               for peer_node_identifier, data in nodes.items()])
        log.debug("Incoming peer report from %s: %s" % (fqdn, peers_str))

        # NB: This will ignore any unknown peers in the report.
        cluster_nodes = ManagedHost.objects.select_related('ha_cluster_peers').filter(
            Q(nodename__in=nodes.keys()) | Q(fqdn__in=nodes.keys()))

        unknown_nodes = (set(nodes.keys()) -
                         set([h.nodename for h in cluster_nodes]) -
                         set([h.fqdn for h in cluster_nodes]))

        # Leaving this out for now because it raises issues caused by limitations in the simulator and the
        # test system as a whole. It is difficult to know whether the alerts will be raised; it all depends
        # on past state.
        # CorosyncUnknownPeersAlert.notify(host.corosync_configuration, unknown_nodes != set())
        if unknown_nodes:
            log.warning("Unknown nodes in report from %s: %s" % (fqdn, unknown_nodes))

        if stonith_enabled is not None:
            StonithNotEnabledAlert.notify(host.corosync_configuration, stonith_enabled is False)

        CorosyncNoPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) == 1)
        # CorosyncToManyPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) > 2)

        # Consider all nodes in the peer group for this reporting agent.
        for host in cluster_nodes:
            try:
                data = nodes[host.nodename]
                node_identifier = host.nodename
            except KeyError:
                data = nodes[host.fqdn]
                node_identifier = host.fqdn

            cluster_peer_keys = sorted([node.pk for node in cluster_nodes if node is not host])

            if is_new(node_identifier) and host.corosync_configuration:
                host_reported_online = data['online'] == 'true'

                log.debug("Corosync processing peer %s of %s" % (host.fqdn, fqdn))

                # Raise an alert; the system suppresses duplicates.
                log.debug("Alert notify on %s: active=%s" % (host, not host_reported_online))
                HostOfflineAlert.notify(host, not host_reported_online)
                if not host_reported_online:
                    log.debug("Host %s offline" % host.fqdn)
                else:
                    log.debug("Host %s online" % host.fqdn)

                # Attempt to save the state.
                if host.corosync_configuration.corosync_reported_up != host_reported_online:
                    job_scheduler_notify.notify(host.corosync_configuration,
                                                timezone.now(),
                                                {'corosync_reported_up': host_reported_online})

                peer_host_peer_keys = sorted([h.pk for h in host.ha_cluster_peers.all()])
                if peer_host_peer_keys != cluster_peer_keys:
                    job_scheduler_notify.notify(host, timezone.now(), {'ha_cluster_peers': cluster_peer_keys})

                # Keep internal track of the host's state.
                self._host_status[node_identifier] = self.HostStatus(status=host_reported_online,
                                                                     datetime=dt)
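# on_data() above records per-peer state via self.HostStatus(status=...,
# datetime=...) and later reads .datetime in is_new(). A minimal sketch of
# what that record could be (an assumption; the real definition lives
# elsewhere in the service class):
from collections import namedtuple

HostStatus = namedtuple("HostStatus", ["status", "datetime"])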
def run(self, kwargs):
    corosync_configuration = kwargs['corosync_configuration']

    # Detect local interfaces for use in corosync 'rings': network-level configuration only.
    config = self.invoke_agent_expect_result(corosync_configuration.host, "get_corosync_autoconfig")

    ring0_name, ring0_config = next((name, interface_config)
                                    for name, interface_config in config['interfaces'].items()
                                    if not interface_config['dedicated'])
    ring1_name, ring1_config = next((name, interface_config)
                                    for name, interface_config in config['interfaces'].items()
                                    if interface_config['dedicated'])

    # Apply the configuration of the corosync 'rings': network-level configuration only.
    self.invoke_agent_expect_result(corosync_configuration.host,
                                    "configure_network",
                                    {'ring0_name': ring0_name,
                                     'ring1_name': ring1_name,
                                     'ring1_ipaddr': ring1_config['ipaddr'],
                                     'ring1_prefix': ring1_config['prefix']})

    logging.debug("Node %s returned corosync configuration %s" % (corosync_configuration.host.fqdn, config))

    # Serialize across nodes with the same mcast_port so that we ensure commands
    # are executed in the same order.
    with peer_mcast_ports_configuration_lock[config['mcast_port']]:
        from chroma_core.models import ManagedHost

        corosync_peers = self._corosync_peers(corosync_configuration.host.fqdn, config['mcast_port'])

        logging.debug("Node %s has corosync peers %s"
                      % (corosync_configuration.host.fqdn, ",".join(corosync_peers)))

        # If we are adding a node then we act on a host that is already part of the cluster;
        # otherwise we have to act on the host we are adding, because it is the first node in the cluster.
        # TODO: Harden this up a little so it tries to pick a peer that is actively communicating; that might
        # be useful when adding a new host in place of an old host. Also, if ignoring a peer, should we
        # destroy that peer's corosync configuration?
        actioning_host = corosync_configuration.host
        if corosync_peers:
            peer = ManagedHost.objects.get(fqdn=corosync_peers[0])
            if peer.state in ['managed', 'packages_installed']:
                actioning_host = peer
            else:
                logging.warning('peer corosync config ignored as host state == %s (not packages_installed or '
                                'managed)' % peer.state)

        logging.debug('actioning host for %s corosync configuration stage 2: %s'
                      % (corosync_configuration.host.fqdn, actioning_host.fqdn))

        # Stage 1 configures pcsd on the host being added: sets the password, enables and starts it, etc.
        self.invoke_agent_expect_result(corosync_configuration.host,
                                        "configure_corosync2_stage_1",
                                        {'mcast_port': config['mcast_port'],
                                         'pcs_password': self._pcs_password})

        # Stage 2 configures the cluster, either by creating it or by adding a node to it.
        self.invoke_agent_expect_result(actioning_host,
                                        "configure_corosync2_stage_2",
                                        {'ring0_name': ring0_name,
                                         'ring1_name': ring1_name,
                                         'new_node_fqdn': corosync_configuration.host.fqdn,
                                         'mcast_port': config['mcast_port'],
                                         'pcs_password': self._pcs_password,
                                         'create_cluster': actioning_host == corosync_configuration.host})

        logging.debug("Node %s corosync configuration complete" % corosync_configuration.host.fqdn)

        job_scheduler_notify.notify(corosync_configuration,
                                    now(),
                                    {'mcast_port': config['mcast_port'],
                                     'network_interfaces': [ring0_name, ring1_name]})
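# Both corosync2 jobs above serialize their stage 1 / stage 2 commands
# through peer_mcast_ports_configuration_lock, keyed by mcast_port. A minimal
# sketch of such a lock table, assuming a defaultdict of re-entrant locks
# (the real object may differ):
import threading
from collections import defaultdict

peer_mcast_ports_configuration_lock_sketch = defaultdict(threading.RLock)

# Usage mirrors the jobs above:
#   with peer_mcast_ports_configuration_lock_sketch[mcast_port]:
#       ...configure corosync on this cluster...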
def _call(cls, host, cmd, args):
    cls.calls.append((cmd, args))
    cls.host_calls[host.fqdn].append((cmd, args))

    if not cls.succeed:
        cls._fail(host.fqdn)

    if (cmd, args) in cls.fail_commands:
        cls._fail(host.fqdn)

    mock_server = cls.mock_servers[host.address]

    log.info("invoke_agent %s %s %s" % (host, cmd, args))

    # This isn't really accurate because lnet is scanned asynchronously, but it is as close as we can
    # get today.
    # FIXME: Also, I now think this is writing to the wrong thing and should be changing the mock_server
    # entries to lnet_up; I guess the mock_server needs an lnet state really, rather than relying on nids
    # being present.
    if cmd == "load_lnet":
        synthetic_lnet_configuration(host, mock_server["nids"])
        return
    elif cmd == "device_plugin":
        # Only returns nid info today.
        return create_synthetic_device_info(host, mock_server, args["plugin"])
    elif cmd == "format_target":
        inode_size = None
        if "mkfsoptions" in args:
            inode_arg = re.search(r"-I (\d+)", args["mkfsoptions"])
            if inode_arg:
                inode_size = int(inode_arg.group(1))

        if inode_size is None:
            # A 'foo' value
            inode_size = 777

        return {
            "uuid": str(uuid.uuid1()),
            "inode_count": 666,
            "inode_size": inode_size,
            "filesystem_type": "ext4",
        }
    elif cmd == "stop_target":
        ha_label = args["ha_label"]
        target = ManagedTarget.objects.get(ha_label=ha_label)
        return agent_result_ok
    elif cmd == "start_target":
        ha_label = args["ha_label"]
        target = ManagedTarget.objects.get(ha_label=ha_label)
        return agent_result(target.primary_host.nodename)
    elif cmd == "register_target":
        # Assume mount paths are "/mnt/testfs-OST0001" style
        mount_point = args["mount_point"]
        label = re.search(r"/mnt/([^\s]+)", mount_point).group(1)
        return {"label": label}
    elif cmd == "detect_scan":
        return mock_server["detect-scan"]
    elif cmd == "install_packages":
        return agent_result([])
    elif cmd == "register_server":
        api_client = TestApiClient()
        old_is_authenticated = CsrfAuthentication.is_authenticated
        try:
            CsrfAuthentication.is_authenticated = mock.Mock(return_value=True)
            api_client.client.login(username="******", password="******")
            fqdn = cls.mock_servers[host]["fqdn"]

            response = api_client.post(
                args["url"] + "register/%s/" % args["secret"],
                data={
                    "address": host,
                    "fqdn": fqdn,
                    "nodename": cls.mock_servers[host]["nodename"],
                    "capabilities": ["manage_targets"],
                    "version": cls.version,
                    "csr": helper.generate_csr(fqdn),
                },
            )
            assert response.status_code == 201
            registration_data = Serializer().deserialize(response.content, format=response["Content-Type"])
            print("MockAgent.invoke returning %s" % registration_data)
            return registration_data
        finally:
            CsrfAuthentication.is_authenticated = old_is_authenticated
    elif cmd == "kernel_status":
        return {
            "running": "fake_kernel-0.1",
            "required": "fake_kernel-0.1",
            "available": ["fake_kernel-0.1"],
        }
    elif cmd == "selinux_status":
        return {"status": "Disabled"}
    elif cmd == "reboot_server":
        now = IMLDateTime.utcnow()
        log.info("rebooting %s; updating boot_time to %s" % (host, now))
        job_scheduler_notify.notify(host, now, {"boot_time": now})
    elif cmd == "which zfs":
        return 1
    elif "import platform;" in cmd:
        return "0"
    elif "socket.gethostbyname(socket.gethostname())" in cmd:
        if not mock_server["tests"]["hostname_valid"]:
            return "127.0.0.1"
        else:
            return mock_server["address"]
    elif "print os.uname()[1]" in cmd:
        return "%s\n%s" % (mock_server["nodename"], mock_server["fqdn"])
    elif "socket.getfqdn()" in cmd:
        return mock_server["fqdn"]
    elif "ping" in cmd:
        result = (0 if mock_server["tests"]["reverse_resolve"] else 2) + (
            0 if mock_server["tests"]["reverse_ping"] else 1
        )
        return result
    elif "ElectricFence" in cmd:
        return 0 if mock_server["tests"]["yum_can_update"] else 1
    elif "openssl version -a" in cmd:
        return 0 if mock_server["tests"]["openssl"] else 1
    elif "curl -k https" in cmd:
        return json.dumps({"host_id": host.id, "command_id": 0})
    elif cmd in [
        "configure_pacemaker",
        "unconfigure_pacemaker",
        "configure_target_store",
        "unconfigure_target_store",
        "deregister_server",
        "restart_agent",
        "shutdown_server",
        "host_corosync_config",
        "check_block_device",
        "set_conf_param",
        "purge_configuration",
    ]:
        return None
    elif cmd in [
        "configure_target_ha",
        "unconfigure_target_ha",
        "start_lnet",
        "stop_lnet",
        "unload_lnet",
        "unconfigure_lnet",
        "configure_corosync",
        "unconfigure_corosync",
        "start_corosync",
        "stop_corosync",
        "start_pacemaker",
        "stop_pacemaker",
        "configure_ntp",
        "unconfigure_ntp",
        "import_target",
        "export_target",
        "set_profile",
        "update_profile",
        "failover_target",
        "failback_target",
        "configure_network",
        "open_firewall",
        "close_firewall",
    ]:
        return agent_result_ok
    elif cmd == "get_corosync_autoconfig":
        return agent_result(
            {
                "interfaces": {
                    "eth0": {"dedicated": False, "ipaddr": "192.168.0.1", "prefix": 24},
                    "eth1": {"dedicated": True, "ipaddr": "10.10.0.01", "prefix": 24},
                },
                "mcast_port": "666",
            }
        )
    else:
        assert False, (
            "The %s command is not in the known list for MockAgentRpc. Please add it; then, when people "
            "modify it, a simple text search will let them know to change it here as well." % cmd
        )
def on_success(self):
    from chroma_core.models import HaCluster

    for h in HaCluster.host_peers(self.corosync_configuration.host):
        job_scheduler_notify.notify(h.corosync_configuration, now(), {"mcast_port": self.mcast_port})
def update_packages(self, package_report):
    if not package_report:
        # package_report is allowed to be None (meaning this is not the initial
        # message, or there was a problem talking to RPM or yum).
        return

    # An update is required if:
    #  * A package is installed on the storage server for which there is a
    #    more recent version available on the manager, or
    #  * A package is available on the manager, and specified in the server's
    #    profile's list of packages, but is not installed on the storage server.

    def _version_info_list(package_data):
        return [VersionInfo(*package) for package in package_data]

    def _updates_available(installed_versions, available_versions):
        # Versions are of the form (EPOCH, VERSION, RELEASE, ARCH).

        # Map of arch to the highest installed version
        max_installed_version = {}

        for installed_info in installed_versions:
            max_inst = max_installed_version.get(installed_info.arch, None)
            if max_inst is None or installed_info > max_inst:
                max_installed_version[installed_info.arch] = installed_info

        for available_info in available_versions:
            max_inst = max_installed_version.get(available_info.arch, None)
            if max_inst is not None and available_info > max_inst:
                log.debug("Update available: %s > %s" % (available_info, max_inst))
                return True

        return False

    updates = False
    repos = package_report.keys()
    for package in self.host.server_profile.serverprofilepackage_set.all():
        package_data = {}
        for repo in repos:
            try:
                package_data = package_report[repo][package.package_name]
            except KeyError:
                continue
            break

        if not package_data:
            log.warning("Required package %s not available for %s" % (package.package_name, self.host))
            continue

        if not package_data["installed"]:
            log.info("Update available (not installed): %s on %s" % (package.package_name, self.host))
            updates = True
            break

        if _updates_available(_version_info_list(package_data["installed"]),
                              _version_info_list(package_data["available"])):
            log.info("Update needed: %s on %s" % (package.package_name, self.host))
            updates = True
            break

    log.info("update_packages(%s): updates=%s" % (self.host, updates))
    job_scheduler_notify.notify(self.host, self.started_at, {"needs_update": updates})
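# _updates_available() above compares VersionInfo records with ">" within
# each architecture. A minimal sketch of such a record, assuming plain tuple
# ordering over (epoch, version, release, arch) -- a simplification: real RPM
# ordering uses rpmvercmp semantics, which lexicographic string comparison
# does not reproduce for all version strings:
from collections import namedtuple

VersionInfoSketch = namedtuple("VersionInfoSketch", ["epoch", "version", "release", "arch"])

# e.g. 2.7.0-1 is newer than 2.6.9-5 on the same arch:
assert VersionInfoSketch(0, "2.7.0", "1", "x86_64") > VersionInfoSketch(0, "2.6.9", "5", "x86_64")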
def _call(cls, host, cmd, args):
    cls.calls.append((cmd, args))
    cls.host_calls[host].append((cmd, args))

    if not cls.succeed:
        cls._fail(host.fqdn)

    if (cmd, args) in cls.fail_commands:
        cls._fail(host.fqdn)

    mock_server = cls.mock_servers[host.address]

    log.info("invoke_agent %s %s %s" % (host, cmd, args))

    # This isn't really accurate because lnet is scanned asynchronously, but it is as close as we can
    # get today.
    # FIXME: Also, I now think this is writing to the wrong thing and should be changing the mock_server
    # entries to lnet_up; I guess the mock_server needs an lnet state really, rather than relying on nids
    # being present.
    if cmd == "load_lnet":
        synthetic_lnet_configuration(host, mock_server['nids'])
        return
    elif cmd == "device_plugin":
        # Only returns nid info today.
        return create_synthetic_device_info(host, mock_server, args['plugin'])
    elif cmd == 'format_target':
        inode_size = None
        if 'mkfsoptions' in args:
            inode_arg = re.search(r"-I (\d+)", args['mkfsoptions'])
            if inode_arg:
                inode_size = int(inode_arg.group(1))

        if inode_size is None:
            # A 'foo' value
            inode_size = 777

        return {'uuid': str(uuid.uuid1()),
                'inode_count': 666,
                'inode_size': inode_size,
                'filesystem_type': 'ext4'}
    elif cmd == 'stop_target':
        ha_label = args['ha_label']
        target = ManagedTarget.objects.get(ha_label=ha_label)
        return agent_result_ok
    elif cmd == 'start_target':
        ha_label = args['ha_label']
        target = ManagedTarget.objects.get(ha_label=ha_label)
        return agent_result(target.primary_host.nodename)
    elif cmd == 'register_target':
        # Assume mount paths are "/mnt/testfs-OST0001" style
        mount_point = args['mount_point']
        label = re.search(r"/mnt/([^\s]+)", mount_point).group(1)
        return {'label': label}
    elif cmd == 'detect_scan':
        return mock_server['detect-scan']
    elif cmd == 'install_packages':
        return agent_result([])
    elif cmd == 'register_server':
        api_client = TestApiClient()
        old_is_authenticated = CsrfAuthentication.is_authenticated
        try:
            CsrfAuthentication.is_authenticated = mock.Mock(return_value=True)
            api_client.client.login(username='******', password='******')
            fqdn = cls.mock_servers[host]['fqdn']

            response = api_client.post(args['url'] + "register/%s/" % args['secret'],
                                       data={'address': host,
                                             'fqdn': fqdn,
                                             'nodename': cls.mock_servers[host]['nodename'],
                                             'capabilities': ['manage_targets'],
                                             'version': cls.version,
                                             'csr': helper.generate_csr(fqdn)})
            assert response.status_code == 201
            registration_data = Serializer().deserialize(response.content, format=response['Content-Type'])
            print("MockAgent.invoke returning %s" % registration_data)
            return registration_data
        finally:
            CsrfAuthentication.is_authenticated = old_is_authenticated
    elif cmd == 'kernel_status':
        return {
            'running': 'fake_kernel-0.1',
            'required': 'fake_kernel-0.1',
            'available': ['fake_kernel-0.1']
        }
    elif cmd == 'reboot_server':
        now = IMLDateTime.utcnow()
        log.info("rebooting %s; updating boot_time to %s" % (host, now))
        job_scheduler_notify.notify(host, now, {'boot_time': now})
    elif 'socket.gethostbyname(socket.gethostname())' in cmd:
        if not mock_server['tests']['hostname_valid']:
            return '127.0.0.1'
        else:
            return mock_server['address']
    elif 'print os.uname()[1]' in cmd:
        return '%s\n%s' % (mock_server['nodename'], mock_server['fqdn'])
    elif 'socket.getfqdn()' in cmd:
        return mock_server['fqdn']
    elif 'ping' in cmd:
        result = ((0 if mock_server['tests']['reverse_resolve'] else 2) +
                  (0 if mock_server['tests']['reverse_ping'] else 1))
        return result
    elif 'python-fedora-django' in cmd:
        return 0 if mock_server['tests']['yum_valid_repos'] else 1
    elif 'ElectricFence' in cmd:
        return 0 if mock_server['tests']['yum_can_update'] else 1
    elif 'curl -k https' in cmd:
        return json.dumps({'host_id': host.id, 'command_id': 0})
    elif cmd in ['configure_pacemaker', 'unconfigure_pacemaker',
                 'configure_rsyslog', 'unconfigure_rsyslog',
                 'configure_target_store', 'unconfigure_target_store',
                 'deregister_server', 'restart_agent', 'shutdown_server',
                 'host_corosync_config', 'check_block_device',
                 'set_conf_param', 'purge_configuration']:
        return None
    elif cmd in ['configure_target_ha', 'unconfigure_target_ha',
                 'start_lnet', 'stop_lnet', 'unload_lnet', 'unconfigure_lnet',
                 'configure_corosync', 'unconfigure_corosync',
                 'start_corosync', 'stop_corosync',
                 'start_pacemaker', 'stop_pacemaker',
                 'configure_ntp', 'unconfigure_ntp',
                 'import_target', 'export_target',
                 'set_profile', 'update_profile',
                 'failover_target', 'failback_target',
                 'configure_network', 'open_firewall', 'close_firewall']:
        return agent_result_ok
    elif cmd == 'get_corosync_autoconfig':
        return agent_result({'interfaces': {'eth0': {'dedicated': False, 'ipaddr': '192.168.0.1', 'prefix': 24},
                                            'eth1': {'dedicated': True, 'ipaddr': '10.10.0.01', 'prefix': 24}},
                             'mcast_port': '666'})
    else:
        assert False, ("The %s command is not in the known list for MockAgentRpc. Please add it; then, when "
                       "people modify it, a simple text search will let them know to change it here as well."
                       % cmd)