Example #1
    def update_target_mounts(self):
        # If mounts is None then nothing changed since the last update, so we can just return.
        # This is not the same as an empty list [], which means there are no mounts.
        if self.host_data["mounts"] is None:
            return

        # Loop over all mountables we expected on this host, whether they
        # were actually seen in the results or not.
        mounted_uuids = dict([(str(m["fs_uuid"]), m)
                              for m in self.host_data["mounts"]])
        for target_mount in ManagedTargetMount.objects.filter(host=self.host):

            # Mounted-ness
            # ============
            mounted_locally = target_mount.target.uuid in mounted_uuids

            # Recovery status
            # ===============
            if mounted_locally:
                mount_info = mounted_uuids[target_mount.target.uuid]
                recovery_status = mount_info["recovery_status"]
            else:
                recovery_status = {}

            # Update to active_mount and alerts for monitor-only
            # targets done here instead of resource_locations
            if target_mount.target.immutable_state:
                target = target_mount.target
                if mounted_locally:
                    job_scheduler_notify.notify(
                        target,
                        self.started_at,
                        {
                            "state": "mounted",
                            "active_mount_id": target_mount.id
                        },
                        ["mounted", "unmounted"],
                    )
                elif not mounted_locally and target.active_mount == target_mount:
                    log.debug("clearing active_mount, %s %s", self.started_at,
                              self.host)

                    job_scheduler_notify.notify(
                        target,
                        self.started_at,
                        {
                            "state": "unmounted",
                            "active_mount_id": None
                        },
                        ["mounted", "unmounted"],
                    )

            with transaction.atomic():
                if target_mount.target.active_mount is None:
                    TargetRecoveryInfo.update(target_mount.target, {})
                    TargetRecoveryAlert.notify(target_mount.target, False)
                elif mounted_locally:
                    recovering = TargetRecoveryInfo.update(
                        target_mount.target, recovery_status)
                    TargetRecoveryAlert.notify(target_mount.target, recovering)
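The None-versus-empty-list check at the top of update_target_mounts is a convention used throughout these audit handlers. A minimal sketch of that tri-state contract (the payload shape is an assumption inferred from the code above):

    def interpret_mounts(mounts):
        # None means "unchanged since the last audit": skip all processing.
        if mounts is None:
            return "no-op"
        # [] means "audited and found no mounts": existing state must be cleared.
        if not mounts:
            return "clear mount state"
        # Otherwise a list of dicts keyed by fs_uuid, as consumed above.
        return "reconcile %d mounts" % len(mounts)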
Example #2
    def save(self, *args, **kwargs):
        self.full_clean()

        # Grab a copy of the outlet pre-save so that we can determine
        # which hosts need to have their fencing reconfigured.
        try:
            old_self = PowerControlDeviceOutlet.objects.get(pk=self.pk)
        except PowerControlDeviceOutlet.DoesNotExist:
            old_self = None

        super(PowerControlDeviceOutlet, self).save(*args, **kwargs)

        # Need to force a commit here to ensure that the updated outlet
        # configuration is available to other threads (e.g. fence reconfig).
        from django.db import transaction
        transaction.commit()

        reconfigure = {'reconfigure_fencing': True}
        previous = old_self.host if old_self else None
        for host in self._hosts_for_fence_reconfiguration(self.host, previous):
            if host.pacemaker_configuration:
                job_scheduler_notify.notify(host.pacemaker_configuration,
                                            tznow(), reconfigure)
            else:
                job_log.debug("Skipping reconfiguration of non-server %s" %
                              host)
Example #3
    def test_late_notification(self):
        """Test that notifications are dropped when they are older than
        the last change to an object's state"""
        self.lnet_configuration = self.assertState(self.lnet_configuration, 'lnet_up')
        awhile_ago = django.utils.timezone.now() - datetime.timedelta(seconds=120)
        job_scheduler_notify.notify(freshen(self.lnet_configuration), awhile_ago, {'state': 'lnet_down'}, ['lnet_up'])
        self.assertEqual(freshen(self.lnet_configuration).state, 'lnet_up')
Example #4
    def test_buffered_notification(self):
        """Test that notifications for locked items are buffered and
        replayed when the locking Job has completed."""
        self.lnet_configuration = self.assertState(self.lnet_configuration, 'lnet_up')

        # Set boot_time to something that should change.
        now = django.utils.timezone.now()
        job_scheduler_notify.notify(freshen(self.host), now, {'boot_time': now})
        self.assertEqual(freshen(self.host).boot_time, now)

        # Not much later, but later enough (fastest boot EVAR).
        later = django.utils.timezone.now()
        self.assertNotEqual(later, now)

        # This is more direct than fooling around with trying to get the
        # timing right. Contrive a locking event on the host we want to
        # notify, and the notification should be buffered.
        self.job_scheduler._lock_cache.all_by_item[self.host] = ["fake lock"]
        job_scheduler_notify.notify(freshen(self.host), later, {'boot_time': later})

        # Now, remove the lock and make sure that the second notification
        # didn't get through during the lock.
        del self.job_scheduler._lock_cache.all_by_item[self.host]
        self.assertEqual(freshen(self.host).boot_time, now)

        # Run any job, doesn't matter -- we just want to ensure that the
        # notification buffer is drained after the job completes.
        self.lnet_configuration = self.set_and_assert_state(self.lnet_configuration, 'lnet_down')
        self.assertEqual(freshen(self.host).boot_time, later)

        # Just for completeness, check that the notification buffer for this
        # host was completely drained and removed.
        buffer_key = (tuple(self.host.content_type.natural_key()), self.host.pk)
        self.assertEqual([], self.job_scheduler._notification_buffer.drain_notifications_for_key(buffer_key))
        self.assertEqual([], self.job_scheduler._notification_buffer.notification_keys)
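For context, here is a minimal sketch of the notification buffer this test drains. The real _notification_buffer is internal to the JobScheduler; the class below only illustrates the keying scheme (content-type natural key plus pk) that the test assumes:

    from collections import defaultdict

    class NotificationBuffer(object):
        """Illustrative only: buffers notifications per (content_type, pk) key."""
        def __init__(self):
            self._buffer = defaultdict(list)

        def add_notification_for_key(self, key, notification):
            self._buffer[key].append(notification)

        @property
        def notification_keys(self):
            return list(self._buffer.keys())

        def drain_notifications_for_key(self, key):
            # Draining removes the key, so a second drain returns [].
            return self._buffer.pop(key, [])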
Example #5
    def update_properties(self, properties):
        if properties is not None:
            properties = json.dumps(properties)
            # Use the job scheduler to update, but only as necessary.
            if self.host.properties != properties:
                job_scheduler_notify.notify(self.host, self.started_at,
                                            {"properties": properties})
Example #6
    def update(self, boot_time, client_start_time):
        """
        :return A boolean, true if the agent should be sent a SESSION_TERMINATE_ALL: indicates
                whether a fresh client run (different start time) is seen.
        """
        self.last_contact = IMLDateTime.utcnow()
        if boot_time is not None and boot_time != self._boot_time:
            if self._boot_time is not None:
                HostRebootEvent.register_event(alert_item=self._host,
                                               boot_time=boot_time,
                                               severity=logging.WARNING)
                log.warning("Server %s rebooted at %s" %
                            (self.fqdn, boot_time))
            self._boot_time = boot_time
            job_scheduler_notify.notify(self._host, self._boot_time,
                                        {'boot_time': boot_time})

        require_reset = False
        if client_start_time is not None and client_start_time != self._client_start_time:
            if self._client_start_time is not None:
                log.warning("Agent restart on server %s at %s" %
                            (self.fqdn, client_start_time))
            require_reset = True

            self._client_start_time = client_start_time

        if not self._healthy:
            self.update_health(True)

        return require_reset
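A hypothetical caller sketch (the sessions map, queue object, and message shape are assumptions, not shown in this snippet): the boolean returned by update() is what triggers a full session reset for a restarted agent.

    if sessions[fqdn].update(boot_time, client_start_time):
        # The agent process restarted, so all plugin sessions are stale.
        queue.send(fqdn, {"type": "SESSION_TERMINATE_ALL"})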
Example #7
    def test_notification(self):
        """Test that state notifications cause the state of an object to change"""
        self.lnet_configuration = self.assertState(self.lnet_configuration,
                                                   "lnet_up")
        now = django.utils.timezone.now()
        job_scheduler_notify.notify(freshen(self.lnet_configuration), now,
                                    {"state": "lnet_down"}, ["lnet_up"])
        self.assertEqual(freshen(self.lnet_configuration).state, "lnet_down")
Example #8
    def update_resource_locations(self):
        # If resource_locations is None then nothing changed since the last update, so we can just return.
        # This is not the same as an empty list [], which means there are no resource_locations.
        if self.host_data["resource_locations"] is None:
            return

        if "crm_mon_error" in self.host_data["resource_locations"]:
            # This means it was not possible to obtain a resource list from corosync:
            # corosync may well be absent if we're monitoring a non-chroma-managed,
            # monitor-only system. But if there are managed mounts, then this is a problem.
            crm_mon_error = self.host_data["resource_locations"]["crm_mon_error"]
            if ManagedTarget.objects.filter(immutable_state=False, managedtargetmount__host=self.host).count():
                log.error(
                    "Got no resource_locations from host %s, but there are chroma-configured mounts on that server!\n"
                    "crm_mon returned rc=%s,stdout=%s,stderr=%s"
                    % (self.host, crm_mon_error["rc"], crm_mon_error["stdout"], crm_mon_error["stderr"])
                )
            return

        for resource_name, node_name in self.host_data["resource_locations"].items():
            try:
                target = ManagedTarget.objects.get(ha_label=resource_name)
            except ManagedTarget.DoesNotExist:
                # audit_log.warning("Resource %s on host %s is not a known target" % (resource_name, self.host))
                continue

            # If we're operating on a Managed* rather than a purely monitored target
            if not target.immutable_state:
                if node_name is None:
                    active_mount = None
                else:
                    try:
                        host = ManagedHost.objects.get(Q(nodename=node_name) | Q(fqdn=node_name))
                        try:
                            active_mount = ManagedTargetMount.objects.get(target=target, host=host)
                        except ManagedTargetMount.DoesNotExist:
                            log.warning(
                                "Resource for target '%s' is running on host '%s', but there is no such TargetMount"
                                % (target, host)
                            )
                            active_mount = None
                    except ManagedHost.DoesNotExist:
                        log.warning("Resource location node '%s' does not match any Host" % (node_name))
                        active_mount = None

                job_scheduler_notify.notify(
                    target,
                    self.started_at,
                    {
                        "state": ["unmounted", "mounted"][active_mount is not None],
                        "active_mount_id": None if active_mount is None else active_mount.id,
                    },
                    ["mounted", "unmounted"],
                )
Example #9
    def run(self, kwargs):
        corosync_configuration = kwargs['corosync_configuration']

        self.invoke_agent_expect_result(corosync_configuration.host,
                                        "change_mcast_port",
                                        {'old_mcast_port': kwargs['old_mcast_port'],
                                         'new_mcast_port': kwargs['new_mcast_port']})

        job_scheduler_notify.notify(corosync_configuration,
                                    now(),
                                    {'mcast_port': kwargs['new_mcast_port']})
Example #10
    def update_client_mounts(self):
        # Client mount audit comes in via metrics due to the way the
        # ClientAudit is implemented.
        try:
            client_mounts = self.host_data["metrics"]["raw"][
                "lustre_client_mounts"]
        except KeyError:
            client_mounts = []

        # If lustre_client_mounts is None then nothing changed since the last update, so we can just return.
        # This is not the same as an empty list [], which means there are no mounts.
        if client_mounts is None:
            return

        expected_fs_mounts = LustreClientMount.objects.select_related(
            "filesystem").filter(host=self.host)
        actual_fs_mounts = [
            m["mountspec"].split(":/")[1] for m in client_mounts
        ]

        # Don't bother with the rest if there's nothing to do.
        if len(expected_fs_mounts) == 0 and len(actual_fs_mounts) == 0:
            return

        for expected_mount in expected_fs_mounts:
            if expected_mount.active and expected_mount.filesystem.name not in actual_fs_mounts:
                update = dict(state="unmounted", mountpoint=None)
                job_scheduler_notify.notify(expected_mount, self.started_at,
                                            update)
                log.info("updated mount %s on %s -> inactive" %
                         (expected_mount.mountpoint, self.host))

        for actual_mount in client_mounts:
            fsname = actual_mount["mountspec"].split(":/")[1]
            try:
                mount = [
                    m for m in expected_fs_mounts
                    if m.filesystem.name == fsname
                ][0]
                log.debug("mount: %s" % mount)
                if not mount.active:
                    update = dict(state="mounted",
                                  mountpoint=actual_mount["mountpoint"])
                    job_scheduler_notify.notify(mount, self.started_at, update)
                    log.info("updated mount %s on %s -> active" %
                             (actual_mount["mountpoint"], self.host))
            except IndexError:
                log.info("creating new mount %s on %s" %
                         (actual_mount["mountpoint"], self.host))
                filesystem = ManagedFilesystem.objects.get(name=fsname)
                JobSchedulerClient.create_client_mount(
                    self.host, filesystem, actual_mount["mountpoint"])
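The filesystem name is recovered from the client mountspec by splitting on ":/". Assuming the usual Lustre "mgsnode:/fsname" form, for example:

    >>> "10.0.0.1@tcp:/testfs".split(":/")[1]
    'testfs'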
Example #11
    def run(self, kwargs):
        corosync_configuration = kwargs["corosync_configuration"]

        self.invoke_agent_expect_result(
            corosync_configuration.host,
            "change_mcast_port",
            {
                "old_mcast_port": kwargs["old_mcast_port"],
                "new_mcast_port": kwargs["new_mcast_port"]
            },
        )

        job_scheduler_notify.notify(corosync_configuration, now(),
                                    {"mcast_port": kwargs["new_mcast_port"]})
Example #12
    def run(self, kwargs):
        corosync_configuration = kwargs["corosync_configuration"]

        config = self.invoke_agent_expect_result(corosync_configuration.host,
                                                 "get_corosync_autoconfig")

        # Select the dedicated line as ring0 to carry all the traffic by default;
        # this prevents congestion on the management network.
        ring0_name, ring0_config = next(
            (interface, config)
            for interface, config in config["interfaces"].items()
            if config["dedicated"])
        ring1_name, ring1_config = next(
            (interface, config)
            for interface, config in config["interfaces"].items()
            if not config["dedicated"])

        self.invoke_agent_expect_result(
            corosync_configuration.host,
            "configure_network",
            {
                "ring0_name": ring0_name,
                "ring1_name": ring1_name,
                "ring1_ipaddr": ring1_config["ipaddr"],
                "ring1_prefix": ring1_config["prefix"],
            },
        )

        self.invoke_agent_expect_result(
            corosync_configuration.host,
            "configure_corosync",
            {
                "ring0_name": ring0_name,
                "ring1_name": ring1_name,
                "old_mcast_port": corosync_configuration.mcast_port,
                "new_mcast_port": config["mcast_port"],
            },
        )

        job_scheduler_notify.notify(
            corosync_configuration,
            now(),
            {
                "mcast_port": config["mcast_port"],
                "network_interfaces": [ring0_name, ring1_name]
            },
        )
Example #13
    def run(self, kwargs):
        corosync_configuration = kwargs['corosync_configuration']

        self.invoke_agent_expect_result(
            corosync_configuration.host, "configure_corosync", {
                'ring0_name': kwargs['ring0_name'],
                'ring1_name': kwargs['ring1_name'],
                'old_mcast_port': kwargs['old_mcast_port'],
                'new_mcast_port': kwargs['new_mcast_port']
            })

        job_scheduler_notify.notify(
            corosync_configuration, now(), {
                'mcast_port': kwargs['new_mcast_port'],
                'network_interfaces': [kwargs['ring0_name'], kwargs['ring1_name']]
            })
Example #14
    def save(self, *args, **kwargs):
        self.full_clean()

        # Grab a copy of the outlet pre-save so that we can determine
        # which hosts need to have their fencing reconfigured.
        try:
            old_self = PowerControlDeviceOutlet.objects.get(pk=self.pk)
        except PowerControlDeviceOutlet.DoesNotExist:
            old_self = None

        super(PowerControlDeviceOutlet, self).save(*args, **kwargs)

        reconfigure = {"reconfigure_fencing": True}
        previous = old_self.host if old_self else None
        for host in self._hosts_for_fence_reconfiguration(self.host, previous):
            if host.pacemaker_configuration:
                job_scheduler_notify.notify(host.pacemaker_configuration,
                                            tznow(), reconfigure)
            else:
                job_log.debug("Skipping reconfiguration of non-server %s" %
                              host)
Example #15
    def update_packages(self, packages):
        if not packages:
            # Packages is allowed to be None (it means this was not the initial
            # message, or there was a problem talking to RPM or yum).
            return

        # An update is required if:
        #  * A package is installed on the storage server for which there is a more recent version
        #    available on the manager
        #  or
        #  * A package is available on the manager, and specified in the server's profile's list of
        #    packages, but is not installed on the storage server.

        # Update the package models
        needs_update = chroma_core.models.package.update(self.host, packages)

        # Check for any non-installed packages that should be installed
        for package in self.host.server_profile.serverprofilepackage_set.all():
            try:
                package_data = packages[package.bundle.bundle_name][
                    package.package_name]
            except KeyError:
                log.warning(
                    "Expected package %s/%s not found in report from %s" %
                    (package.bundle.bundle_name, package.package_name,
                     self.host))
                continue
            else:
                if not package_data['installed']:
                    log.info("Update available (not installed): %s/%s on %s" %
                             (package.bundle.bundle_name, package.package_name,
                              self.host))
                    needs_update = True
                    break

        log.info("update_packages(%s): updates=%s" % (self.host, needs_update))
        job_scheduler_notify.notify(self.host, self.started_at,
                                    {'needs_update': needs_update})
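The two conditions in the comment block above can be condensed into a standalone predicate. A sketch over simplified inputs (dicts mapping package name to a comparable version, which is an assumption rather than the real package model):

    def needs_update(installed, available, profile_required):
        # 1. An installed package has a newer version available on the manager.
        if any(available.get(name, installed[name]) > installed[name] for name in installed):
            return True
        # 2. A profile-required package is available but not installed at all.
        return any(name in available and name not in installed for name in profile_required)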
Example #16
    def run(self, kwargs):
        corosync_configuration = kwargs["corosync_configuration"]

        self.invoke_agent_expect_result(
            corosync_configuration.host,
            "configure_corosync",
            {
                "ring0_name": kwargs["ring0_name"],
                "ring1_name": kwargs["ring1_name"],
                "old_mcast_port": kwargs["old_mcast_port"],
                "new_mcast_port": kwargs["new_mcast_port"],
            },
        )

        job_scheduler_notify.notify(
            corosync_configuration,
            now(),
            {
                "mcast_port": kwargs["new_mcast_port"],
                "network_interfaces":
                [kwargs["ring0_name"], kwargs["ring1_name"]],
            },
        )
Example #17
    def run(self, kwargs):
        corosync_configuration = kwargs['corosync_configuration']

        config = self.invoke_agent_expect_result(corosync_configuration.host,
                                                 "get_corosync_autoconfig")

        ring0_name, ring0_config = next(
            (interface, config)
            for interface, config in config['interfaces'].items()
            if not config['dedicated'])
        ring1_name, ring1_config = next(
            (interface, config)
            for interface, config in config['interfaces'].items()
            if config['dedicated'])

        self.invoke_agent_expect_result(
            corosync_configuration.host, "configure_network", {
                'ring0_name': ring0_name,
                'ring1_name': ring1_name,
                'ring1_ipaddr': ring1_config['ipaddr'],
                'ring1_prefix': ring1_config['prefix']
            })

        self.invoke_agent_expect_result(
            corosync_configuration.host, "configure_corosync", {
                'ring0_name': ring0_name,
                'ring1_name': ring1_name,
                'old_mcast_port': corosync_configuration.mcast_port,
                'new_mcast_port': config['mcast_port']
            })

        job_scheduler_notify.notify(
            corosync_configuration, now(), {
                'mcast_port': config['mcast_port'],
                'network_interfaces': [ring0_name, ring1_name]
            })
Example #18
    def run(self, kwargs):
        corosync_configuration = kwargs["corosync_configuration"]

        # detect local interfaces for use in corosync 'rings', network level configuration only
        config = self.invoke_agent_expect_result(corosync_configuration.host,
                                                 "get_corosync_autoconfig")

        # Select the dedicated line as ring0 to carry all the traffic by default;
        # this prevents congestion on the management network.
        ring0_name, ring0_config = next(
            (interface, config)
            for interface, config in config["interfaces"].items()
            if config["dedicated"])
        ring1_name, ring1_config = next(
            (interface, config)
            for interface, config in config["interfaces"].items()
            if not config["dedicated"])

        # apply the configurations of corosync 'rings', network level configuration only
        self.invoke_agent_expect_result(
            corosync_configuration.host,
            "configure_network",
            {
                "ring0_name": ring0_name,
                "ring1_name": ring1_name,
                "ring1_ipaddr": ring1_config["ipaddr"],
                "ring1_prefix": ring1_config["prefix"],
            },
        )

        logging.debug("Node %s returned corosync configuration %s" %
                      (corosync_configuration.host.fqdn, config))

        # Serialize across nodes with the same mcast_port so that we ensure commands
        # are executed in the same order.
        with peer_mcast_ports_configuration_lock[config["mcast_port"]]:
            from chroma_core.models import ManagedHost

            corosync_peers = self._corosync_peers(
                corosync_configuration.host.fqdn, config["mcast_port"])

            logging.debug(
                "Node %s has corosync peers %s" %
                (corosync_configuration.host.fqdn, ",".join(corosync_peers)))

            # If we are adding a node then we act on a host that is already part of the cluster;
            # otherwise we act on the host being added, because it is the first node in the cluster.
            # TODO: Harden this up a little so it tries to pick a peer that is actively communicating; that might be
            # useful when adding a new host in place of an old one. Also, if we are ignoring a peer, should we destroy
            # that peer's corosync configuration?
            actioning_host = corosync_configuration.host
            if corosync_peers:
                peer = ManagedHost.objects.get(fqdn=corosync_peers[0])
                if peer.state in ["managed", "packages_installed"]:
                    actioning_host = peer
                else:
                    logging.warning(
                        "peer corosync config ignored as host state == %s (not packages_installed or "
                        "managed)" % peer.state)

            logging.debug(
                "actioning host for %s corosync configuration stage 2: %s" %
                (corosync_configuration.host.fqdn, actioning_host.fqdn))

            # Stage 1 configures pcsd on the host being added, sets the password, enables and starts it etc.
            self.invoke_agent_expect_result(
                corosync_configuration.host,
                "configure_corosync2_stage_1",
                {
                    "mcast_port": config["mcast_port"],
                    "pcs_password": self._pcs_password,
                    "fqdn": corosync_configuration.host.fqdn,
                },
            )

            corosync_configuration.host.corosync_ring0 = ring0_config["ipaddr"]
            corosync_configuration.host.save(update_fields=["corosync_ring0"])

            # Stage 2 configures the cluster either by creating it or adding a node to it.
            self.invoke_agent_expect_result(
                actioning_host,
                "configure_corosync2_stage_2",
                {
                    "ring0_name":
                    ring0_name,
                    "ring1_name":
                    ring1_name,
                    "new_node_fqdn":
                    corosync_configuration.host.corosync_ring0,
                    "mcast_port":
                    config["mcast_port"],
                    "pcs_password":
                    self._pcs_password,
                    "create_cluster":
                    actioning_host == corosync_configuration.host,
                },
            )

            logging.debug("Node %s corosync configuration complete" %
                          corosync_configuration.host.fqdn)

        job_scheduler_notify.notify(
            corosync_configuration,
            now(),
            {
                "mcast_port": config["mcast_port"],
                "network_interfaces": [ring0_name, ring1_name]
            },
        )
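peer_mcast_ports_configuration_lock is not shown in this snippet. A plausible minimal implementation of the per-port lock map (an assumption; the real object may differ):

    import threading
    from collections import defaultdict

    # One lock per mcast_port, created on first use, so that configuration
    # commands for nodes sharing a port are serialized across jobs.
    peer_mcast_ports_configuration_lock = defaultdict(threading.RLock)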
Example #19
    def on_data(self, fqdn, body):
        """Process all incoming messages from the Corosync agent plugin.

        Request to have the status changed for an instance. If the current
        state determines that a host is offline, then raise that alert.

        Old messages should not be processed.

        The datetime is the node's local time converted to UTC, in the
        standard ISO string format.
        """

        try:
            host = ManagedHost.objects.get(fqdn=fqdn)
        except ManagedHost.DoesNotExist:
            # This can happen when we are deleting a host and the queues mean a message is still
            # sitting and waiting to be processed. Something has spoken to us that we know nothing
            # about, so really we can't do anything other than drop it.
            log.warning(
                "Corosync message from unknown host %s, the message was dropped."
                % fqdn)
            return

        # If corosync is not configured yet, or we don't actually have corosync, then ignore the input.
        if not host.corosync_configuration or host.corosync_configuration.state == 'unconfigured':
            return

        if body.get('state'):
            job_scheduler_notify.notify(host.corosync_configuration,
                                        timezone.now(),
                                        {'state': body['state']['corosync']})

            job_scheduler_notify.notify(host.pacemaker_configuration,
                                        timezone.now(),
                                        {'state': body['state']['pacemaker']})

            if body['state']['corosync'] == 'stopped':
                return
        else:
            if host.corosync_configuration.state != 'started':
                return

        if body.get('crm_info'):
            nodes = body['crm_info']['nodes']
            dt = body['crm_info']['datetime']

            options = body['crm_info'].get('options',
                                           {'stonith_enabled': None})
            stonith_enabled = options['stonith_enabled']

            try:
                dt = IMLDateTime.parse(dt)
            except ValueError:
                if dt != '':
                    log.warning(
                        "Invalid date or tz string from corosync plugin: %s" %
                        dt)
                    raise

            def is_new(peer_node_identifier):
                return (peer_node_identifier not in self._host_status or
                        self._host_status[peer_node_identifier].datetime < dt)

            peers_str = "; ".join([
                "%s: online=%s, new=%s" %
                (peer_node_identifier, data['online'],
                 is_new(peer_node_identifier))
                for peer_node_identifier, data in nodes.items()
            ])
            log.debug("Incoming peer report from %s:  %s" % (fqdn, peers_str))

            # NB: This will ignore any unknown peers in the report.
            cluster_nodes = ManagedHost.objects.select_related(
                'ha_cluster_peers').filter(
                    Q(nodename__in=nodes.keys()) | Q(fqdn__in=nodes.keys()))

            unknown_nodes = set(nodes.keys()) - set([
                h.nodename for h in cluster_nodes
            ]) - set([h.fqdn for h in cluster_nodes])

            # Leaving this out for now, because it raises issues caused by limitations in the simulator and
            # the test system as a whole. It is difficult to know whether the alerts will be raised; it all
            # depends on past state.
            # CorosyncUnknownPeersAlert.notify(host.corosync_configuration, unknown_nodes != set())
            if unknown_nodes:
                log.warning("Unknown nodes in report from %s: %s" %
                            (fqdn, unknown_nodes))

            if stonith_enabled is not None:
                StonithNotEnabledAlert.notify(host.corosync_configuration,
                                              stonith_enabled is False)

            CorosyncNoPeersAlert.notify(host.corosync_configuration,
                                        len(cluster_nodes) == 1)
            # CorosyncToManyPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) > 2)

            #  Consider all nodes in the peer group for this reporting agent
            for host in cluster_nodes:
                try:
                    data = nodes[host.nodename]
                    node_identifier = host.nodename
                except KeyError:
                    data = nodes[host.fqdn]
                    node_identifier = host.fqdn

                cluster_peer_keys = sorted(
                    [node.pk for node in cluster_nodes if node is not host])

                if is_new(node_identifier) and host.corosync_configuration:
                    host_reported_online = data['online'] == 'true'

                    log.debug("Corosync processing "
                              "peer %s of %s " % (host.fqdn, fqdn))

                    #  Raise an Alert - system suppresses duplicates
                    log.debug("Alert notify on %s: active=%s" %
                              (host, not host_reported_online))
                    HostOfflineAlert.notify(host, not host_reported_online)
                    if not host_reported_online:
                        log.debug("Host %s offline" % host.fqdn)
                    else:
                        log.debug("Host %s online" % host.fqdn)

                    #  Attempt to save the state.
                    if host.corosync_configuration.corosync_reported_up != host_reported_online:
                        job_scheduler_notify.notify(
                            host.corosync_configuration, timezone.now(),
                            {'corosync_reported_up': host_reported_online})

                    peer_host_peer_keys = sorted(
                        [h.pk for h in host.ha_cluster_peers.all()])
                    if peer_host_peer_keys != cluster_peer_keys:
                        job_scheduler_notify.notify(
                            host, timezone.now(),
                            {'ha_cluster_peers': cluster_peer_keys})

                    #  Keep internal track of the hosts state.
                    self._host_status[node_identifier] = self.HostStatus(
                        status=host_reported_online, datetime=dt)
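self.HostStatus and self._host_status are attributes of the surrounding service class. Their assumed shapes, inferred from the usage above (illustrative only):

    from collections import namedtuple

    # Per-node record of the last reported online state and the report
    # timestamp that is_new() compares against to drop stale reports.
    HostStatus = namedtuple("HostStatus", ["status", "datetime"])
    _host_status = {}  # keyed by nodename or fqdn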
Example #20
    def run(self, kwargs):
        corosync_configuration = kwargs['corosync_configuration']

        # detect local interfaces for use in corosync 'rings', network level configuration only
        config = self.invoke_agent_expect_result(corosync_configuration.host, "get_corosync_autoconfig")

        ring0_name, ring0_config = next((interface, config) for interface, config in
                                        config['interfaces'].items() if not config['dedicated'])
        ring1_name, ring1_config = next((interface, config) for interface, config in
                                        config['interfaces'].items() if config['dedicated'])

        # apply the configurations of corosync 'rings', network level configuration only
        self.invoke_agent_expect_result(corosync_configuration.host,
                                        "configure_network",
                                        {'ring0_name': ring0_name,
                                         'ring1_name': ring1_name,
                                         'ring1_ipaddr': ring1_config['ipaddr'],
                                         'ring1_prefix': ring1_config['prefix']})

        logging.debug("Node %s returned corosync configuration %s" % (corosync_configuration.host.fqdn,
                                                                      config))

        # Serialize across nodes with the same mcast_port so that we ensure commands
        # are executed in the same order.
        with peer_mcast_ports_configuration_lock[config['mcast_port']]:
            from chroma_core.models import ManagedHost

            corosync_peers = self._corosync_peers(corosync_configuration.host.fqdn, config['mcast_port'])

            logging.debug("Node %s has corosync peers %s" % (corosync_configuration.host.fqdn,
                                                             ",".join(corosync_peers)))

            # If we are adding a node then we act on a host that is already part of the cluster;
            # otherwise we act on the host being added, because it is the first node in the cluster.
            # TODO: Harden this up a little so it tries to pick a peer that is actively communicating; that might be
            # useful when adding a new host in place of an old one. Also, if we are ignoring a peer, should we destroy
            # that peer's corosync configuration?
            actioning_host = corosync_configuration.host
            if corosync_peers:
                peer = ManagedHost.objects.get(fqdn=corosync_peers[0])
                if peer.state in ['managed', 'packages_installed']:
                    actioning_host = peer
                else:
                    logging.warning('peer corosync config ignored as host state == %s (not packages_installed or '
                                    'managed)' % peer.state)

            logging.debug('actioning host for %s corosync configuration stage 2: %s' % (corosync_configuration.host.fqdn,
                                                                                        actioning_host.fqdn))

            # Stage 1 configures pcsd on the host being added, sets the password, enables and starts it etc.
            self.invoke_agent_expect_result(corosync_configuration.host,
                                            "configure_corosync2_stage_1",
                                            {'mcast_port': config['mcast_port'],
                                             'pcs_password': self._pcs_password})

            # Stage 2 configures the cluster either by creating it or adding a node to it.
            self.invoke_agent_expect_result(actioning_host,
                                            "configure_corosync2_stage_2",
                                            {'ring0_name': ring0_name,
                                             'ring1_name': ring1_name,
                                             'new_node_fqdn': corosync_configuration.host.fqdn,
                                             'mcast_port': config['mcast_port'],
                                             'pcs_password': self._pcs_password,
                                             'create_cluster': actioning_host == corosync_configuration.host})

            logging.debug("Node %s corosync configuration complete" % corosync_configuration.host.fqdn)

        job_scheduler_notify.notify(corosync_configuration,
                                    now(),
                                    {'mcast_port': config['mcast_port'],
                                     'network_interfaces': [ring0_name, ring1_name]})
Example #21
    def _call(cls, host, cmd, args):
        cls.calls.append((cmd, args))
        cls.host_calls[host.fqdn].append((cmd, args))

        if not cls.succeed:
            cls._fail(host.fqdn)

        if (cmd, args) in cls.fail_commands:
            cls._fail(host.fqdn)

        mock_server = cls.mock_servers[host.address]

        log.info("invoke_agent %s %s %s" % (host, cmd, args))

        # This isn't really accurate because lnet is scanned asynchronously, but it is as close as we can get today.
        # FIXME: Also, I now think this is writing to the wrong thing and should be changing the mock_server entries
        # to lnet_up; the mock_server really needs an lnet state, rather than relying on nids being present.
        if cmd == "load_lnet":
            synthetic_lnet_configuration(host, mock_server["nids"])
            return
        elif cmd == "device_plugin":
            # Only returns nid info today.
            return create_synthetic_device_info(host, mock_server,
                                                args["plugin"])
        elif cmd == "format_target":
            inode_size = None
            if "mkfsoptions" in args:
                inode_arg = re.search("-I (\d+)", args["mkfsoptions"])
                if inode_arg:
                    inode_size = int(inode_arg.group(1).__str__())

            if inode_size is None:
                # A 'foo' value
                inode_size = 777

            return {
                "uuid": uuid.uuid1().__str__(),
                "inode_count": 666,
                "inode_size": inode_size,
                "filesystem_type": "ext4",
            }
        elif cmd == "stop_target":
            ha_label = args["ha_label"]
            target = ManagedTarget.objects.get(ha_label=ha_label)
            return agent_result_ok
        elif cmd == "start_target":
            ha_label = args["ha_label"]
            target = ManagedTarget.objects.get(ha_label=ha_label)
            return agent_result(target.primary_host.nodename)
        elif cmd == "register_target":
            # Assume mount paths are "/mnt/testfs-OST0001" style
            mount_point = args["mount_point"]
            label = re.search("/mnt/([^\s]+)", mount_point).group(1)
            return {"label": label}
        elif cmd == "detect_scan":
            return mock_server["detect-scan"]
        elif cmd == "install_packages":
            return agent_result([])
        elif cmd == "register_server":
            api_client = TestApiClient()
            old_is_authenticated = CsrfAuthentication.is_authenticated
            try:
                CsrfAuthentication.is_authenticated = mock.Mock(
                    return_value=True)
                api_client.client.login(username="******",
                                        password="******")
                fqdn = cls.mock_servers[host]["fqdn"]

                response = api_client.post(
                    args["url"] + "register/%s/" % args["secret"],
                    data={
                        "address": host,
                        "fqdn": fqdn,
                        "nodename": cls.mock_servers[host]["nodename"],
                        "capabilities": ["manage_targets"],
                        "version": cls.version,
                        "csr": helper.generate_csr(fqdn),
                    },
                )
                assert response.status_code == 201
                registration_data = Serializer().deserialize(
                    response.content, format=response["Content-Type"])
                print("MockAgent.invoke returning %s" % registration_data)
                return registration_data
            finally:
                CsrfAuthentication.is_authenticated = old_is_authenticated
        elif cmd == "kernel_status":
            return {
                "running": "fake_kernel-0.1",
                "required": "fake_kernel-0.1",
                "available": ["fake_kernel-0.1"]
            }
        elif cmd == "selinux_status":
            return {"status": "Disabled"}
        elif cmd == "reboot_server":
            now = IMLDateTime.utcnow()
            log.info("rebooting %s; updating boot_time to %s" % (host, now))
            job_scheduler_notify.notify(host, now, {"boot_time": now})
        elif cmd == "which zfs":
            return 1
        elif "import platform;" in cmd:
            return "0"
        elif "socket.gethostbyname(socket.gethostname())" in cmd:
            if not mock_server["tests"]["hostname_valid"]:
                return "127.0.0.1"
            else:
                return mock_server["address"]
        elif "print os.uname()[1]" in cmd:
            return "%s\n%s" % (mock_server["nodename"], mock_server["fqdn"])
        elif "socket.getfqdn()" in cmd:
            return mock_server["fqdn"]
        elif "ping" in cmd:
            result = (0 if mock_server["tests"]["reverse_resolve"] else
                      2) + (0 if mock_server["tests"]["reverse_ping"] else 1)
            return result
        elif "ElectricFence" in cmd:
            return 0 if mock_server["tests"]["yum_can_update"] else 1
        elif "openssl version -a" in cmd:
            return 0 if mock_server["tests"]["openssl"] else 1
        elif "curl -k https" in cmd:
            return json.dumps({"host_id": host.id, "command_id": 0})
        elif cmd in [
                "configure_pacemaker",
                "unconfigure_pacemaker",
                "configure_target_store",
                "unconfigure_target_store",
                "deregister_server",
                "restart_agent",
                "shutdown_server",
                "host_corosync_config",
                "check_block_device",
                "set_conf_param",
                "purge_configuration",
        ]:
            return None
        elif cmd in [
                "configure_target_ha",
                "unconfigure_target_ha",
                "start_lnet",
                "stop_lnet",
                "unload_lnet",
                "unconfigure_lnet",
                "configure_corosync",
                "unconfigure_corosync",
                "start_corosync",
                "stop_corosync",
                "start_pacemaker",
                "stop_pacemaker",
                "configure_ntp",
                "unconfigure_ntp",
                "import_target",
                "export_target",
                "set_profile",
                "update_profile",
                "failover_target",
                "failback_target",
                "configure_network",
                "open_firewall",
                "close_firewall",
        ]:
            return agent_result_ok
        elif cmd == "get_corosync_autoconfig":
            return agent_result({
                "interfaces": {
                    "eth0": {
                        "dedicated": False,
                        "ipaddr": "192.168.0.1",
                        "prefix": 24
                    },
                    "eth1": {
                        "dedicated": True,
                        "ipaddr": "10.10.0.01",
                        "prefix": 24
                    },
                },
                "mcast_port": "666",
            })
        else:
            assert False, (
                "The %s command is not in the known list for MockAgentRpc. Please add it; then, when people "
                "modify it, a simple text search will let them know to change it here as well."
                % cmd)
Example #22
    def on_success(self):
        from chroma_core.models import HaCluster

        for h in HaCluster.host_peers(self.corosync_configuration.host):
            job_scheduler_notify.notify(h.corosync_configuration, now(),
                                        {"mcast_port": self.mcast_port})
Example #23
    def update_packages(self, package_report):
        if not package_report:
            # Packages is allowed to be None (it means this was not the initial
            # message, or there was a problem talking to RPM or yum).
            return

        # An update is required if:
        #  * A package is installed on the storage server for which there is a more recent version
        #    available on the manager
        #  or
        #  * A package is available on the manager, and specified in the server's profile's list of
        #    packages, but is not installed on the storage server.

        def _version_info_list(package_data):
            return [VersionInfo(*package) for package in package_data]

        def _updates_available(installed_versions, available_versions):
            # versions are of form (EPOCH, VERSION, RELEASE, ARCH)

            # Map of arch to highest installed version
            max_installed_version = {}

            for installed_info in installed_versions:
                max_inst = max_installed_version.get(installed_info.arch, None)
                if max_inst is None or installed_info > max_inst:
                    max_installed_version[installed_info.arch] = installed_info

            for available_info in available_versions:
                max_inst = max_installed_version.get(available_info.arch, None)
                if max_inst is not None and available_info > max_inst:
                    log.debug("Update available: %s > %s" %
                              (available_info, max_inst))
                    return True

            return False

        updates = False

        repos = package_report.keys()
        for package in self.host.server_profile.serverprofilepackage_set.all():
            package_data = {}
            for repo in repos:
                try:
                    package_data = package_report[repo][package.package_name]
                except KeyError:
                    continue
                break

            if not package_data:
                log.warning("Required Package %s not available for %s" %
                            (package.package_name, self.host))
                continue

            if not package_data["installed"]:
                log.info("Update available (not installed): %s on %s" %
                         (package.package_name, self.host))
                updates = True
                break

            if _updates_available(
                    _version_info_list(package_data["installed"]),
                    _version_info_list(package_data["available"])):
                log.info("Update needed: %s on %s" %
                         (package.package_name, self.host))
                updates = True
                break

        log.info("update_packages(%s): updates=%s" % (self.host, updates))
        job_scheduler_notify.notify(self.host, self.started_at,
                                    {"needs_update": updates})
Example #24
    def _call(cls, host, cmd, args):
        cls.calls.append((cmd, args))
        cls.host_calls[host].append((cmd, args))

        if not cls.succeed:
            cls._fail(host.fqdn)

        if (cmd, args) in cls.fail_commands:
            cls._fail(host.fqdn)

        mock_server = cls.mock_servers[host.address]

        log.info("invoke_agent %s %s %s" % (host, cmd, args))

        # This isn't really accurate because lnet is scanned asynchronously, but it is as close as we can get today.
        # FIXME: Also, I now think this is writing to the wrong thing and should be changing the mock_server entries
        # to lnet_up; the mock_server really needs an lnet state, rather than relying on nids being present.
        if cmd == "load_lnet":
            synthetic_lnet_configuration(host, mock_server['nids'])
            return
        elif cmd == "device_plugin":
            # Only returns nid info today.
            return create_synthetic_device_info(host, mock_server, args['plugin'])
        elif cmd == 'format_target':
            inode_size = None
            if 'mkfsoptions' in args:
                inode_arg = re.search("-I (\d+)", args['mkfsoptions'])
                if inode_arg:
                    inode_size = int(inode_arg.group(1).__str__())

            if inode_size is None:
                # A 'foo' value
                inode_size = 777

            return {'uuid': str(uuid.uuid1()),
                    'inode_count': 666,
                    'inode_size': inode_size,
                    'filesystem_type': 'ext4'}
        elif cmd == 'stop_target':
            ha_label = args['ha_label']
            target = ManagedTarget.objects.get(ha_label=ha_label)
            return agent_result_ok
        elif cmd == 'start_target':
            ha_label = args['ha_label']
            target = ManagedTarget.objects.get(ha_label=ha_label)
            return agent_result(target.primary_host.nodename)
        elif cmd == 'register_target':
            # Assume mount paths are "/mnt/testfs-OST0001" style
            mount_point = args['mount_point']
            label = re.search("/mnt/([^\s]+)", mount_point).group(1)
            return {'label': label}
        elif cmd == 'detect_scan':
            return mock_server['detect-scan']
        elif cmd == 'install_packages':
            return agent_result([])
        elif cmd == 'register_server':
            api_client = TestApiClient()
            old_is_authenticated = CsrfAuthentication.is_authenticated
            try:
                CsrfAuthentication.is_authenticated = mock.Mock(return_value=True)
                api_client.client.login(username='******', password='******')
                fqdn = cls.mock_servers[host]['fqdn']

                response = api_client.post(args['url'] + "register/%s/" % args['secret'], data = {
                    'address': host,
                    'fqdn': fqdn,
                    'nodename': cls.mock_servers[host]['nodename'],
                    'capabilities': ['manage_targets'],
                    'version': cls.version,
                    'csr': helper.generate_csr(fqdn)
                })
                assert response.status_code == 201
                registration_data = Serializer().deserialize(response.content, format = response['Content-Type'])
                print "MockAgent.invoke returning %s" % registration_data
                return registration_data
            finally:
                CsrfAuthentication.is_authenticated = old_is_authenticated
        elif cmd == 'kernel_status':
            return {
                'running': 'fake_kernel-0.1',
                'required': 'fake_kernel-0.1',
                'available': ['fake_kernel-0.1']
            }
        elif cmd == 'reboot_server':
            now = IMLDateTime.utcnow()
            log.info("rebooting %s; updating boot_time to %s" % (host, now))
            job_scheduler_notify.notify(host, now, {'boot_time': now})
        elif 'socket.gethostbyname(socket.gethostname())' in cmd:
            if not mock_server['tests']['hostname_valid']:
                return '127.0.0.1'
            else:
                return mock_server['address']
        elif 'print os.uname()[1]' in cmd:
            return '%s\n%s' % (mock_server['nodename'], mock_server['fqdn'])
        elif 'socket.getfqdn()' in cmd:
            return mock_server['fqdn']
        elif 'ping' in cmd:
            result = ((0 if mock_server['tests']['reverse_resolve'] else 2) +
                      (0 if mock_server['tests']['reverse_ping'] else 1))
            return result
        elif 'python-fedora-django' in cmd:
            return 0 if mock_server['tests']['yum_valid_repos'] else 1
        elif 'ElectricFence' in cmd:
            return 0 if mock_server['tests']['yum_can_update'] else 1
        elif 'curl -k https' in cmd:
            return json.dumps({'host_id': host.id,
                               'command_id': 0})
        elif cmd in ['configure_pacemaker', 'unconfigure_pacemaker',
                     'configure_rsyslog', 'unconfigure_rsyslog',
                     'configure_target_store', 'unconfigure_target_store',
                     'deregister_server', 'restart_agent',
                     'shutdown_server', 'host_corosync_config', 'check_block_device',
                     'set_conf_param', 'purge_configuration']:
            return None
        elif cmd in ['configure_target_ha', 'unconfigure_target_ha',
                     'start_lnet', 'stop_lnet', 'unload_lnet', 'unconfigure_lnet',
                     'configure_corosync', 'unconfigure_corosync',
                     'start_corosync', 'stop_corosync',
                     'start_pacemaker', 'stop_pacemaker',
                     'configure_ntp', 'unconfigure_ntp',
                     'import_target', 'export_target',
                     'set_profile', 'update_profile',
                     'failover_target', 'failback_target',
                     'configure_network', 'open_firewall', 'close_firewall']:
            return agent_result_ok
        elif cmd == 'get_corosync_autoconfig':
            return agent_result({'interfaces': {'eth0': {'dedicated': False,
                                                         'ipaddr': '192.168.0.1',
                                                         'prefix': 24},
                                                'eth1': {'dedicated': True,
                                                         'ipaddr': '10.10.0.01',
                                                         'prefix': 24}},
                                 'mcast_port': '666'})
        else:
            assert False, "The %s command is not in the known list for MockAgentRpc. Please add it then when people modify it a simple text search will let them know to change it here as well." % cmd