    def on_data(self, fqdn, body):
        """Process all incoming messages from the Corosync agent plugin

        Request to have the status changed for an instance.  If the current
        state determines that a host is offline, then raise that alert.

        old messages should not be processed.

        datetime is in UTC of the node's localtime in the standard
        ISO string format
        """

        try:
            host = ManagedHost.objects.get(fqdn=fqdn)
        except ManagedHost.DoesNotExist:
            # This can happen while a host is being deleted: a message may still be
            # sitting in the queue waiting to be processed.  Since we know nothing about
            # the sender, all we can do is drop the message.
            log.warning(
                "Corosync message from unknown host %s, the message was dropped."
                % fqdn)
            return

        # If corosync is not configured yet, or we don't actually have corosync - then ignore the input
        if (not host.corosync_configuration or
                host.corosync_configuration.state == 'unconfigured'):
            return

        if body.get('state'):
            job_scheduler_notify.notify(host.corosync_configuration,
                                        timezone.now(),
                                        {'state': body['state']['corosync']})

            job_scheduler_notify.notify(host.pacemaker_configuration,
                                        timezone.now(),
                                        {'state': body['state']['pacemaker']})

            if body['state']['corosync'] == 'stopped':
                return
        else:
            if host.corosync_configuration.state != 'started':
                return

        if body.get('crm_info'):
            nodes = body['crm_info']['nodes']
            dt = body['crm_info']['datetime']

            options = body['crm_info'].get('options',
                                           {'stonith_enabled': None})
            stonith_enabled = options['stonith_enabled']
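            # Note: stonith_enabled stays None when the agent did not report cluster
            # options, in which case the StonithNotEnabledAlert check further below is
            # skipped.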

            try:
                dt = IMLDateTime.parse(dt)
            except ValueError:
                if dt != '':
                    log.warning(
                        "Invalid date or tz string from corosync plugin: %s" %
                        dt)
                    raise

            def is_new(peer_node_identifier):
                return (peer_node_identifier not in self._host_status or
                        self._host_status[peer_node_identifier].datetime < dt)

            peers_str = "; ".join([
                "%s: online=%s, new=%s" %
                (peer_node_identifier, data['online'],
                 is_new(peer_node_identifier))
                for peer_node_identifier, data in nodes.items()
            ])
            log.debug("Incoming peer report from %s:  %s" % (fqdn, peers_str))

            # NB: This will ignore any unknown peers in the report.
            cluster_nodes = ManagedHost.objects.select_related(
                'ha_cluster_peers').filter(
                    Q(nodename__in=nodes.keys()) | Q(fqdn__in=nodes.keys()))

            unknown_nodes = set(nodes.keys()) - set([
                h.nodename for h in cluster_nodes
            ]) - set([h.fqdn for h in cluster_nodes])

            # Left disabled for now: limitations in the simulator and the test system as
            # a whole mean it is difficult to know whether these alerts will be raised,
            # since that depends entirely on prior state.
            # CorosyncUnknownPeersAlert.notify(host.corosync_configuration, unknown_nodes != set())
            if unknown_nodes:
                log.warning("Unknown nodes in report from %s: %s" %
                            (fqdn, unknown_nodes))

            if stonith_enabled is not None:
                StonithNotEnabledAlert.notify(host.corosync_configuration,
                                              stonith_enabled is False)

            CorosyncNoPeersAlert.notify(host.corosync_configuration,
                                        len(cluster_nodes) == 1)
            # CorosyncToManyPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) > 2)

            #  Consider all nodes in the peer group for this reporting agent
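            # Note: `host` is rebound to each peer below, shadowing the reporting host
            # looked up from `fqdn` above.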
            for host in cluster_nodes:
                try:
                    data = nodes[host.nodename]
                    node_identifier = host.nodename
                except KeyError:
                    data = nodes[host.fqdn]
                    node_identifier = host.fqdn

                cluster_peer_keys = sorted(
                    [node.pk for node in cluster_nodes if node is not host])

                if is_new(node_identifier) and host.corosync_configuration:
                    host_reported_online = data['online'] == 'true'

                    log.debug("Corosync processing "
                              "peer %s of %s " % (host.fqdn, fqdn))

                    #  Raise an Alert - system suppresses duplicates
                    log.debug("Alert notify on %s: active=%s" %
                              (host, not host_reported_online))
                    HostOfflineAlert.notify(host, not host_reported_online)
                    if not host_reported_online:
                        log.debug("Host %s offline" % host.fqdn)
                    else:
                        log.debug("Host %s online" % host.fqdn)

                    #  Attempt to save the state.
                    if host.corosync_configuration.corosync_reported_up != host_reported_online:
                        job_scheduler_notify.notify(
                            host.corosync_configuration, timezone.now(),
                            {'corosync_reported_up': host_reported_online})

                    peer_host_peer_keys = sorted(
                        [h.pk for h in host.ha_cluster_peers.all()])
                    if peer_host_peer_keys != cluster_peer_keys:
                        job_scheduler_notify.notify(
                            host, timezone.now(),
                            {'ha_cluster_peers': cluster_peer_keys})

                    #  Keep internal track of the host's state.
                    self._host_status[node_identifier] = self.HostStatus(
                        status=host_reported_online, datetime=dt)
Example #2
    def test_removals(self):
        """Test that after objects are removed all GETs still work

        The idea is to go through an add hosts, create FS, remove FS, remove hosts
        cycle and then do a spider of the API to ensure that there aren't any
        exceptions rendering things (e.g. due to trying to dereference removed
        things incorrectly)"""

        host = synthetic_host('myserver')
        self.create_simple_filesystem(host)

        # Create a command/job/step result referencing the host
        command = Command.objects.create(message="test command",
                                         complete=True,
                                         errored=True)
        job = StopLNetJob.objects.create(
            lnet_configuration=host.lnet_configuration,
            state='complete',
            errored=True)
        command.jobs.add(job)
        step_klass, args = job.get_steps()[0]
        StepResult.objects.create(job=job,
                                  backtrace="an error",
                                  step_klass=step_klass,
                                  args=args,
                                  step_index=0,
                                  step_count=1,
                                  state='failed')

        # There will now be a CommandErroredAlert because the command above failed.
        alerts = self.deserialize(
            self.api_client.get("/api/alert/"))['objects']
        self.assertEqual(len(alerts), 1)
        self.assertEqual(alerts[0]['alert_type'], 'CommandErroredAlert')

        # Now create an alert/event referencing the host
        HostOfflineAlert.notify(host, True)
        alerts = self.deserialize(
            self.api_client.get("/api/alert/", data={'active':
                                                     True}))['objects']
        self.assertEqual(len(alerts), 1)
        self.assertEqual(alerts[0]['alert_type'], 'HostOfflineAlert')

        # Double check that there are 2 alerts in total.
        alerts = self.deserialize(
            self.api_client.get("/api/alert/"))['objects']
        self.assertEqual(len(alerts), 2)

        # Cause JobScheduler() to delete the objects, check the objects are gone in the API
        # and the API can still be spidered cleanly
        job = ForceRemoveHostJob(host=host)
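        # Run each of the job's steps directly rather than via the JobScheduler (the
        # None arguments presumably stand in for step parameters that are not needed
        # when running synchronously in this test).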
        for step_klass, args in job.get_steps():
            step_klass(job, args, None, None, None).run(args)

        # Check everything is gone
        self.assertEqual(ManagedTarget.objects.count(), 0)
        self.assertEqual(ManagedHost.objects.count(), 0)
        self.assertEqual(Volume.objects.count(), 0)
        self.assertEqual(VolumeNode.objects.count(), 0)
        self.assertListEqual(
            self.deserialize(
                self.api_client.get("/api/alert/?active=true"))['objects'], [])
        self.assertListEqual(
            self.deserialize(self.api_client.get("/api/volume/"))['objects'],
            [])
        self.assertListEqual(
            self.deserialize(
                self.api_client.get("/api/volume_node/"))['objects'], [])
        self.assertListEqual(
            self.deserialize(self.api_client.get("/api/target/"))['objects'],
            [])
        self.assertListEqual(
            self.deserialize(self.api_client.get("/api/host/"))['objects'], [])
        self.assertListEqual(
            self.deserialize(
                self.api_client.get("/api/filesystem/"))['objects'], [])

        # Check resources still render without exceptions
        self.spider_api()