Example #1
    def on_message(self, message):
        try:
            # Deserialize any datetimes which were serialized for JSON
            deserialized_update_attrs = {}
            model_klass = ContentType.objects.get_by_natural_key(*message["instance_natural_key"]).model_class()
            for attr, value in message["update_attrs"].items():
                try:
                    field = [f for f in model_klass._meta.fields if f.name == attr][0]
                except IndexError:
                    # e.g. _id names, they aren't datetimes so ignore them
                    deserialized_update_attrs[attr] = value
                else:
                    if isinstance(field, DateTimeField):
                        deserialized_update_attrs[attr] = IMLDateTime.parse(value)
                    else:
                        deserialized_update_attrs[attr] = value

            log.debug("on_message: %s %s" % (message, deserialized_update_attrs))

            self._job_scheduler.notify(
                message["instance_natural_key"],
                message["instance_id"],
                message["time"],
                deserialized_update_attrs,
                message["from_states"],
            )
        except Exception:
            # Log bad messages and continue, swallow the exception to avoid
            # bringing down the whole service
            log.warning("on_message: bad message: %s" % traceback.format_exc())
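The field lookup above depends on Django model introspection. The same idea can be sketched without Django: a minimal stand-in, assuming a known set of datetime field names and using datetime.fromisoformat in place of IMLDateTime.parse (all names below are hypothetical):

from datetime import datetime

# Hypothetical stand-in for the DateTimeField introspection above.
DATETIME_FIELDS = {"state_modified_at", "boot_time"}

def deserialize_update_attrs(update_attrs):
    # Parse ISO strings back into aware datetimes; pass everything else through.
    out = {}
    for attr, value in update_attrs.items():
        if attr in DATETIME_FIELDS:
            out[attr] = datetime.fromisoformat(value)  # stands in for IMLDateTime.parse
        else:
            out[attr] = value
    return out

print(deserialize_update_attrs({"state_modified_at": "2008-09-03T20:56:35+00:00", "state": "lnet_up"}))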
Example #2
 def test_HYD648(self):
     """Test that datetimes in the API have a timezone"""
     synthetic_host('myserver')
     response = self.api_client.get("/api/host/")
     self.assertHttpOK(response)
     host = self.deserialize(response)['objects'][0]
     t = IMLDateTime.parse(host['state_modified_at'])
     self.assertNotEqual(t.tzinfo, None)
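What the assertion checks, reduced to the standard library (datetime.fromisoformat stands in for IMLDateTime.parse, which accepts more formats, as Example #14 shows):

from datetime import datetime

t = datetime.fromisoformat("2008-09-03T20:56:35+00:00")
assert t.tzinfo is not None  # timezone-aware, so it is safe to compare across hosts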
Example #3
    def run(self, host_id, host_data):
        host = ManagedHost.objects.get(pk=host_id)
        self.started_at = IMLDateTime.parse(host_data["started_at"])
        self.host = host
        self.host_data = host_data
        log.debug("UpdateScan.run: %s" % self.host)

        self.audit_host()
Example #4
    def test_fetch_not_dismissed_alerts_since_last_sample(self):

        data = {
            "begin__gte": str(self.sample_date),
            "dismissed": 'false',
            "severity__in": ['WARNING', 'ERROR']
        }

        response = self.api_client.get("/api/alert/", data=data)
        self.assertHttpOK(response)
        objects = self.deserialize(response)['objects']
        self.assertEqual(len(objects), 2, self.dump_objects(objects))
        for ev in objects:
            self.assertEqual(ev['dismissed'], False)
            self.assertTrue(ev['severity'] in ['WARNING', 'ERROR'])
            self.assertTrue(IMLDateTime.parse(ev['begin']) >= self.sample_date)
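The final comparison only succeeds if self.sample_date is itself timezone-aware: Python refuses to order aware and naive datetimes. A minimal demonstration:

from datetime import datetime, timezone

aware = datetime(2016, 10, 9, tzinfo=timezone.utc)
naive = datetime(2016, 10, 9)

print(aware >= datetime(2016, 10, 8, tzinfo=timezone.utc))  # True
try:
    aware >= naive
except TypeError as e:
    print("cannot compare aware and naive datetimes:", e)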
Example #5
    def test_fetch_not_dismissed_alerts_since_last_sample(self):

        data = {
            "begin__gte": str(self.sample_date),
            "dismissed": False,
            "severity__in": ["WARNING", "ERROR"]
        }

        response = self.api_client.get("/api/alert/", data=data)
        self.assertHttpOK(response)
        objects = self.deserialize(response)["objects"]
        self.assertEqual(len(objects), 2, self.dump_objects(objects))
        for ev in objects:
            self.assertEqual(ev["dismissed"], False)
            self.assertTrue(ev["severity"] in ["WARNING", "ERROR"])
            self.assertTrue(IMLDateTime.parse(ev["begin"]) >= self.sample_date)
Example #6
    def _wait_for_server_boot_time(self, fqdn, old_boot_time=None):
        running_time = 0
        while running_time < TEST_TIMEOUT:
            hosts = self.get_list("/api/host/")
            for host in hosts:
                if host['fqdn'] == fqdn:
                    if host['boot_time'] is not None:
                        boot_time = IMLDateTime.parse(host['boot_time'])
                        if old_boot_time:
                            if boot_time > old_boot_time:
                                return boot_time
                        else:
                            return boot_time

            running_time += 1
            time.sleep(1)

        self.assertLess(running_time, TEST_TIMEOUT, "Timed out waiting for host boot_time to be set.")
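The wait loop above is a general poll-until-true pattern: poll once a second, return the value on success, and let the test fail on timeout. A generic sketch (wait_for and get_boot_time are hypothetical names):

import time

def wait_for(condition, timeout):
    # Poll condition() once a second; return its truthy value, or None on timeout.
    for _ in range(timeout):
        result = condition()
        if result:
            return result
        time.sleep(1)
    return None

# Usage in a test, assuming get_boot_time(fqdn) wraps the /api/host/ lookup:
#     boot_time = wait_for(lambda: get_boot_time(fqdn), TEST_TIMEOUT)
#     self.assertIsNotNone(boot_time, "Timed out waiting for host boot_time to be set.")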
Example #7
    def on_data(self, fqdn, body):
        with transaction.commit_on_success():
            with LogMessage.delayed as log_messages:
                for msg in body['log_lines']:
                    try:
                        log_messages.insert(dict(
                            fqdn = fqdn,
                            message = msg['message'],
                            severity = msg['severity'],
                            facility = msg['facility'],
                            tag = msg['source'],
                            datetime = IMLDateTime.parse(msg['datetime']).as_datetime,
                            message_class = LogMessage.get_message_class(msg['message'])
                        ))
                        self._table_size += 1

                        self._parser.parse(fqdn, msg)
                    except Exception as e:
                        self.log.error("Error %s ingesting systemd-journal entry: %s" % (e, msg))
Example #8
    def on_data(self, fqdn, body):
        with transaction.atomic():
            with DelayedContextFrom(LogMessage) as log_messages:
                for msg in body["log_lines"]:
                    try:
                        log_messages.insert(
                            dict(
                                fqdn=fqdn,
                                message=msg["message"],
                                severity=msg["severity"],
                                facility=msg["facility"],
                                tag=msg["source"],
                                datetime=IMLDateTime.parse(
                                    msg["datetime"]).as_datetime,
                                message_class=LogMessage.get_message_class(
                                    msg["message"]),
                            ))
                        self._table_size += 1

                        self._parser.parse(fqdn, msg)
                    except Exception as e:
                        self.log.error(
                            "Error %s ingesting systemd-journal entry: %s" %
                            (e, msg))
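Both versions lean on a "delayed" context manager that buffers rows while the block runs and bulk-inserts them on exit. A pure-Python sketch of that pattern, with a flush callback standing in for the real database batch insert (DelayedInsert is a hypothetical name):

class DelayedInsert(object):
    # Buffer rows during the with-block; hand them to flush() in one batch on exit.
    def __init__(self, flush):
        self._flush = flush
        self._rows = []

    def insert(self, row):
        self._rows.append(row)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        if exc_type is None:
            self._flush(self._rows)

with DelayedInsert(lambda rows: print("bulk insert of %d rows" % len(rows))) as log_messages:
    log_messages.insert({"fqdn": "node1", "message": "hello"})
    log_messages.insert({"fqdn": "node2", "message": "world"})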
Example #9
    def metric_dispatch(self, request, **kwargs):
        """
        GET parameters:
        :metrics: Comma separated list of strings (e.g. kbytesfree,kbytestotal)
        :job: 'id', 'user', or 'name'; when given, supply exactly one metric
                out of 'read_bytes', 'write_bytes', 'read_iops', 'write_iops', 'metadata_iops'
        :begin: Time ISO8601 string, e.g. '2008-09-03T20:56:35.450686Z'
        :end: Time ISO8601 string, e.g. '2008-09-03T20:56:35.450686Z'
        :latest: boolean -- if true, you are asking for a single time point, the latest value
        :max_points: maximum number of datapoints returned, may result in lower resolution samples
        :num_points: return exact number of data points scaled for the date range
        :reduce_fn: one of 'average', 'sum'
        :group_by: an attribute name of the object you're fetching.  For example, to get
                   the total OST stats for a filesystem, when requesting from
                   the target resource, use reduce_fn=sum, group_by=filesystem.
                   If the group_by attribute is absent from a record in the results,
                   that record is discarded.
        """
        errors = defaultdict(list)

        if request.method != 'GET':
            return self.create_response(request,
                                        "",
                                        response_class=HttpMethodNotAllowed)

        latest, update = (request.GET.get(name, '').lower() in ('true', '1')
                          for name in ('latest', 'update'))
        if update and latest:
            errors['update'].append("update and latest are mutually exclusive")

        # list() so that len(metrics) works on Python 3, where filter() is lazy
        metrics = list(filter(None, request.GET.get('metrics', '').split(',')))
        job = request.GET.get('job', '')
        if job:
            if len(metrics) != 1:
                errors['job'].append("Job metrics must be a single string")
            if latest:
                errors['job'].append("Job metrics and latest are incompatible")

        num_points = 0
        if 'num_points' in request.GET:
            try:
                num_points = int(request.GET['num_points'])
            except ValueError:
                errors['num_points'].append(
                    "num_points must be a valid integer")
            if latest or update:
                errors['num_points'].append(
                    "num_points requires a fixed begin and end")

        begin = end = None
        if not latest:
            try:
                begin = IMLDateTime.parse(request.GET['begin'])
            except KeyError:
                errors['begin'].append(
                    "This field is mandatory when latest=false")
            except ValueError:
                errors['begin'].append("Malformed time string")
            try:
                end = IMLDateTime.parse(request.GET['end'])
            except KeyError:
                if update or num_points:
                    errors['end'].append(
                        "This field is mandatory with update or num_points")
                else:
                    end = timezone.now()
            except ValueError:
                errors['end'].append("Malformed time string")
        if update:
            begin, end = end, timezone.now()

        try:
            max_points = int(request.GET.get('max_points', 1000))
        except ValueError:
            errors['max_points'].append("max_points must be a valid integer")
        if errors:
            return self.create_response(request,
                                        errors,
                                        response_class=HttpBadRequest)

        if 'pk' in kwargs:
            return self.get_metric_detail(request, metrics, begin, end, job,
                                          max_points, num_points, **kwargs)
        return self.get_metric_list(request, metrics, begin, end, job,
                                    max_points, num_points, **kwargs)
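A usage sketch for the parameters documented in the docstring, using the requests library; the host name and the 'target' resource path are assumptions for illustration only:

import requests

response = requests.get(
    "https://example.com/api/target/metric/",  # hypothetical deployment URL
    params={
        "metrics": "kbytesfree,kbytestotal",
        "begin": "2008-09-03T20:56:35.450686Z",
        "end": "2008-09-03T21:56:35.450686Z",
        "reduce_fn": "sum",
        "group_by": "filesystem",
        "max_points": 1000,
    },
    verify=False,  # assumption: self-signed certificate on the appliance
)
print(response.json())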
Example #10
    def send(self):
        events = []

        envelope = dict(fqdn=self.client.fqdn,
                        copytool=self.copytool.id,
                        events=events)

        envelope_size = len(json.dumps(envelope))
        while True:
            try:
                event = self.retry_queue.get_nowait()
                copytool_log.debug("Got event from retry queue: %s" % event)
            except Queue.Empty:
                try:
                    raw_event = self.send_queue.get_nowait()
                    event = json.loads(raw_event)
                    copytool_log.debug("Got event from send queue: %s" % event)
                except Queue.Empty:
                    break
                except ValueError:
                    copytool_log.error("Invalid JSON: %s" % raw_event)
                    break

            try:
                date = IMLDateTime.parse(event['event_time'])
                event['event_time'] = date.astimezone(
                    tz=FixedOffset(0)).strftime("%Y-%m-%d %H:%M:%S+00:00")
            except ValueError as e:
                copytool_log.error("Invalid event date in event '%s': %s" %
                                   (event, e))
                break

            # During restore operations, we don't know the data_fid until
            # after the operation has started (i.e. RUNNING). The tricky part
            # is that when the restore completes, the source_fid is set to
            # data_fid, so unless we do this swap we'll lose track of the
            # operation.
            if 'RUNNING' in event['event_type']:
                if event['source_fid'] in self.active_operations:
                    self.active_operations[
                        event['data_fid']] = self.active_operations.pop(
                            event['source_fid'])

            if self.active_operations.get(event.get('data_fid', None), None):
                event['active_operation'] = self.active_operations[
                    event['data_fid']]

            if 'FINISH' in event['event_type']:
                try:
                    del self.active_operations[event['data_fid']]
                except KeyError:
                    pass

            copytool_log.debug("event: %s" % json.dumps(event))

            event_size = len(json.dumps(event))
            if event_size > MAX_BYTES_PER_POST:
                copytool_log.error("Oversized event dropped: %s" % event)
                break

            if events and event_size > MAX_BYTES_PER_POST - envelope_size:
                copytool_log.info("Requeueing oversized message "
                                  "(%d + %d > %d, %d messages)" %
                                  (event_size, envelope_size,
                                   MAX_BYTES_PER_POST, len(events)))
                self.retry_queue.put(event)
                break

            events.append(event)
            envelope_size += event_size

        if events:
            copytool_log.debug("EventRelay sending %d events" % len(events))
            try:
                data = self.client.post(envelope)
                copytool_log.debug("Got data back from POST: %s" % data)
                try:
                    self.active_operations.update(data['active_operations'])
                except (KeyError, TypeError):
                    pass
                # Reset any backoff delay that might have been added
                self.reset_backoff()
            except HttpError:
                copytool_log.error("Failed to relay events, requeueing")
                for event in envelope['events']:
                    self.retry_queue.put(event)
                self.backoff()
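The RUNNING/FINISH bookkeeping is easier to see in isolation: on RUNNING the operation is re-keyed from source_fid to data_fid, and FINISH removes it. A dict-only walk-through with made-up FIDs:

active_operations = {"0x100": "op-42"}  # keyed by source_fid before the restore starts

running = {"event_type": "RESTORE_RUNNING", "source_fid": "0x100", "data_fid": "0x200"}
if "RUNNING" in running["event_type"] and running["source_fid"] in active_operations:
    # Re-key by data_fid, because FINISH will report the operation under it.
    active_operations[running["data_fid"]] = active_operations.pop(running["source_fid"])

finish = {"event_type": "RESTORE_FINISH", "data_fid": "0x200"}
if "FINISH" in finish["event_type"]:
    active_operations.pop(finish["data_fid"], None)

print(active_operations)  # {} -- the operation was tracked across the FID swap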
Example #11
    def get(self, request):
        """
        Send messages TO the agent.
        Handle a long-polling GET for messages to the agent
        """

        fqdn = self.valid_fqdn(request)
        if not fqdn:
            return HttpForbidden()
        server_boot_time = IMLDateTime.parse(request.GET["server_boot_time"])
        client_start_time = IMLDateTime.parse(request.GET["client_start_time"])

        messages = []

        try:
            reset_required = self.hosts.update(fqdn, server_boot_time,
                                               client_start_time)
        except ManagedHost.DoesNotExist:
            # This should not happen because the HTTPS frontend should have the
            # agent certificate revoked before removing the ManagedHost from the database
            log.error("GET from unknown server %s" % fqdn)
            return HttpResponseBadRequest("Unknown server '%s'" % fqdn)

        if reset_required:
            # This is the case where the http_agent service restarts, so
            # we have to let the agent know that all open sessions
            # are now over.
            messages.append({
                "fqdn": fqdn,
                "type": "SESSION_TERMINATE_ALL",
                "plugin": None,
                "session_id": None,
                "session_seq": None,
                "body": None,
            })

        log.debug("MessageView.get: composing messages for %s" % fqdn)
        queues = self.queues.get(fqdn)

        # If this handler is sitting on the TX queue draining messages when a
        # new session starts, it must be disconnected *before* any TX messages
        # are sent, to avoid delivering them to an 'old' session (i.e. a TCP
        # connection from a now-dead agent).

        with queues.tx_lock:
            try:
                first_message = queues.tx.get(block=True,
                                              timeout=self.LONG_POLL_TIMEOUT)
                if first_message["type"] == "TX_BARRIER":
                    if first_message["client_start_time"] != request.GET[
                            "client_start_time"]:
                        log.warning("Cancelling GET due to barrier %s %s" %
                                    (first_message["client_start_time"],
                                     request.GET["client_start_time"]))
                        return HttpResponse(json.dumps({"messages": []}),
                                            content_type="application/json")
                else:
                    messages.append(first_message)
            except Queue.Empty:
                pass
            else:
                # TODO: limit number of messages per response
                while True:
                    try:
                        message = queues.tx.get(block=False)
                        if message["type"] == "TX_BARRIER":
                            if message["client_start_time"] != request.GET[
                                    "client_start_time"]:
                                log.warning(
                                    "Cancelling GET due to barrier %s %s" %
                                    (message["client_start_time"],
                                     request.GET["client_start_time"]))
                                return HttpResponse(
                                    json.dumps({"messages": []}),
                                    content_type="application/json")
                        else:
                            messages.append(message)
                    except Queue.Empty:
                        break

        messages = self._filter_valid_messages(fqdn, messages)

        log.debug("MessageView.get: responding to %s with %s messages (%s)" %
                  (fqdn, len(messages), client_start_time))
        return HttpResponse(json.dumps({"messages": messages}),
                            content_type="application/json")
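The TX_BARRIER logic reduces to: drain the queue, but abandon the response if a barrier stamped with a different client_start_time appears. A standalone sketch with the standard queue module; the message shapes mirror the code above:

import queue

def drain_until_barrier(tx, client_start_time):
    # Collect messages; return None if a barrier from another session is seen.
    messages = []
    while True:
        try:
            message = tx.get(block=False)
        except queue.Empty:
            return messages
        if message["type"] == "TX_BARRIER":
            if message["client_start_time"] != client_start_time:
                return None  # stale session: respond with no messages
        else:
            messages.append(message)

tx = queue.Queue()
tx.put({"type": "DATA", "body": 1})
tx.put({"type": "TX_BARRIER", "client_start_time": "t0"})
print(drain_until_barrier(tx, "t0"))  # [{'type': 'DATA', 'body': 1}]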
Example #12
    def on_data(self, fqdn, body):
        """Process all incoming messages from the Corosync agent plugin

        A request to have the status changed for an instance. If the current
        state determines that a host is offline, raise that alert.

        Old messages should not be processed.

        datetime is the node's local time converted to UTC, in the standard
        ISO string format.
        """

        try:
            host = ManagedHost.objects.get(fqdn=fqdn)
        except ManagedHost.DoesNotExist:
            # This might happen when we are deleting a host and the queues mean a message is still sat waiting to be
            # processed. Something has spoken to us and we don't know anything about it so really we can't do anything
            # other than drop it.
            log.warning(
                "Corosync message from unknown host %s, the message was dropped."
                % fqdn)
            return

        # If corosync is not configured yet, or we don't actually have corosync - then ignore the input
        if (not host.corosync_configuration
            ) or host.corosync_configuration.state == 'unconfigured':
            return

        if body.get('state'):
            job_scheduler_notify.notify(host.corosync_configuration,
                                        timezone.now(),
                                        {'state': body['state']['corosync']})

            job_scheduler_notify.notify(host.pacemaker_configuration,
                                        timezone.now(),
                                        {'state': body['state']['pacemaker']})

            if body['state']['corosync'] == 'stopped':
                return
        else:
            if host.corosync_configuration.state != 'started':
                return

        if body.get('crm_info'):
            nodes = body['crm_info']['nodes']
            dt = body['crm_info']['datetime']

            options = body['crm_info'].get('options',
                                           {'stonith_enabled': None})
            stonith_enabled = options['stonith_enabled']

            try:
                dt = IMLDateTime.parse(dt)
            except ValueError:
                if dt != '':
                    log.warning(
                        "Invalid date or tz string from corosync plugin: %s" %
                        dt)
                    raise

            def is_new(peer_node_identifier):
                return (peer_node_identifier not in self._host_status or
                        self._host_status[peer_node_identifier].datetime < dt)

            peers_str = "; ".join([
                "%s: online=%s, new=%s" %
                (peer_node_identifier, data['online'],
                 is_new(peer_node_identifier))
                for peer_node_identifier, data in nodes.items()
            ])
            log.debug("Incoming peer report from %s:  %s" % (fqdn, peers_str))

            # NB: This will ignore any unknown peers in the report.
            cluster_nodes = ManagedHost.objects.select_related(
                'ha_cluster_peers').filter(
                    Q(nodename__in=nodes.keys()) | Q(fqdn__in=nodes.keys()))

            unknown_nodes = set(nodes.keys()) - set([
                h.nodename for h in cluster_nodes
            ]) - set([h.fqdn for h in cluster_nodes])

            # Leaving this out for now, because it raises issues caused by limitations in the
            # simulator and the test system as a whole. It is difficult to know whether these
            # alerts will be raised; it all depends on past state.
            # CorosyncUnknownPeersAlert.notify(host.corosync_configuration, unknown_nodes != set())
            if unknown_nodes:
                log.warning("Unknown nodes in report from %s: %s" %
                            (fqdn, unknown_nodes))

            if stonith_enabled is not None:
                StonithNotEnabledAlert.notify(host.corosync_configuration,
                                              stonith_enabled is False)

            CorosyncNoPeersAlert.notify(host.corosync_configuration,
                                        len(cluster_nodes) == 1)
            # CorosyncToManyPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) > 2)

            #  Consider all nodes in the peer group for this reporting agent
            for host in cluster_nodes:
                try:
                    data = nodes[host.nodename]
                    node_identifier = host.nodename
                except KeyError:
                    data = nodes[host.fqdn]
                    node_identifier = host.fqdn

                cluster_peer_keys = sorted(
                    [node.pk for node in cluster_nodes if node is not host])

                if is_new(node_identifier) and host.corosync_configuration:
                    host_reported_online = data['online'] == 'true'

                    log.debug("Corosync processing "
                              "peer %s of %s " % (host.fqdn, fqdn))

                    #  Raise an Alert - system suppresses duplicates
                    log.debug("Alert notify on %s: active=%s" %
                              (host, not host_reported_online))
                    HostOfflineAlert.notify(host, not host_reported_online)
                    if not host_reported_online:
                        log.debug("Host %s offline" % host.fqdn)
                    else:
                        log.debug("Host %s online" % host.fqdn)

                    #  Attempt to save the state.
                    if host.corosync_configuration.corosync_reported_up != host_reported_online:
                        job_scheduler_notify.notify(
                            host.corosync_configuration, timezone.now(),
                            {'corosync_reported_up': host_reported_online})

                    peer_host_peer_keys = sorted(
                        [h.pk for h in host.ha_cluster_peers.all()])
                    if peer_host_peer_keys != cluster_peer_keys:
                        job_scheduler_notify.notify(
                            host, timezone.now(),
                            {'ha_cluster_peers': cluster_peer_keys})

                    #  Keep internal track of the hosts state.
                    self._host_status[node_identifier] = self.HostStatus(
                        status=host_reported_online, datetime=dt)
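The is_new() check is last-write-wins bookkeeping over a (status, datetime) record per peer. A standalone sketch, assuming HostStatus is the namedtuple implied by self.HostStatus above:

from collections import namedtuple
from datetime import datetime, timezone

HostStatus = namedtuple("HostStatus", ["status", "datetime"])
_host_status = {}

def is_new(peer, dt):
    # A report is new if we have never seen the peer, or the report is fresher.
    return peer not in _host_status or _host_status[peer].datetime < dt

t1 = datetime(2016, 10, 9, tzinfo=timezone.utc)
t2 = datetime(2016, 10, 10, tzinfo=timezone.utc)

if is_new("node1", t2):
    _host_status["node1"] = HostStatus(status=True, datetime=t2)
print(is_new("node1", t1))  # False: the older report is ignored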
Example #13
 def __init__(self, event_time, **kwargs):
     self.timestamp = IMLDateTime.parse(event_time)
     self._parse_type(kwargs["event_type"])
     self.error = None
     self.__dict__.update(kwargs)
Example #14
 def test_parse(self):
     for date_string, date_time in [
         (
             "2016/10/09T19:20:21+1000",
             IMLDateTime(
                 year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(600)
             ),
         ),
         (
             "2016/10/09T19:20:21.12345+1000",
             IMLDateTime(
                 year=2016,
                 month=10,
                 day=9,
                 hour=19,
                 minute=20,
                 second=21,
                 microsecond=123450,
                 tzinfo=FixedOffset(600),
             ),
         ),
         (
             "2016-10-09 19:20:21-1000",
             IMLDateTime(
                 year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(-600)
             ),
         ),
         (
             "2016-10-09 19:20:21.12345-1000",
             IMLDateTime(
                 year=2016,
                 month=10,
                 day=9,
                 hour=19,
                 minute=20,
                 second=21,
                 microsecond=123450,
                 tzinfo=FixedOffset(-600),
             ),
         ),
         (
             "19:20:21-1000 2016-10/09",
             IMLDateTime(
                 year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(-600)
             ),
         ),
         (
             "19:20:21.12345-0000 2016/10-09",
             IMLDateTime(
                 year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(0)
             ),
         ),
         (
             "        19:20:21.12345        -1000              2016/10-09        ",
             IMLDateTime(
                 year=2016,
                 month=10,
                 day=9,
                 hour=19,
                 minute=20,
                 second=21,
                 microsecond=123450,
                 tzinfo=FixedOffset(-600),
             ),
         ),
         ("2016/10-09", IMLDateTime(year=2016, month=10, day=9, tzinfo=FixedOffset(0))),
         (
             "19:20:21.12345-0000",
             IMLDateTime(
                 year=1900, month=1, day=1, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(0)
             ),
         ),
         (
             "19:20:21-0500",
             IMLDateTime(
                 year=1900, month=1, day=1, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(-300)
             ),
         ),
     ]:
          self.assertEqual(IMLDateTime.parse(date_string), date_time)
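Note the offset arithmetic the expectations encode: FixedOffset takes minutes, so "+1000" (ten hours) maps to FixedOffset(600) and "-0500" to FixedOffset(-300). The same arithmetic with the standard library, where timezone(timedelta(...)) stands in for FixedOffset:

from datetime import datetime, timedelta, timezone

plus_1000 = timezone(timedelta(minutes=600))  # '+1000' is 10 hours = 600 minutes
print(datetime(2016, 10, 9, 19, 20, 21, tzinfo=plus_1000).isoformat())
# 2016-10-09T19:20:21+10:00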