예제 #1
0
    def run_single_check(self, check):
        """Execute one check plugin and emit the metrics it produced.

        Any exception raised while running the check, fetching its metrics or
        emitting them is logged and swallowed; in that case the reported
        measurement count stays at zero.

        :param check: plugin object exposing ``name``, ``run()`` and
            ``get_metrics()``.
        :returns: tuple ``(number of measurements collected,
            collection time in milliseconds)``.
        """
        plugin_timer = util.Timer()
        measurement_count = 0
        log.debug("Running plugin %s" % check.name)

        try:
            check.run()
            collected = check.get_metrics()
            # Metrics are emitted right away, one batch per check.
            self._emit(collected)
            # Only reached (and counted) when the emit did not raise.
            measurement_count = len(collected)
        except Exception:
            log.exception("Error running plugin %s" % check.name)

        # The timer value is treated as seconds below (scaled by 1000 for ms).
        elapsed_sec = plugin_timer.step()
        elapsed_ms = elapsed_sec * 1000
        summary = ("Finished plugin %s run. Collection time: %.2fms %d Metrics." %
                   (check.name, round(elapsed_ms, 2), measurement_count))
        log.debug(summary)

        if elapsed_sec > util.get_sub_collection_warn():
            log.warn("Collection time for check %s is high: %.2fs." %
                     (check.name, round(elapsed_sec, 2)))

        return measurement_count, elapsed_ms
예제 #2
0
    def run(self, check_frequency):
        """Run one collection cycle and emit the collector's own statistics.

        The checks_d checks are run through ``run_checks_d``; the time that
        took is then reported via ``collector_stats`` and every entry found in
        ``self.collection_metrics`` is turned into a measurement and emitted.

        :param check_frequency: collection period; a warning is logged when
            the run takes longer than four fifths of it.
        """
        cycle_timer = util.Timer()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        # checks_d checks
        num_metrics = self.run_checks_d(check_frequency)
        collect_duration = cycle_timer.step()

        # A run longer than 80% of the period is close to overrunning it.
        warn_threshold = 4 * check_frequency / 5
        if collect_duration > warn_threshold:
            log.warn("Collection time (s) is high: %.1f, metrics count: %d" %
                     (collect_duration, num_metrics))

        # NOTE(review): presumably this call is what fills
        # self.collection_metrics -- confirm against collector_stats().
        self.collector_stats(num_metrics, collect_duration)

        base_dimensions = {'component': 'monasca-agent', 'service': 'monitoring'}
        # One measurement per collector statistic, stamped at creation time.
        collect_stats = [
            metrics.Metric(name,
                           self._set_dimensions(base_dimensions),
                           tenant=None).measurement(value, time.time())
            for name, value in self.collection_metrics.items()
        ]
        # Reset the statistics so the next run starts from an empty mapping.
        self.collection_metrics.clear()
        self._emit(collect_stats)

        # Persist the status of the collection run.
        self._set_status(collect_duration)
예제 #3
0
    def run(self):
        """Collect data from the checks_d checks and submit their data.

        Runs every checks_d check through ``run_checks_d``, wraps each
        statistic returned by ``collector_stats`` into a ``Measurement``,
        emits those measurements and finally persists the status of the run.
        """
        timer = util.Timer()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        # checks_d checks
        num_metrics, emitter_statuses, checks_statuses = self.run_checks_d()

        collect_duration = timer.step()

        collect_stats = []
        dimensions = {'component': 'monasca-agent', 'service': 'monitoring'}
        # Add in metrics on the collector run.
        # items() rather than iteritems(): the latter exists only on Python 2
        # dicts, while items() works on both Python 2 and Python 3.
        for name, value in self.collector_stats(num_metrics, collect_duration).items():
            collect_stats.append(metrics.Measurement(name,
                                                     time.time(),
                                                     value,
                                                     self._set_dimensions(dimensions),
                                                     None))
        # The status of emitting the collector's own stats is tracked
        # alongside the statuses returned by run_checks_d().
        emitter_statuses.append(self._emit(collect_stats))

        # Persist the status of the collection run.
        self._set_status(checks_statuses, emitter_statuses, collect_duration)
예제 #4
0
    def flush(self):
        """Flush the transactions that are due and persist the forwarder status.

        Does nothing when a previous flush is still in progress
        (``self._trs_to_flush`` is set). Otherwise the due transactions are
        handed to ``flush_next``, an emit-time measurement is wrapped in a new
        ``MetricTransaction``, the flush counter is bumped and a
        ``ForwarderStatus`` snapshot is persisted.

        The first ``FLUSH_LOGGING_INITIAL`` flushes, and then every
        ``FLUSH_LOGGING_PERIOD``-th one, are logged at info level; all others
        at debug level.
        """
        if self._trs_to_flush is not None:
            log.debug("A flush is already in progress, not doing anything")
            return

        # Do we have something to do ?
        now = datetime.now()
        due = [tr for tr in self._transactions if tr.time_to_flush(now)]
        due_count = len(due)

        flush_number = self._flush_count + 1
        should_log = (flush_number <= FLUSH_LOGGING_INITIAL or
                      flush_number % FLUSH_LOGGING_PERIOD == 0)
        # Same message either way; only the level depends on should_log.
        emit_log = log.info if should_log else log.debug

        if due_count > 0:
            emit_log("Flushing %s transaction%s during flush #%s" %
                     (due_count, util.plural(due_count), str(flush_number)))

            timer = util.Timer()
            self._trs_to_flush = due
            self.flush_next()

            # The emit time is reported on the next run.
            dimensions = self._set_dimensions({
                'component': 'monasca-agent',
                'service': 'monitoring'
            })
            emitted_at = time.time()
            emit_duration = timer.step()
            emit_measurement = metrics.Measurement('monasca.emit_time_sec',
                                                   emitted_at, emit_duration,
                                                   dimensions)
            # NOTE(review): the instance is discarded here; presumably the
            # constructor queues the transaction itself -- confirm.
            MetricTransaction([emit_measurement],
                              headers={'Content-Type': 'application/json'})
        else:
            emit_log("No transaction to flush during flush #%s" %
                     str(flush_number))

        if self._flush_count + 1 == FLUSH_LOGGING_INITIAL:
            log.info(
                "First flushes done, next flushes will be logged every %s flushes."
                % FLUSH_LOGGING_PERIOD)

        self._flush_count += 1

        status = check_status.ForwarderStatus(
            queue_length=self._total_count,
            queue_size=self._total_size,
            flush_count=self._flush_count,
            transactions_received=self._transactions_received,
            transactions_flushed=self._transactions_flushed)
        status.persist()
예제 #5
0
    def run_checks_d(self):
        """Run defined checks_d checks.

        Every check in ``self.initialized_checks_d`` is run in turn; its
        metrics and events are gathered and a ``CheckStatus`` is recorded for
        it. An exception raised by a check is logged and does not stop the
        remaining checks. Checks listed in ``self.init_failed_checks_d`` get a
        ``CheckStatus`` carrying their initialization error and traceback.

        If ``self.continue_running`` becomes false the loops stop early and
        whatever was gathered up to that point is returned.

        :returns: tuple ``(measurements, events, check_statuses)``: a list of
            Measurements, a dictionary of events keyed by check name and a
            list of check statuses.
        """
        sub_timer = util.Timer()
        measurements = []
        events = {}
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                # Return the partial results instead of a bare None so that
                # callers unpacking the 3-tuple do not raise TypeError when
                # the collector is being stopped.
                return measurements, events, check_statuses
            log.debug("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()

                # Save them for the payload.
                measurements.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
            except Exception:
                log.exception("Error running check %s" % check.name)

            status_check = check_status.CheckStatus(check.name, instance_statuses, metric_count, event_count,
                                                    library_versions=check.get_library_info())
            check_statuses.append(status_check)
            # The timer value is treated as seconds (scaled by 1000 for ms).
            sub_collect_duration = sub_timer.step()
            sub_collect_duration_mills = sub_collect_duration * 1000
            log.debug("Finished run check %s. Collection time: %.2fms." % (
                check.name, round(sub_collect_duration_mills, 2)))
            if sub_collect_duration > util.get_sub_collection_warn():
                log.warn("Collection time for check %s is high: %.2fs." % (
                    check.name, round(sub_collect_duration, 2)))

        # items() rather than the Python 2-only iteritems(), so the loop also
        # works on Python 3.
        for check_name, info in self.init_failed_checks_d.items():
            if not self.continue_running:
                return measurements, events, check_statuses
            status_check = check_status.CheckStatus(check_name, None, None, None,
                                                    init_failed_error=info['error'],
                                                    init_failed_traceback=info['traceback'])
            check_statuses.append(status_check)

        return measurements, events, check_statuses
예제 #6
0
    def run(self):
        """Collect data from each check and submit their data.

        Two kinds of checks are run: the system checks held in
        ``self._checks`` and the configured checks_d checks (via
        ``run_checks_d``). The collector's own statistics are appended to the
        collected metrics before everything is emitted, and the status of the
        run is persisted at the end.
        """
        timer = util.Timer()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        metrics_list = []

        timestamp = time.time()
        events = {}

        if self.os == 'windows':  # Windows uses old style checks.
            for check_type in self._checks:
                try:
                    # Old style checks return a name -> value mapping that is
                    # wrapped into Measurement objects here. items() is used
                    # because iteritems() exists only on Python 2 dicts.
                    for name, value in check_type.check().items():
                        metrics_list.append(metrics.Measurement(name,
                                                                timestamp,
                                                                value,
                                                                self._set_dimensions(None),
                                                                None))
                except Exception:
                    log.exception('Error running check.')
        else:
            for check_type in self._checks:
                # NOTE(review): results are added unwrapped, so check() is
                # presumably already returning measurement objects -- confirm.
                metrics_list.extend(check_type.check())

        # checks_d checks
        checks_d_metrics, checks_d_events, checks_statuses = self.run_checks_d()
        metrics_list.extend(checks_d_metrics)
        events.update(checks_d_events)

        # Store the metrics and events in the payload.
        collect_duration = timer.step()

        dimensions = {'component': 'monasca-agent', 'service': 'monitoring'}
        # Add in metrics on the collector run (items() for Python 3 support).
        for name, value in self.collector_stats(len(metrics_list), len(events),
                                                collect_duration).items():
            metrics_list.append(metrics.Measurement(name,
                                                    timestamp,
                                                    value,
                                                    self._set_dimensions(dimensions),
                                                    None))
        emitter_statuses = self._emit(metrics_list)

        # Persist the status of the collection run.
        self._set_status(checks_statuses, emitter_statuses, collect_duration)
예제 #7
0
    def stop(self, timeout=0):
        """Tell the collector to stop at the next logical point.

        Clears ``self.continue_running``, calls ``stop()`` on every check
        recorded in ``self.collection_times``, logs the plugins still listed
        in ``self.collection_results`` and closes the worker pool. When
        ``timeout`` is positive, the pool workers are joined for at most
        ``timeout`` seconds in total; workers still alive afterwards are only
        logged.

        :param timeout: total number of seconds to wait for the pool workers;
            zero (the default) skips the join entirely.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.

        log.info("stopping the collector with timeout %d seconds" % timeout)

        self.continue_running = False
        for check_name in self.collection_times:
            check = self.collection_times[check_name]['check']
            check.stop()

        for check_name in self.collection_results:
            run_time = time.time(
            ) - self.collection_results[check_name]['start_time']
            log.info(
                'When exiting... Plugin %s still running after %d seconds' %
                (check_name, run_time))

        self.pool.close()

        # Won't call join() if timeout is zero. If we are in an event thread
        # a BlockingSwitchOutError occurs if wait

        if (timeout > 0):
            timer = util.Timer()
            for worker in self.pool._pool:
                # Remaining share of the overall timeout budget.
                t = timeout - timer.total()
                if t <= 0:
                    break
                if worker.is_alive():
                    try:
                        worker.join(t)
                    except Exception:
                        # The message needs a %s placeholder: passing an
                        # argument without one makes the logging module fail
                        # to format the record and the error is lost.
                        log.error("Unexpected error: %s", sys.exc_info()[0])

        for worker in self.pool._pool:
            if worker.is_alive():
                # the worker didn't complete in the specified timeout.
                # collector must honor the stop request to avoid agent stop/restart hang.
                # os._exit() should be called after collector stops.
                log.info(
                    'worker %s is still alive when collector stop times out.' %
                    worker.name)