Example #1
    def run(self, checksd=None, start_event=True):
        """
        Collect data from each check and submit their data.
        """
        timer = Timer()
        if self.os != 'windows':
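            # Snapshot process CPU time so agent overhead can be reported at the end of the run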
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        payload = self._build_payload(start_event=start_event)
        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']
        if checksd:
            self.initialized_checks_d = checksd['initialized_checks'] # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks'] # is of type {check_name: {error, traceback}}
        # Run the system checks. Checks will depend on the OS
        if self.os == 'windows':
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['disk'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            diskUsage = sys_checks['disk'].check(self.agentConfig)
            if diskUsage and len(diskUsage) == 2:
                payload["diskUsage"] = diskUsage[0]
                payload["inodes"] = diskUsage[1]

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                payload.update({
                    'memPhysUsed' : memory.get('physUsed'),
                    'memPhysPctUsable' : memory.get('physPctUsable'),
                    'memPhysFree' : memory.get('physFree'),
                    'memPhysTotal' : memory.get('physTotal'),
                    'memPhysUsable' : memory.get('physUsable'),
                    'memSwapUsed' : memory.get('swapUsed'),
                    'memSwapFree' : memory.get('swapFree'),
                    'memSwapPctFree' : memory.get('swapPctFree'),
                    'memSwapTotal' : memory.get('swapTotal'),
                    'memCached' : memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared')
                })

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # Process the event checks.
        for event_check in self._event_checks:
            event_data = event_check.check(log, self.agentConfig)
            if event_data:
                events[event_check.key] = event_data

        # Resources checks
        if self.os != 'windows':
            has_resource = False
            for resources_check in self._resources_checks:
                resources_check.check()
                snaps = resources_check.pop_snapshots()
                if snaps:
                    has_resource = True
                    res_value = { 'snaps': snaps,
                                  'format_version': resources_check.get_format_version() }
                    res_format = resources_check.describe_format_if_needed()
                    if res_format is not None:
                        res_value['format_description'] = res_format
                    payload['resources'][resources_check.RESOURCE_KEY] = res_value

            if has_resource:
                payload['resources']['meta'] = {
                            'api_key': self.agentConfig['api_key'],
                            'host': payload['internalHostname'],
                        }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0  # ensure it's defined if the check raises before service checks are counted
            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                current_check_service_checks = check.get_service_checks()

                # Save them for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events
                if current_check_service_checks:
                    service_checks.extend(current_check_service_checks)

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
                service_check_count = len(current_check_service_checks)
            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(check.name, instance_statuses, metric_count, event_count, service_check_count,
                library_versions=check.get_library_info())
            check_statuses.append(check_status)

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks
        collect_duration = timer.step()

        if self.os != 'windows':
            payload['metrics'].extend(self._agent_metrics.check(payload, self.agentConfig,
                collect_duration, self.emit_duration, time.clock() - cpu_clock))
        else:
            payload['metrics'].extend(self._agent_metrics.check(payload, self.agentConfig,
                collect_duration, self.emit_duration))

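        # Let's send our payload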
        emitter_statuses = self._emit(payload)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses, self.metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                    (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)

        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                    (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
Example #2
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(
            num_checks=len(checksd['initialized_checks']) if checksd else 0))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # Gohai process collector (payload-compatible with the legacy "resources checks")
        if not Platform.is_windows() and self._should_send_additional_data('processes'):
            gohai_processes = self._run_gohai_processes()
            if gohai_processes:
                try:
                    gohai_processes_json = json.loads(gohai_processes)
                    processes_payload = {
                        'snaps': [gohai_processes_json.get('processes')],
                        'format_version': 1
                    }
                    if self._is_first_run():
                        processes_payload['format_description'] = PROCESSES_FORMAT_DESCRIPTION

                    payload['resources'] = {
                        'processes': processes_payload,
                        'meta': {
                            'host': payload['internalHostname'],
                        }
                    }
                except Exception:
                    log.exception("Error running gohai processes collection")

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None
            current_check_metadata = []  # ensure it's defined if the check raises before metadata collection

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count, service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                status = AgentCheck.UNKNOWN  # defensive default; status is expected to be OK or ERROR
            check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
                              hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats))
                )
            # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak.
            self._agent_metrics.get_service_metadata()

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." %
                         FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload
Example #3
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
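        """
        Collect data from each check and submit their data.
        """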

        log.debug("Found {num_checks} checks".format(
            num_checks=len(checksd['initialized_checks']) if checksd else 0))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']
            self.init_failed_checks_d = checksd['init_failed_checks']

        payload = AgentPayload()

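        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance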
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

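        # Initialize payload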
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

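        # Run the system checks. Checks will depend on the OS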
        if Platform.is_windows():
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsage': memory.get('physPctUsage'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

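        # Run old-style checks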
        gangliaData = self._ganglia.check(self.agentConfig)
        monitorstreamData = self._monitorstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

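        # monitorstream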
        if monitorstreamData:
            monitorstreamEvents = monitorstreamData.get('monitorstreamEvents', None)
            if monitorstreamEvents:
                if 'monitorstream' in payload['events']:
                    events['monitorstream'].extend(monitorstreamEvents)
                else:
                    events['monitorstream'] = monitorstreamEvents
                del monitorstreamData['monitorstreamEvents']

            payload.update(monitorstreamData)

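        # metrics about the forwarder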
        if ddforwarderData:
            payload['datamonitor'] = ddforwarderData

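        # newer-style checks (not checks.d style)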
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

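        # checks.d checks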
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None
            current_check_metadata = []  # ensure it's defined if the check raises before metadata collection

            try:
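                # Run the check.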
                instance_statuses = check.run()

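                # Collect the metrics and events.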
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                current_check_metadata = check.get_service_metadata()

                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count, service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                status = AgentCheck.UNKNOWN  # defensive default; status is expected to be OK or ERROR
            check.service_check('datamonitor.agent.check_status', status, tags=service_check_tags)

            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            if self.check_timings:
                metric = 'datamonitor.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

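        # Add a service check for the agent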
        service_checks.append(create_service_check('datamonitor.agent.up', AgentCheck.OK,
                                                   hostname=self.hostname))

        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

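        # Populate metadata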
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats))
                )
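            # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak.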
            self._agent_metrics.get_service_metadata()

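        # Let's send our payload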
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

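        # Persist the status of the collection run.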
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." %
                         FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload
Example #4
    def run(self, agentConfig):
        hostname = ''

        timer = Timer()
        log.debug("Starting collection run")

        self.run_counter = 0
        while True:
            checksd = load_check_directory(agentConfig, hostname)
            log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))

            if checksd:
                self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
                self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

            start = time.time()
            if not self.initialized_checks_d:
                log.info("No valid checks found; please verify the checks configuration")

            # checks.d checks
            for check in self.initialized_checks_d:
                log.info("Running check %s", check.name)
                instance_statuses = []
                check_start_time = time.time()

                try:
                    # Run the check.
                    instance_statuses = check.run()

                    # Collect the metrics.
                    current_check_metrics = check.get_metrics()
                    parse_result(current_check_metrics, self.attr_map, self.id_with_ratio, self.id_with_ip)

                    log.info("Check result for %s:\n\t%s" % (check.name, current_check_metrics))

                except Exception:
                    log.exception("Error running check %s" % check.name)

                check_run_time = time.time() - check_start_time
                log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            
            collect_duration = timer.step()
            collect_duration = collect_duration - (0 if collect_duration <= COLLECT_INTERVAL else COLLECT_INTERVAL)
            self.run_counter += 1
            log.info("Finished run #%d. Collection time: %ss" % (self.run_counter, round(collect_duration, 2)))
            
            if self.run_counter > RELOAD_CHESK_INTERVAL:
                log.debug("Reload checks....")
                tmp_checksd = load_check_directory(agentConfig, hostname)
		
                checksd.clear()
                checksd['init_failed_checks'] = tmp_checksd['init_failed_checks']
                checksd['initialized_checks'] = []
                
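                # Reuse the old check's aggregator on the reloaded instance so
                # metrics buffered between flushes are not lost across reloads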
                for new_check in tmp_checksd['initialized_checks']:
                    for check in self.initialized_checks_d:
                        if new_check.name == check.name:
                            new_check.aggregator = check.aggregator
                            break
                    checksd['initialized_checks'].append(new_check)
                        
                if checksd:
                    self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
                    self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}		
					
                log.debug("Reload done, found {num_checks} checks".format(num_checks=len(self.initialized_checks_d)))

                self.run_counter = 0
			
            # time.sleep(COLLECT_INTERVAL if self.run_counter != 0 else (COLLECT_INTERVAL - timer.step() + collect_duration))  # collect interval
            sleep_time = COLLECT_INTERVAL - (time.time() - start)
            if sleep_time > 0:  # avoid a negative sleep when the run overran the interval
                time.sleep(sleep_time)
			
        return {}
Example #5
    def run(self, checksd=None, start_event=True):
        """
        Collect data from each check and submit their data.
        """
        timer = Timer()
        if self.os != 'windows':
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        payload = self._build_payload(start_event=start_event)
        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']
        if checksd:
            self.initialized_checks_d = checksd[
                'initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd[
                'init_failed_checks']  # is of type {check_name: {error, traceback}}
        # Run the system checks. Checks will depend on the OS
        if self.os == 'windows':
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['disk'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['memory'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(
                    self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(
                    self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            diskUsage = sys_checks['disk'].check(self.agentConfig)
            if diskUsage and len(diskUsage) == 2:
                payload["diskUsage"] = diskUsage[0]
                payload["inodes"] = diskUsage[1]

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                payload.update({
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared')
                })

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # Resources checks
        if self.os != 'windows':
            has_resource = False
            for resources_check in self._resources_checks:
                resources_check.check()
                snaps = resources_check.pop_snapshots()
                if snaps:
                    has_resource = True
                    res_value = {
                        'snaps': snaps,
                        'format_version': resources_check.get_format_version()
                    }
                    res_format = resources_check.describe_format_if_needed()
                    if res_format is not None:
                        res_value['format_description'] = res_format
                    payload['resources'][
                        resources_check.RESOURCE_KEY] = res_value

            if has_resource:
                payload['resources']['meta'] = {
                    'api_key': self.agentConfig['api_key'],
                    'host': payload['internalHostname'],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()

                # Save them for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name,
                instance_statuses,
                metric_count,
                event_count,
                service_check_count,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name)

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                status = AgentCheck.UNKNOWN  # defensive default; status is expected to be OK or ERROR
            check.service_check('datadog.agent.check_status',
                                status,
                                tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name,
                                       None,
                                       None,
                                       None,
                                       None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(
            create_service_check('datadog.agent.up',
                                 AgentCheck.OK,
                                 hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        if self._should_send_additional_data('agent_checks'):
            # Add agent checks statuses and error/warning messages
            agent_checks = []
            for check in check_statuses:
                if check.instance_statuses is not None:
                    for instance_status in check.instance_statuses:
                        agent_checks.append((
                            check.name,
                            check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # put error message or list of warning messages in the same field
                            # it will be handled by the UI
                            instance_status.error or instance_status.warnings or ""))
                else:
                    agent_checks.append(
                        (check.name, check.source_type_name, "initialization",
                         check.status, repr(check.init_failed_error)))
            payload['agent_checks'] = agent_checks
            payload['meta'] = self.metadata_cache  # add hostname metadata
        collect_duration = timer.step()

        if self.os != 'windows':
            payload['metrics'].extend(
                self._agent_metrics.check(payload, self.agentConfig,
                                          collect_duration, self.emit_duration,
                                          time.clock() - cpu_clock))
        else:
            payload['metrics'].extend(
                self._agent_metrics.check(payload, self.agentConfig,
                                          collect_duration,
                                          self.emit_duration))

        emitter_statuses = self._emit(payload)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info(
                    "First flushes done, next flushes will be logged every %s flushes."
                    % FLUSH_LOGGING_PERIOD)

        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload
Example #6
    def run(self, checksd=None, start_event=True):
        """
        Collect data from each check and submit their data.
        """
        timer = Timer()
        if self.os != 'windows':
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks'] # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks'] # is of type {check_name: {error, traceback}}

            # Find the AgentMetrics check and pop it out
            # This check must run at the end of the loop to collect info on agent performance
            if not self._agent_metrics:
                for check in self.initialized_checks_d:
                    if check.name == AGENT_METRICS_CHECK_NAME:
                        self._agent_metrics = check
                        self.initialized_checks_d.remove(check)
                        break

        payload = self._build_payload(start_event=start_event)
        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if self.os == 'windows':
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['disk'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                payload.update({
                    'memPhysUsed' : memory.get('physUsed'),
                    'memPhysPctUsable' : memory.get('physPctUsable'),
                    'memPhysFree' : memory.get('physFree'),
                    'memPhysTotal' : memory.get('physTotal'),
                    'memPhysUsable' : memory.get('physUsable'),
                    'memSwapUsed' : memory.get('swapUsed'),
                    'memSwapFree' : memory.get('swapFree'),
                    'memSwapPctFree' : memory.get('swapPctFree'),
                    'memSwapTotal' : memory.get('swapTotal'),
                    'memCached' : memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared')
                })

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # Resources checks
        if self.os != 'windows':
            has_resource = False
            for resources_check in self._resources_checks:
                resources_check.check()
                snaps = resources_check.pop_snapshots()
                if snaps:
                    has_resource = True
                    res_value = {'snaps': snaps,
                                 'format_version': resources_check.get_format_version()}
                    res_format = resources_check.describe_format_if_needed()
                    if res_format is not None:
                        res_value['format_description'] = res_format
                    payload['resources'][resources_check.RESOURCE_KEY] = res_value

            if has_resource:
                payload['resources']['meta'] = {
                    'api_key': self.agentConfig['api_key'],
                    'host': payload['internalHostname'],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Save them for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                status = AgentCheck.UNKNOWN  # defensive default; status is expected to be OK or ERROR
            check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
            hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        if self._should_send_additional_data('agent_checks'):
            # Add agent checks statuses and error/warning messages
            agent_checks = []
            for check in check_statuses:
                if check.instance_statuses is not None:
                    for instance_status in check.instance_statuses:
                        agent_checks.append(
                            (
                                check.name, check.source_type_name,
                                instance_status.instance_id,
                                instance_status.status,
                                # put error message or list of warning messages in the same field
                                # it will be handled by the UI
                                instance_status.error or instance_status.warnings or ""
                            )
                        )
                else:
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            "initialization",
                            check.status, repr(check.init_failed_error)
                        )
                    )
            payload['agent_checks'] = agent_checks
            payload['meta'] = self.metadata_cache  # add hostname metadata
        collect_duration = timer.step()

        if self._agent_metrics is not None:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if self.os != 'windows':
                # time.clock() is process CPU time on Unix, so the delta approximates
                # the CPU consumed by this collection run (not meaningful on Windows).
                metric_context['cpu_time'] = time.clock() - cpu_clock
            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            # Dump the metrics to log when in developer mode
            if self.agentConfig.get('developer_mode', False):
                log.info("\n AGENT STATS: \n {0}".format(Collector._stats_for_display(agent_stats)))

        # Let's send our payload
        emitter_statuses = self._emit(payload)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses, self.metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                    (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)

        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                    (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload
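
The loop above is the collector half of the checks.d contract: check.run() executes check(instance) for every configured instance, after which the collector drains the metrics, events, and service checks and wraps the result in a CheckStatus. A minimal sketch of a check this loop would drive, written against the dd-agent checks.d API (the class name, metric name, and tags key are illustrative):

    from checks import AgentCheck

    class MyCheck(AgentCheck):
        def check(self, instance):
            # `instance` is one configuration block from the check's YAML file.
            self.gauge('mycheck.some_value', 42, tags=instance.get('tags'))
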
Exemplo n.º 7
0
    def run(self, payload):
        # Timer for the run duration
        log.info("Calling %s" % self.path)
        while self.continue_running:
            timer = Timer()
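            # Work on a fresh copy of the payload template so one run's metrics/events don't leak into the next.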
            payload_temp = deepcopy(payload)
            try:
                self._standard_parser(self.path)
                # stderr comes back as an empty string in test runs, whether the script succeeds or fails.
                # Keep the variable for now; handle it later if a need arises.
                stdout_data, stderr_data = self._get_value()
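                # Dispatch on a substring match: stdout lines containing 'metric' are parsed as metrics, 'event' as events.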
                for i in stdout_data:
                    if 'metric' in i:
                        self._parse_metric(i)
                    elif 'event' in i:
                        self._parse_events(i)
            except IOError as e:
                io_err_event = self._format_event(
                    str(e), "Reading script file failed", "error",
                    "path:%s" % self.path)
                self.events.append(io_err_event)
            except OSError as e:
                os_err_event = self._format_event(
                    str(e), "Executing script file failed", "error",
                    "path:%s" % self.path)
                self.events.append(os_err_event)
            except KeyError as e:
                key_err_event = self._format_event(
                    str(e),
                    "Output of script file missing or misspelled, or unsupported script file",
                    "error", "path:%s" % self.path)
                self.events.append(key_err_event)
            except Exception as e:
                common_err_event = self._format_event(
                    str(e), "Uncatergorized error when calling script file",
                    "error", "path:%s" % self.path)
                self.events.append(common_err_event)
            finally:
                payload_temp["metrics"].extend(self.aggregator.flush())
                payload_temp['events'][self.path] = self.events
                if not payload_temp["metrics"] and not payload_temp['events'][
                        self.path]:
                    no_out_event = self._format_event(
                        "There is no output when executing this script",
                        "Uncategorized error when calling script file",
                        "error", "path:%s" % self.path)
                    # Wrap in a list so the events payload keeps a consistent shape.
                    payload_temp['events'][self.path] = [no_out_event]
                self.events = []
            # log.info(payload_temp['metrics'])
            # log.info(payload_temp['events'])
            collect_duration = timer.step()
            payload_temp.emit(log, self.agent_config, self.emitters,
                              self.continue_running)
            emit_duration = timer.step()

            # Log the run status
            if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
                log.info(
                    "Script: %s. Finished run #%s. Collection time: %ss. Emit time: %ss"
                    % (self.path, self.run_count, round(
                        collect_duration, 2), round(emit_duration, 2)))
                if self.run_count == FLUSH_LOGGING_INITIAL:
                    log.info(
                        "Script: %s. First flushes done, next flushes will be logged every %s flushes."
                        % (self.path, FLUSH_LOGGING_PERIOD))
            else:
                log.debug(
                    "Script: %s. Finished run #%s. Collection time: %ss. Emit time: %ss"
                    % (self.path, self.run_count, round(
                        collect_duration, 2), round(emit_duration, 2)))
            time.sleep(self.interval)
Exemplo n.º 8
0
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd["initialized_checks"])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd["initialized_checks"]  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd["init_failed_checks"]  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload["metrics"]
        events = payload["events"]
        service_checks = payload["service_checks"]

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks["memory"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["cpu"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["network"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["io"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["proc"].check(self.agentConfig))
            except Exception:
                log.exception("Unable to fetch Windows system metrics.")
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks["load"].check(self.agentConfig)
            payload.update(load)

            system = sys_checks["system"].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks["memory"].check(self.agentConfig)

            if memory:
                memstats = {
                    "memPhysUsed": memory.get("physUsed"),
                    "memPhysPctUsable": memory.get("physPctUsable"),
                    "memPhysFree": memory.get("physFree"),
                    "memPhysTotal": memory.get("physTotal"),
                    "memPhysUsable": memory.get("physUsable"),
                    "memSwapUsed": memory.get("swapUsed"),
                    "memSwapFree": memory.get("swapFree"),
                    "memSwapPctFree": memory.get("swapPctFree"),
                    "memSwapTotal": memory.get("swapTotal"),
                    "memCached": memory.get("physCached"),
                    "memBuffers": memory.get("physBuffers"),
                    "memShared": memory.get("physShared"),
                }
                payload.update(memstats)

            ioStats = sys_checks["io"].check(self.agentConfig)
            if ioStats:
                payload["ioStats"] = ioStats

            processes = sys_checks["processes"].check(self.agentConfig)
            payload.update({"processes": processes})

            cpuStats = sys_checks["cpu"].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload["ganglia"] = gangliaData

        # dogstream
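        # Merge parsed dogstream events under the 'dogstream' key; remaining fields fold straight into the payload.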
        if dogstreamData:
            dogstreamEvents = dogstreamData.get("dogstreamEvents", None)
            if dogstreamEvents:
                if "dogstream" in payload["events"]:
                    events["dogstream"].extend(dogstreamEvents)
                else:
                    events["dogstream"] = dogstreamEvents
                del dogstreamData["dogstreamEvents"]

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload["datadog"] = ddforwarderData

        # Resources checks
        if not Platform.is_windows():
            has_resource = False
            for resources_check in self._resources_checks:
                try:
                    resources_check.check()
                    snaps = resources_check.pop_snapshots()
                    if snaps:
                        has_resource = True
                        res_value = {"snaps": snaps, "format_version": resources_check.get_format_version()}
                        res_format = resources_check.describe_format_if_needed()
                        if res_format is not None:
                            res_value["format_description"] = res_format
                        payload["resources"][resources_check.RESOURCE_KEY] = res_value
                except Exception:
                    log.exception("Error running resource check %s" % resources_check.RESOURCE_KEY)

            if has_resource:
                payload["resources"]["meta"] = {
                    "api_key": self.agentConfig["api_key"],
                    "host": payload["internalHostname"],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None
            current_check_metadata = None  # stays None if the check raises before metadata collection

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name,
                instance_statuses,
                metric_count,
                event_count,
                service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats,
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                # Guard so `status` is always bound (assuming UNKNOWN fits any other state).
                status = AgentCheck.UNKNOWN
            check.service_check("datadog.agent.check_status", status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = "datadog.agent.check_run_time"
                meta = {"tags": ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(
                check_name,
                None,
                None,
                None,
                None,
                init_failed_error=info["error"],
                init_failed_traceback=info["traceback"],
            )
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check("datadog.agent.up", AgentCheck.OK, hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload["metrics"] = metrics
        payload["events"] = events
        payload["service_checks"] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {"collection_time": collect_duration, "emit_time": self.emit_duration}
            if not Platform.is_windows():
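                # time.clock() is process CPU time on Unix but wall time on Windows, so only report it here.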
                metric_context["cpu_time"] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload["metrics"].extend(agent_stats)
            if self.agentConfig.get("developer_mode"):
                log.debug("\n Agent developer mode stats: \n {0}".format(Collector._stats_for_display(agent_stats)))

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters, self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses, self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info(
                "Finished run #%s. Collection time: %ss. Emit time: %ss"
                % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))
            )
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
        else:
            log.debug(
                "Finished run #%s. Collection time: %ss. Emit time: %ss"
                % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))
            )

        return payload
Exemplo n.º 9
0
    def run(self, checksd=None):
        """
        Collect data from each check and submit their data.
        """
        timer = Timer()
        self.run_count += 1
        logger.info("Starting collection run #%s" % self.run_count)

        payload = self._build_payload()
        metrics = payload['metrics']
        events = payload['events']

        # Run the system checks. Checks will depend on the OS
        if self.os == 'windows':
            # Win32 system checks
            metrics.extend(self._win32_system_checks['disk'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['memory'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(
                self.agentConfig))
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            diskUsage = sys_checks['disk'].check(self.agentConfig)
            if diskUsage and len(diskUsage) == 2:
                payload["diskUsage"] = diskUsage[0]
                payload["inodes"] = diskUsage[1]

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            memory = sys_checks['memory'].check(self.agentConfig)
            payload.update({
                'memPhysUsed': memory.get('physUsed'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared')
            })

            ioStats = sys_checks['io'].check(checks_logger, self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(checks_logger,
                                                      self.agentConfig)
            payload.update({'processes': processes})

            networkTraffic = sys_checks['network'].check(self.agentConfig)
            payload.update({'networkTraffic': networkTraffic})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        mysqlStatus = self._mysql.check(self.agentConfig)
        rabbitmq = self._rabbitmq.check(checks_logger, self.agentConfig)
        mongodb = self._mongodb.check(self.agentConfig)
        couchdb = self._couchdb.check(self.agentConfig)
        gangliaData = self._ganglia.check(self.agentConfig)
        cassandraData = self._cassandra.check(checks_logger, self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        if cassandraData is not False and cassandraData is not None:
            payload['cassandra'] = cassandraData

        # MySQL Status
        if mysqlStatus:
            payload.update(mysqlStatus)

        # RabbitMQ
        if rabbitmq:
            payload['rabbitMQ'] = rabbitmq

        # MongoDB
        if mongodb:
            if 'events' in mongodb:
                events['Mongo'] = mongodb['events']['Mongo']
                del mongodb['events']
            payload['mongoDB'] = mongodb

        # CouchDB
        if couchdb:
            payload['couchDB'] = couchdb

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # Process the event checks.
        for event_check in self._event_checks:
            event_data = event_check.check(checks_logger, self.agentConfig)
            if event_data:
                events[event_check.key] = event_data

        # Resources checks
        if self.os != 'windows':
            has_resource = False
            for resources_check in self._resources_checks:
                resources_check.check()
                snaps = resources_check.pop_snapshots()
                if snaps:
                    has_resource = True
                    res_value = {
                        'snaps': snaps,
                        'format_version': resources_check.get_format_version()
                    }
                    res_format = resources_check.describe_format_if_needed()
                    if res_format is not None:
                        res_value['format_description'] = res_format
                    payload['resources'][
                        resources_check.RESOURCE_KEY] = res_value

            if has_resource:
                payload['resources']['meta'] = {
                    'api_key': self.agentConfig['api_key'],
                    'host': payload['internalHostname'],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        checksd = checksd or []
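        # In this older API each entry is a dict: {'name': ..., 'class': AgentCheck instance, 'instances': [config, ...]}.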
        for check in checksd:
            check_cls = check['class']
            for instance in check['instances']:
                try:
                    # Run the check for each configuration
                    check_cls.check(instance)
                    metrics.extend(check_cls.get_metrics())
                    if check_cls.has_events():
                        if check['name'] not in events:
                            events[check['name']] = []
                        for ev in check_cls.get_events():
                            events[check['name']].append(ev)
                except Exception:
                    logger.exception("Check %s failed" % check_cls.name)

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        collect_duration = timer.step()

        # Pass the payload along to the emitters.
        for emitter in self.emitters:
            emitter(payload, checks_logger, self.agentConfig)
        emit_duration = timer.step()

        logger.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                    (self.run_count, round(collect_duration,
                                           2), round(emit_duration, 2)))
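
In this older variant the payload is handed to plain callables, each invoked as emitter(payload, logger, agentConfig), so any function with that signature can be registered in self.emitters. A minimal sketch of such an emitter (the function name and log message are illustrative, and the payload is assumed JSON-serializable):

    import json

    def logging_emitter(payload, logger, agentConfig):
        # Serialize the payload and record its size instead of shipping it anywhere.
        logger.info("Would emit %d bytes" % len(json.dumps(payload)))
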
Exemplo n.º 10
0
    def run(self, checksd=None, start_event=True):
        """
        Collect data from each check and submit their data.
        """
        timer = Timer()
        if self.os != 'windows':
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if self.os == 'windows':
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:

            sd_checks = self._server_density_checks

            identifier = sd_checks['identifier'].check(self.agentConfig)
            payload.update(identifier)

            # SDv1 plugins
            pluginsData = sd_checks['plugins'].check(self.agentConfig)
            if pluginsData:
                payload['plugins'] = pluginsData

            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)

            if memory:
                payload.update({
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared')
                })

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # Resources checks
        if self.os != 'windows':
            has_resource = False
            for resources_check in self._resources_checks:
                resources_check.check()
                snaps = resources_check.pop_snapshots()
                if snaps:
                    has_resource = True
                    res_value = {
                        'snaps': snaps,
                        'format_version': resources_check.get_format_version()
                    }
                    res_format = resources_check.describe_format_if_needed()
                    if res_format is not None:
                        res_value['format_description'] = res_format
                    payload['resources'][resources_check.RESOURCE_KEY] = res_value

            if has_resource:
                payload['resources']['meta'] = {
                    'agent_key': self.agentConfig['agent_key'],
                    'host': payload['internalHostname'],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None
            current_check_metadata = None  # stays None if the check raises before metadata collection

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)

            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count, service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            else:
                # Guard so `status` is always bound (assuming UNKNOWN fits any other state).
                status = AgentCheck.UNKNOWN
            check.service_check('sd.agent.check_status', status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'sd.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check('sd.agent.up', AgentCheck.OK,
                              hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics is not None:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if self.os != 'windows':
                # time.clock() is process CPU time on Unix, so the delta approximates
                # the CPU consumed by this collection run (not meaningful on Windows).
                metric_context['cpu_time'] = time.clock() - cpu_clock
            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            # Dump the metrics to log when in developer mode
            if self.agentConfig.get('developer_mode', False):
                log.info("\n AGENT STATS: \n {0}".format(Collector._stats_for_display(agent_stats)))

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." %
                         FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload
Exemplo n.º 11
0
    def run(self, checksd=None):
        """
        Collect data from each check and submit their data.
        """
        timer = Timer()
        self.run_count += 1
        logger.info("Starting collection run #%s" % self.run_count)

        payload = self._build_payload()
        metrics = payload['metrics']
        events = payload['events']

        # Run the system checks. Checks will depend on the OS
        if self.os == 'windows':
            # Win32 system checks
            metrics.extend(self._win32_system_checks['disk'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            diskUsage = sys_checks['disk'].check(self.agentConfig)
            if diskUsage and len(diskUsage) == 2:
                payload["diskUsage"] = diskUsage[0]
                payload["inodes"] = diskUsage[1]

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)
                
            memory = sys_checks['memory'].check(self.agentConfig)
            payload.update({
                'memPhysUsed': memory.get('physUsed'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared')
            })

            ioStats = sys_checks['io'].check(checks_logger, self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(checks_logger, self.agentConfig)
            payload.update({'processes': processes})

            networkTraffic = sys_checks['network'].check(self.agentConfig)
            payload.update({'networkTraffic': networkTraffic})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        mysqlStatus = self._mysql.check(self.agentConfig)
        rabbitmq = self._rabbitmq.check(checks_logger, self.agentConfig)
        mongodb = self._mongodb.check(self.agentConfig)
        couchdb = self._couchdb.check(self.agentConfig)
        gangliaData = self._ganglia.check(self.agentConfig)
        cassandraData = self._cassandra.check(checks_logger, self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData
           
        if cassandraData is not False and cassandraData is not None:
            payload['cassandra'] = cassandraData
            
        # MySQL Status
        if mysqlStatus:
            payload.update(mysqlStatus)
       
        # RabbitMQ
        if rabbitmq:
            payload['rabbitMQ'] = rabbitmq
        
        # MongoDB
        if mongodb:
            if 'events' in mongodb:
                events['Mongo'] = mongodb['events']['Mongo']
                del mongodb['events']
            payload['mongoDB'] = mongodb
            
        # CouchDB
        if couchdb:
            payload['couchDB'] = couchdb
        
        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']

            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData
 
        # Process the event checks. 
        for event_check in self._event_checks:
            event_data = event_check.check(checks_logger, self.agentConfig)
            if event_data:
                events[event_check.key] = event_data

        # Resources checks
        if self.os != 'windows':
            has_resource = False
            for resources_check in self._resources_checks:
                resources_check.check()
                snaps = resources_check.pop_snapshots()
                if snaps:
                    has_resource = True
                    res_value = {
                        'snaps': snaps,
                        'format_version': resources_check.get_format_version()
                    }
                    res_format = resources_check.describe_format_if_needed()
                    if res_format is not None:
                        res_value['format_description'] = res_format
                    payload['resources'][resources_check.RESOURCE_KEY] = res_value
     
            if has_resource:
                payload['resources']['meta'] = {
                    'api_key': self.agentConfig['api_key'],
                    'host': payload['internalHostname'],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        checksd = checksd or []
        for check in checksd:
            check_cls = check['class']
            for instance in check['instances']:
                try:
                    # Run the check for each configuration
                    check_cls.check(instance)
                    metrics.extend(check_cls.get_metrics())
                    if check_cls.has_events():
                        if check['name'] not in events:
                            events[check['name']] = []
                        for ev in check_cls.get_events():
                            events[check['name']].append(ev)
                except Exception:
                    logger.exception("Check %s failed" % check_cls.name)

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        collect_duration = timer.step()

        # Pass the payload along to the emitters.
        for emitter in self.emitters:
            emitter(payload, checks_logger, self.agentConfig)
        emit_duration = timer.step()

        logger.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                    (self.run_count, round(collect_duration, 2), round(emit_duration, 2)))