Example #1
    def _add_conntrack_stats_metrics(self, conntrack_path, use_sudo_conntrack,
                                     tags):
        """
        Parse the output of conntrack -S
        Add the parsed metrics
        """
        try:
            cmd = [conntrack_path, "-S"]
            if use_sudo_conntrack:
                cmd.insert(0, "sudo")
            output, _, _ = get_subprocess_output(cmd, self.log)
            # conntrack -S sample:
            # cpu=0 found=27644 invalid=19060 ignore=485633411 insert=0 insert_failed=1 \
            #       drop=1 early_drop=0 error=0 search_restart=39936711
            # cpu=1 found=21960 invalid=17288 ignore=475938848 insert=0 insert_failed=1 \
            #       drop=1 early_drop=0 error=0 search_restart=36983181

            lines = output.splitlines()

            for line in lines:
                cols = line.split()
                cpu_num = cols[0].split('=')[-1]
                cpu_tag = ['cpu:{}'.format(cpu_num)]
                cols = cols[1:]

                for cell in cols:
                    metric, value = cell.split('=')
                    self.monotonic_count(
                        'system.net.conntrack.{}'.format(metric),
                        int(value),
                        tags=tags + cpu_tag)
        except SubprocessOutputEmptyError:
            self.log.debug("Couldn't use %s to get conntrack stats",
                           conntrack_path)
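
Every example in this collection calls get_subprocess_output. A minimal sketch of the contract these snippets appear to rely on, modeled on the Datadog datadog_checks helper; the exact signature, defaults, and decoding behavior are assumptions, not a definitive reference. A few snippets pass a whole command string rather than a list; the real helper tolerates both, while this sketch expects a list.

# Sketch of the assumed subprocess helper: runs a command and returns
# (stdout, stderr, returncode), optionally raising when stdout is empty.
import subprocess


class SubprocessOutputEmptyError(Exception):
    """Raised when a command produces no stdout and raise_on_empty_output is True."""


def get_subprocess_output(command, log, raise_on_empty_output=True):
    """Run `command` (a list of arguments) and return (stdout, stderr, returncode) as text."""
    log.debug("Running command: %s", command)
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8")
    if raise_on_empty_output and not stdout:
        raise SubprocessOutputEmptyError("command produced no output: {}".format(command))
    return stdout, stderr, proc.returncode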
Example #2
    def check(self, instance):
        stat_out, err, _ = get_subprocess_output(self.nfs_cmd, self.log)
        all_devices = []
        this_device = []
        custom_tags = instance.get("tags", [])
        stats = stat_out.splitlines()

        if 'No NFS mount point' in stats[0]:
            if not self.autofs_enabled:
                self.warning("No NFS mount points were found.")
            else:
                self.log.debug("AutoFS enabled: no mount points currently.")
            return

        for l in stats:
            if not l:
                continue
            elif l.find('mounted on') >= 0 and len(this_device) > 0:
                # if it's a new device, create the device and add it to the array
                device = Device(this_device, self.log)
                all_devices.append(device)
                this_device = []
            this_device.append(l.strip().split())

        # Add the last device into the array
        device = Device(this_device, self.log)
        all_devices.append(device)

        # Disregard the first half of device stats (report 1 of 2)
        # as that is the moving average
        all_devices = all_devices[len(all_devices) // 2:]

        for device in all_devices:
            device.send_metrics(self.gauge, custom_tags)
Example #3
    def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
        use_sudo = _is_affirmative(instance.get('use_sudo', False))
        if use_sudo:
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo != 0:
                raise Exception('The dd-agent user does not have sudo access')
            ceph_args = 'sudo {}'.format(ceph_cmd)
        else:
            ceph_args = ceph_cmd

        ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)

        raw = {}
        for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats',
                    'osd perf', 'health detail'):
            try:
                args = '{} {} -fjson'.format(ceph_args, cmd)
                output, _, _ = get_subprocess_output(args.split(), self.log)
                res = json.loads(output)
            except Exception as e:
                self.log.warning('Unable to parse data from cmd=%s: %s', cmd,
                                 e)
                continue

            name = cmd.replace(' ', '_')
            raw[name] = res

        return raw
Example #4
    def _get_varnish_adm(self, version):
        cmd = []
        if geteuid() != 0:
            cmd.append('sudo')

        if version < LooseVersion('4.1.0'):
            cmd.extend(self.varnishadm_path +
                       ['-S', self.secretfile_path, 'debug.health'])
        else:
            cmd.extend(self.varnishadm_path + [
                '-T',
                '{}:{}'.format(self.daemon_host, self.daemon_port),
                '-S',
                self.secretfile_path,
                'backend.list',
                '-p',
            ])

        err, output = None, None
        try:
            output, err, _ = get_subprocess_output(cmd,
                                                   self.log,
                                                   raise_on_empty_output=False)
        except OSError as e:
            self.log.error(
                "There was an error running varnishadm. Make sure 'sudo' is available. %s",
                e)
            output = None
        if err or not output:
            self.log.error('Error getting service check from varnishadm: %s',
                           err)
        return output
Example #5
    def check(self, instance):
        if instance is None:
            instance = {}

        cmd = "ss --numeric --listening --tcp"
        output, _, _ = get_subprocess_output(["sh", "-c", cmd], self.log, raise_on_empty_output=True)

        # Run "ss --numeric --listening --tcp" command on host.
        # Expected output:
        # State             Recv-Q             Send-Q                          Local Address:Port                          Peer Address:Port
        # LISTEN            0                  128                                 127.0.0.1:6062                               0.0.0.0:*
        # LISTEN            0                  128                                   0.0.0.0:111                                0.0.0.0:*
        # LISTEN            0                  128                                   0.0.0.0:22                                 0.0.0.0:*
        # LISTEN            0                  100                                 127.0.0.1:25                                 0.0.0.0:*
        # LISTEN            0                  128                                 127.0.0.1:8126                               0.0.0.0:*
        # LISTEN            0                  128                                 127.0.0.1:5000                               0.0.0.0:*
        # LISTEN            0                  128                                 127.0.0.1:5001                               0.0.0.0:*
        # LISTEN            0                  80                                          *:3306                                     *:*
        # LISTEN            0                  128                                      [::]:111                                   [::]:*
        # LISTEN            0                  128                                      [::]:22                                    [::]:*

        lines = output.splitlines()

        # Parse the output into Datadog metrics
        for l in lines[1:]:
            cols = l.split()

            ip, port = cols[3].rsplit(':', 1)
            self.gauge("ss.listening.recvq", int(cols[1]), tags=["port:" + port, "type:tcp"])
            self.gauge("ss.listening.sendq", int(cols[2]), tags=["port:" + port, "type:tcp"])
Example #6
    def call_unbound_control(self, command, tags):
        try:
            # Pass raise_on_empty_output as False so we get a chance to log stderr
            ub_out, ub_err, returncode = get_subprocess_output(command, self.log, raise_on_empty_output=False)
        except Exception as e:
            self.service_check(
                self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message="exception collecting stats", tags=tags
            )
            raise Exception("Unable to get unbound stats: {}".format(str(e)))

        for line in ub_err.splitlines():
            self.log.debug('stderr from %s: %s', command, line)

        # Check the return value
        if returncode != 0:
            self.service_check(
                self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message="non-zero return code collecting stats", tags=tags
            )
            raise Exception('"{}" failed, return code: {}'.format(command, returncode))

        # And because we pass raise_on_empty_output as False, check that too
        if not ub_out:
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message="no stats", tags=tags)
            raise Exception('no output from "{}"'.format(command))

        return ub_out
Example #7
def which(program, use_sudo, log):
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    if use_sudo:
        # Pass raise_on_empty_output as False to leave it to the caller to handle
        # the not-found case.
        stdout, stderr, returncode = get_subprocess_output(['sudo', 'which', program], log, raise_on_empty_output=False)
        if returncode == 0:
            return stdout.strip()

        for line in stderr.splitlines():
            log.debug('stderr from sudo which %s: %s', program, line)

        return None

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None
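
A hypothetical call site for this helper; the program name and logger are illustrative only, not taken from the original check:

# Illustrative usage of which(); 'conntrack' is just an example program.
import logging

log = logging.getLogger("example")
path = which("conntrack", use_sudo=False, log=log)
if path is None:
    log.warning("conntrack not found on PATH")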
Example #8
    def _exec_ping(self, timeout, target_host):
        if platform.system() == "Windows":  # pragma: nocover
            countOption = "-n"
            timeoutOption = "-w"
            # The timeout option is in ms on Windows
            # https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/ping
            timeout = timeout * 1000
        elif platform.system() == "Darwin":
            countOption = "-c"
            timeoutOption = "-W"  # Also in ms on Mac
            timeout = timeout * 1000
        else:
            # The timeout option is in seconds on Linux, leaving timeout as is
            # https://linux.die.net/man/8/ping
            countOption = "-c"
            timeoutOption = "-W"

        self.log.debug("Running: ping %s %s %s %s %s", countOption, "1", timeoutOption, timeout, target_host)

        lines, err, retcode = get_subprocess_output(
            ["ping", countOption, "1", timeoutOption, str(timeout), target_host], self.log, raise_on_empty_output=True
        )
        self.log.debug("ping returned %s - %s - %s", retcode, lines, err)
        if retcode != 0:
            raise CheckException("ping returned {}: {}".format(retcode, err))

        return lines
Example #9
    def check(self, instance):
        files, err, retcode = get_subprocess_output(["exim", "-bpc"],
                                                    self.log,
                                                    raise_on_empty_output=True)
        queue_count = int(files.strip())
        self.gauge('exim.queued.messages.count',
                   queue_count,
                   tags=['TAG_KEY:TAG_VALUE'])
Example #10
    def _get_varnish_stats(self, varnishstat_format):
        cmd = self.varnishstat_path + [
            self.VARNISHSTAT_FORMAT_OPTION[varnishstat_format]
        ]
        for metric in self.metrics_filter:
            cmd.extend(["-f", metric])
        if self.name is not None:
            cmd.extend(['-n', self.name])

        output, _, _ = get_subprocess_output(cmd, self.log)
        return output
Example #11
    def pingable(self, host: str):
        """
        Returns True if host (str) responds to a ping request.
        """

        param = '-n' if platform.system().lower() == 'windows' else '-c'
        command = ['ping', param, '1', host]
        out, err, retcode = get_subprocess_output(command,
                                                  self.log,
                                                  raise_on_empty_output=False)
        return retcode == 0
Example #12
    def check(self, instance):

        file_info, err, retcode = get_subprocess_output(
            ["ls", "-al", "/var/log/nginx/error.log"],
            self.log,
            raise_on_empty_output=True)

        file_size = int(file_info.split()[4])

        self.gauge("kurian.nginx.error_log.size",
                   file_size,
                   tags=['component:nginx'])
Example #13
    def _get_queue_count(self, directory, queues, tags):
        for queue in queues:
            queue_path = os.path.join(directory, queue)
            if not os.path.exists(queue_path):
                raise Exception('{} does not exist'.format(queue_path))

            count = 0
            if os.geteuid() == 0:
                # dd-agent is running as root (not recommended)
                count = sum(
                    len(files) for root, dirs, files in os.walk(queue_path))
            else:
                # can dd-agent user run sudo?
                test_sudo = ['sudo', '-l']
                _, _, exit_code = get_subprocess_output(
                    test_sudo, self.log, False)
                if exit_code == 0:
                    # default to `root` for backward compatibility
                    postfix_user = self.init_config.get('postfix_user', 'root')
                    cmd = [
                        'sudo', '-u', postfix_user, 'find', queue_path,
                        '-type', 'f'
                    ]
                    output, _, _ = get_subprocess_output(cmd, self.log, False)
                    count = len(output.splitlines())
                else:
                    raise Exception(
                        'The dd-agent user does not have sudo access')

            # emit an individually tagged metric
            self.gauge(
                'postfix.queue.size',
                count,
                tags=tags + [
                    'queue:{}'.format(queue), 'instance:{}'.format(
                        os.path.basename(directory))
                ],
            )
Example #14
    def _collect_metadata(self):
        try:
            pc_output, _, _ = get_subprocess_output(['postconf', 'mail_version'], self.log, False)
        except Exception as e:
            self.log.warning('unable to call `postconf mail_version`: %s', e)
            return

        self.log.debug('postconf mail_version output: %s', pc_output)

        if pc_output:
            postfix_version = pc_output.strip('\n').split('=')[1].strip()
            self.log.debug('Postfix Version: %s', postfix_version)
            if postfix_version:
                self.set_metadata('version', postfix_version)
Example #15
    def _get_version_info(self):
        # Get the varnish version from varnishstat
        output, error, _ = get_subprocess_output(self.varnishstat_path +
                                                 ["-V"],
                                                 self.log,
                                                 raise_on_empty_output=False)

        # Assumptions regarding varnish's version
        varnishstat_format = "json"
        raw_version = None

        m1 = self.version_pattern.search(output)
        # v2 prints the version on stderr, v3 on stdout
        m2 = self.version_pattern.search(error)

        if m1 is None and m2 is None:
            self.log.warning(
                "Cannot determine the version of varnishstat, assuming 3 or greater"
            )
            self.warning(
                "Cannot determine the version of varnishstat, assuming 3 or greater"
            )
        else:
            if m1 is not None:
                raw_version = m1.group()
            elif m2 is not None:
                raw_version = m2.group()

        self.log.debug("Varnish version: %s", raw_version)

        if raw_version:
            self.set_metadata('version', raw_version)

        if raw_version is None:
            raw_version = '3.0.0'

        version = LooseVersion(raw_version)

        # Output format of varnishstat
        if version < LooseVersion('3.0.0'):
            varnishstat_format = "text"
        elif version < LooseVersion(
                '5.0.0'):  # we default to json starting version 5.0.0
            varnishstat_format = "xml"

        return version, varnishstat_format
Example #16
    def _get_devices_label_from_blkid(self):
        devices_label = {}
        try:
            blkid_out, _, _ = get_subprocess_output(['blkid'], self.log)
            all_devices = [l.split(':', 1) for l in blkid_out.splitlines()]

            for d in all_devices:
                # Line sample
                # /dev/sda1: LABEL="MYLABEL" UUID="5eea373d-db36-4ce2-8c71-12ce544e8559" TYPE="ext4"
                labels = self._blkid_label_re.findall(d[1])
                if labels:
                    devices_label[d[0]] = 'label:{}'.format(labels[0])

        except SubprocessOutputEmptyError:
            self.log.debug("Couldn't use blkid to have device labels")

        return devices_label
Example #17
    def _get_version(self):
        """ Get version from `gunicorn --version` """
        cmd = '{} --version'.format(self.gunicorn_cmd)
        try:
            pc_out, pc_err, _ = get_subprocess_output(cmd, self.log, False)
        except OSError:
            self.log.warning("Error collecting gunicorn version.")
            return None

        match = re.match(self.VERSION_PATTERN, pc_out)
        if not match:
            match = re.match(self.VERSION_PATTERN, pc_err)

        if match:
            return match.groups()[0]
        else:
            self.log.warning("Version not found in stdout `%s` and stderr `%s`", pc_out, pc_err)
        return None
Example #18
    def _get_version_from_command_line(self):
        version_command = '{} --version'.format(self._fluentd_command)

        try:
            out, _, _ = get_subprocess_output(version_command,
                                              self.log,
                                              raise_on_empty_output=False)
        except OSError as exc:
            self.log.debug("Error collecting fluentd version: %s", exc)
            return None

        match = re.match(self.VERSION_PATTERN, out)

        if match is None:
            self.log.debug("fluentd version not found in stdout: `%s`", out)
            return None

        return match.group('version')
Example #19
    def _get_sendmail_stats(self, sendmail_command, use_sudo):

        if not os.path.exists(sendmail_command):
            raise Exception('{} does not exist'.format(sendmail_command))

        self.log.debug(sendmail_command)

        # mailq sample output. sendmail output is similar.
        ##
        # MSP Queue status...
        # /var/spool/mqueue-client is empty
        #    Total requests: 0
        # MTA Queue status...
        # /var/spool/mqueue is empty
        #     Total requests: 0

        # if we want to use sendmail, we need to append -bp to it
        # https://www.electrictoolbox.com/show-sendmail-mail-queue/
        if "sendmail" in sendmail_command:
            command = [sendmail_command, '-bp']
        else:
            command = [sendmail_command]

        # Listing the directory might require sudo privileges
        if use_sudo:
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo != 0:
                raise Exception('The dd-agent user does not have sudo access')
            command.insert(0, 'sudo')

        self.log.debug(command)

        mail_queue, err, retcode = get_subprocess_output(
            command, self.log, False)
        self.log.debug("Error: %s", err)
        count = mail_queue.splitlines()
        # Retrieve the total number of requests from the last line
        queue_count = int(count[-1].split()[-1])
        self.log.info("Number of mails in the queue: %s", queue_count)

        return queue_count
Example #20
    def collect_metrics_manually(self):
        df_out, _, _ = get_subprocess_output(self.DF_COMMAND + ['-k'],
                                             self.log)
        self.log.debug(df_out)

        for device in self._list_devices(df_out):
            self.log.debug("Passed: {}".format(device))
            device_name = device[-1] if self._use_mount else device[0]

            tags = [device[1], 'filesystem:{}'.format(device[1])
                    ] if self._tag_by_filesystem else []
            tags.extend(self._custom_tags)

            # apply device/mountpoint specific tags
            for regex, device_tags in self._device_tag_re:
                if regex.match(device_name):
                    tags += device_tags
            tags.append('device:{}'.format(device_name))
            for metric_name, value in iteritems(
                    self._collect_metrics_manually(device)):
                self.gauge(metric_name, value, tags=tags)
Example #21
    def get_process_states(self):
        state_counts = defaultdict(int)
        prio_counts = defaultdict(int)
        ps = get_subprocess_output(['ps', '--no-header', '-eo', 'stat'], self.log)
        for line in ps[0].splitlines():
            # Each process state is a flag in a list of characters. See ps(1) for details.
            for state in line.strip():
                if state in PROCESS_STATES:
                    state_counts[PROCESS_STATES[state]] += 1
                elif state in PROCESS_PRIOS:
                    prio_counts[PROCESS_PRIOS[state]] += 1

        for state in state_counts:
            state_tags = list(self.tags)
            state_tags.append("state:" + state)
            self.gauge('system.processes.states', float(state_counts[state]), state_tags)

        for prio in prio_counts:
            prio_tags = list(self.tags)
            prio_tags.append("priority:" + prio)
            self.gauge('system.processes.priorities', float(prio_counts[prio]), prio_tags)
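
The check assumes PROCESS_STATES and PROCESS_PRIOS mappings defined elsewhere in the module. A plausible minimal version, with the flag codes taken from ps(1); the metric-facing names are assumptions, not the check's actual constants:

# Stand-ins for the mappings the check relies on. Flag codes follow ps(1).
PROCESS_STATES = {
    'D': 'uninterruptible',
    'R': 'runnable',
    'S': 'sleeping',
    'T': 'stopped',
    'Z': 'zombie',
}
PROCESS_PRIOS = {
    '<': 'high',
    'N': 'low',
}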
Example #22
    def _get_devices_label_from_lsblk(self):
        """
        Get device labels using the `lsblk` command. Returns a map of device name to label:value
        """
        devices_labels = dict()
        try:
            # Use raw output mode (space-separated fields encoded in UTF-8).
            # We want to be compatible with lsblk version 2.19 since
            # it is the last version supported by CentOS 6 and SUSE 11.
            lsblk_out, _, _ = get_subprocess_output(["lsblk", "--noheadings", "--raw", "--output=NAME,LABEL"], self.log)

            for line in lsblk_out.splitlines():
                device, _, label = line.partition(' ')
                if label:
                    # Line sample (device "/dev/sda1" with label " MY LABEL")
                    # sda1  MY LABEL
                    devices_labels["/dev/" + device] = ['label:{}'.format(label), 'device_label:{}'.format(label)]

        except SubprocessOutputEmptyError:
            self.log.debug("Couldn't use lsblk to have device labels")

        return devices_labels
Example #23
    def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
        use_sudo = _is_affirmative(instance.get('use_sudo', False))
        if use_sudo:
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo != 0:
                raise CheckException(
                    'The dd-agent user does not have sudo access')
            ceph_args = 'sudo {}'.format(ceph_cmd)
        else:
            ceph_args = ceph_cmd

        ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)

        raw = {}
        for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats',
                    'osd perf', 'health detail'):
            try:
                args = '{} {} -fjson'.format(ceph_args, cmd)
                output, _, _ = get_subprocess_output(args.split(), self.log)
                res = json.loads(output)
            except Exception as e:
                self.log.warning('Unable to parse data from cmd=%s: %s', cmd,
                                 e)
                continue

            name = cmd.replace(' ', '_')
            raw[name] = res

        mon_map = raw.get('status', {}).get('monmap')
        if mon_map is None:
            raise RuntimeError("Could not detect Ceph release series")
        if mon_map.get('min_mon_release_name') == 'octopus':
            self.log.debug("Detected octopus version of ceph...")
            self._octopus = True
        else:
            self._octopus = False

        return raw
Example #24
    def _check_linux(self, instance):
        """
        _check_linux can be run inside a container and still collects the network metrics from the host
        For that procfs_path can be set to something like "/host/proc"
        When a custom procfs_path is set, the collect_connection_state option is ignored
        """
        proc_location = self.agentConfig.get('procfs_path',
                                             '/proc').rstrip('/')
        custom_tags = instance.get('tags', [])

        net_proc_base_location = self._get_net_proc_base_location(
            proc_location)

        if self._is_collect_cx_state_runnable(net_proc_base_location):
            try:
                self.log.debug("Using `ss` to collect connection state")
                # Try using `ss` for increased performance over `netstat`
                metrics = self._get_metrics()
                for ip_version in ['4', '6']:
                    # Call `ss` for each IP version because there's no built-in way of distinguishing
                    # between the IP versions in the output
                    # Also calls `ss` for each protocol, because on some systems (e.g. Ubuntu 14.04), there is a
                    # bug that print `tcp` even if it's `udp`
                    # The `-H` flag isn't available on old versions of `ss`.
                    cmd = "ss --numeric --tcp --all --ipv{} | cut -d ' ' -f 1 | sort | uniq -c".format(
                        ip_version)
                    output, _, _ = get_subprocess_output(["sh", "-c", cmd],
                                                         self.log)

                    # 7624 CLOSE-WAIT
                    #   72 ESTAB
                    #    9 LISTEN
                    #    1 State
                    #   37 TIME-WAIT
                    lines = output.splitlines()

                    self._parse_short_state_lines(lines,
                                                  metrics,
                                                  self.tcp_states['ss'],
                                                  ip_version=ip_version)

                    cmd = "ss --numeric --udp --all --ipv{} | wc -l".format(
                        ip_version)
                    output, _, _ = get_subprocess_output(["sh", "-c", cmd],
                                                         self.log)
                    metric = self.cx_state_gauge[('udp{}'.format(ip_version),
                                                  'connections')]
                    metrics[metric] = int(output) - 1  # Remove header

                for metric, value in iteritems(metrics):
                    self.gauge(metric, value, tags=custom_tags)

            except OSError:
                self.log.info("`ss` not found: using `netstat` as a fallback")
                output, _, _ = get_subprocess_output(
                    ["netstat", "-n", "-u", "-t", "-a"], self.log)
                lines = output.splitlines()
                # Active Internet connections (w/o servers)
                # Proto Recv-Q Send-Q Local Address           Foreign Address         State
                # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
                # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
                # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
                # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
                # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
                # udp        0      0 0.0.0.0:123             0.0.0.0:*
                # udp6       0      0 :::41458                :::*

                metrics = self._parse_linux_cx_state(
                    lines[2:], self.tcp_states['netstat'], 5)
                for metric, value in iteritems(metrics):
                    self.gauge(metric, value, tags=custom_tags)
            except SubprocessOutputEmptyError:
                self.log.exception("Error collecting connection stats.")

        proc_dev_path = "{}/net/dev".format(net_proc_base_location)
        with open(proc_dev_path, 'r') as proc:
            lines = proc.readlines()
        # Inter-|   Receive                                                 |  Transmit
        #  face |bytes     packets errs drop fifo frame compressed multicast|bytes       packets errs drop fifo colls carrier compressed # noqa: E501
        #     lo:45890956   112797   0    0    0     0          0         0    45890956   112797    0    0    0     0       0          0 # noqa: E501
        #   eth0:631947052 1042233   0   19    0   184          0      1206  1208625538  1320529    0    0    0     0       0          0 # noqa: E501
        #   eth1:       0        0   0    0    0     0          0         0           0        0    0    0    0     0       0          0 # noqa: E501
        for l in lines[2:]:
            cols = l.split(':', 1)
            x = cols[1].split()
            # Filter inactive interfaces
            if self._parse_value(x[0]) or self._parse_value(x[8]):
                iface = cols[0].strip()
                metrics = {
                    'bytes_rcvd':
                    self._parse_value(x[0]),
                    'bytes_sent':
                    self._parse_value(x[8]),
                    'packets_in.count':
                    self._parse_value(x[1]),
                    'packets_in.error':
                    self._parse_value(x[2]) + self._parse_value(x[3]),
                    'packets_out.count':
                    self._parse_value(x[9]),
                    'packets_out.error':
                    self._parse_value(x[10]) + self._parse_value(x[11]),
                }
                self._submit_devicemetrics(iface, metrics, custom_tags)

        netstat_data = {}
        for f in ['netstat', 'snmp']:
            proc_data_path = "{}/net/{}".format(net_proc_base_location, f)
            try:
                with open(proc_data_path, 'r') as netstat:
                    while True:
                        n_header = netstat.readline()
                        if not n_header:
                            break  # No more? Abort!
                        n_data = netstat.readline()

                        h_parts = n_header.strip().split(' ')
                        h_values = n_data.strip().split(' ')
                        ns_category = h_parts[0][:-1]
                        netstat_data[ns_category] = {}
                        # Turn the data into a dictionary
                        for idx, hpart in enumerate(h_parts[1:]):
                            netstat_data[ns_category][hpart] = h_values[idx +
                                                                        1]
            except IOError:
                # On Openshift, /proc/net/snmp is only readable by root
                self.log.debug("Unable to read %s.", proc_data_path)

        nstat_metrics_names = {
            'Tcp': {
                'RetransSegs': 'system.net.tcp.retrans_segs',
                'InSegs': 'system.net.tcp.in_segs',
                'OutSegs': 'system.net.tcp.out_segs',
            },
            'TcpExt': {
                'ListenOverflows': 'system.net.tcp.listen_overflows',
                'ListenDrops': 'system.net.tcp.listen_drops',
                'TCPBacklogDrop': 'system.net.tcp.backlog_drops',
                'TCPRetransFail': 'system.net.tcp.failed_retransmits',
            },
            'Udp': {
                'InDatagrams': 'system.net.udp.in_datagrams',
                'NoPorts': 'system.net.udp.no_ports',
                'InErrors': 'system.net.udp.in_errors',
                'OutDatagrams': 'system.net.udp.out_datagrams',
                'RcvbufErrors': 'system.net.udp.rcv_buf_errors',
                'SndbufErrors': 'system.net.udp.snd_buf_errors',
                'InCsumErrors': 'system.net.udp.in_csum_errors',
            },
        }

        # Skip the first line, as it's junk
        for k in nstat_metrics_names:
            for met in nstat_metrics_names[k]:
                if met in netstat_data.get(k, {}):
                    self._submit_netmetric(nstat_metrics_names[k][met],
                                           self._parse_value(
                                               netstat_data[k][met]),
                                           tags=custom_tags)

        # Get the conntrack -S information
        conntrack_path = instance.get('conntrack_path')
        if conntrack_path is not None:
            self._add_conntrack_stats_metrics(conntrack_path, custom_tags)

        # Get the rest of the metric by reading the files. Metrics available since kernel 3.6
        conntrack_files_location = os.path.join(proc_location, 'sys', 'net',
                                                'netfilter')
        # By default, only max and count are reported. However, if the blacklist is set,
        # the whitelist loses its default value
        blacklisted_files = instance.get('blacklist_conntrack_metrics')
        whitelisted_files = instance.get('whitelist_conntrack_metrics')
        if blacklisted_files is None and whitelisted_files is None:
            whitelisted_files = ['max', 'count']

        available_files = []

        # Get the metrics to read
        try:
            for metric_file in os.listdir(conntrack_files_location):
                if (os.path.isfile(
                        os.path.join(conntrack_files_location, metric_file))
                        and 'nf_conntrack_' in metric_file):
                    available_files.append(metric_file[len('nf_conntrack_'):])
        except Exception as e:
            self.log.debug("Unable to list the files in %s. %s",
                           conntrack_files_location, e)

        filtered_available_files = pattern_filter(available_files,
                                                  whitelist=whitelisted_files,
                                                  blacklist=blacklisted_files)

        for metric_name in filtered_available_files:
            metric_file_location = os.path.join(
                conntrack_files_location,
                'nf_conntrack_{}'.format(metric_name))
            try:
                with open(metric_file_location, 'r') as conntrack_file:
                    # Checking it's an integer
                    try:
                        value = int(conntrack_file.read().rstrip())
                        self.gauge(
                            'system.net.conntrack.{}'.format(metric_name),
                            value,
                            tags=custom_tags)
                    except ValueError:
                        self.log.debug("%s is not an integer", metric_name)
            except IOError as e:
                self.log.debug("Unable to read %s, skipping: %s",
                               metric_file_location, e)
Example #25
    def _check_bsd(self, instance):
        netstat_flags = ['-i', '-b']

        custom_tags = instance.get('tags', [])

        # FreeBSD's netstat truncates device names unless you pass '-W'
        if Platform.is_freebsd():
            netstat_flags.append('-W')

        try:
            output, _, _ = get_subprocess_output(["netstat"] + netstat_flags,
                                                 self.log)
            lines = output.splitlines()
            # Name  Mtu   Network       Address            Ipkts Ierrs     Ibytes    Opkts Oerrs     Obytes  Coll
            # lo0   16384 <Link#1>                        318258     0  428252203   318258     0  428252203     0
            # lo0   16384 localhost   fe80:1::1           318258     -  428252203   318258     -  428252203     -
            # lo0   16384 127           localhost         318258     -  428252203   318258     -  428252203     -
            # lo0   16384 localhost   ::1                 318258     -  428252203   318258     -  428252203     -
            # gif0* 1280  <Link#2>                             0     0          0        0     0          0     0
            # stf0* 1280  <Link#3>                             0     0          0        0     0          0     0
            # en0   1500  <Link#4>    04:0c:ce:db:4e:fa 20801309     0 13835457425 15149389     0 11508790198     0
            # en0   1500  seneca.loca fe80:4::60c:ceff: 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  192.168.1     192.168.1.63    20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # p2p0  2304  <Link#5>    06:0c:ce:db:4e:fa        0     0          0        0     0          0     0
            # ham0  1404  <Link#6>    7a:79:05:4d:bf:f5    30100     0    6815204    18742     0    8494811     0
            # ham0  1404  5             5.77.191.245       30100     -    6815204    18742     -    8494811     -
            # ham0  1404  seneca.loca fe80:6::7879:5ff:    30100     -    6815204    18742     -    8494811     -
            # ham0  1404  2620:9b::54 2620:9b::54d:bff5    30100     -    6815204    18742     -    8494811     -

            headers = lines[0].split()

            # Given the irregular structure of the table above, better to parse from the end of each line
            # Verify headers first
            #          -7       -6       -5        -4       -3       -2        -1
            for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes",
                      "Coll"):
                if h not in headers:
                    self.log.error("%s not found in %s; cannot parse" %
                                   (h, headers))
                    return False

            current = None
            for l in lines[1:]:
                # Another header row, abort now, this is IPv6 land
                if "Name" in l:
                    break

                x = l.split()
                if len(x) == 0:
                    break

                iface = x[0]
                if iface.endswith("*"):
                    iface = iface[:-1]
                if iface == current:
                    # skip multiple lines of same interface
                    continue
                else:
                    current = iface

                # Filter inactive interfaces
                if self._parse_value(x[-5]) or self._parse_value(x[-2]):
                    iface = current
                    metrics = {
                        'bytes_rcvd': self._parse_value(x[-5]),
                        'bytes_sent': self._parse_value(x[-2]),
                        'packets_in.count': self._parse_value(x[-7]),
                        'packets_in.error': self._parse_value(x[-6]),
                        'packets_out.count': self._parse_value(x[-4]),
                        'packets_out.error': self._parse_value(x[-3]),
                    }
                    self._submit_devicemetrics(iface, metrics, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting connection stats.")

        try:
            netstat, _, _ = get_subprocess_output(
                ["netstat", "-s", "-p", "tcp"], self.log)
            # 3651535 packets sent
            #         972097 data packets (615753248 bytes)
            #         5009 data packets (2832232 bytes) retransmitted
            #         0 resends initiated by MTU discovery
            #         2086952 ack-only packets (471 delayed)
            #         0 URG only packets
            #         0 window probe packets
            #         310851 window update packets
            #         336829 control packets
            #         0 data packets sent after flow control
            #         3058232 checksummed in software
            #         3058232 segments (571218834 bytes) over IPv4
            #         0 segments (0 bytes) over IPv6
            # 4807551 packets received
            #         1143534 acks (for 616095538 bytes)
            #         165400 duplicate acks
            #         ...

            self._submit_regexed_values(netstat, BSD_TCP_METRICS, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting TCP stats.")
Example #26
    def check(self, instance):
        # Bail out if varnishstat isn't configured.
        if instance.get("varnishstat", None) is None:
            raise Exception("varnishstat is not configured")
        custom_tags = instance.get('tags', [])
        if custom_tags is None:
            custom_tags = []
        else:
            custom_tags = list(set(custom_tags))
        # Split the varnishstat command so that additional arguments can be passed in
        # In order to support monitoring a Varnish instance which is running as a Docker
        # container we need to wrap commands (varnishstat, varnishadm) with scripts which
        # perform a docker exec on the running container. This works fine when running a
        # single container on the host but breaks down when attempting to use the auto
        # discovery feature. This change allows for passing in additional parameters to
        # the script (i.e. %%host%%) so that the command is properly formatted and the
        # desired container is queried.
        varnishstat_path = instance.get('varnishstat', '').split()
        name = instance.get('name')
        metrics_filter = instance.get("metrics_filter", [])
        if not isinstance(metrics_filter, list):
            raise Exception("The parameter 'metrics_filter' must be a list")

        # Get version and version-specific args from varnishstat -V.
        version, varnishstat_format = self._get_version_info(varnishstat_path)

        cmd = varnishstat_path + [self.VARNISHSTAT_FORMAT_OPTION[varnishstat_format]]
        for metric in metrics_filter:
            cmd.extend(["-f", metric])

        if name is not None:
            cmd.extend(['-n', name])
            tags = custom_tags + [u'varnish_name:%s' % name]
        else:
            tags = custom_tags + [u'varnish_name:default']

        output, _, _ = get_subprocess_output(cmd, self.log)

        self._parse_varnishstat(output, varnishstat_format, tags)

        # Parse service checks from varnishadm.
        if instance.get("varnishadm", None):
            # Split the varnishadm command so that additional arguments can be passed in
            # In order to support monitoring a Varnish instance which is running as a Docker
            # container we need to wrap commands (varnishstat, varnishadm) with scripts which
            # perform a docker exec on the running container. This works fine when running a
            # single container on the host but breaks down when attempting to use the auto
            # discovery feature. This change allows for passing in additional parameters to
            # the script (i.e. %%host%%) so that the command is properly formatted and the
            # desired container is queried.
            varnishadm_path = instance.get('varnishadm', '').split()
            secretfile_path = instance.get('secretfile', '/etc/varnish/secret')

            daemon_host = instance.get('daemon_host', 'localhost')
            daemon_port = instance.get('daemon_port', '6082')

            cmd = []
            if geteuid() != 0:
                cmd.append('sudo')

            if version < LooseVersion('4.1.0'):
                cmd.extend(varnishadm_path + ['-S', secretfile_path, 'debug.health'])
            else:
                cmd.extend(
                    varnishadm_path
                    + ['-T', '{}:{}'.format(daemon_host, daemon_port), '-S', secretfile_path, 'backend.list', '-p']
                )

            try:
                output, err, _ = get_subprocess_output(cmd, self.log)
            except OSError as e:
                self.log.error("There was an error running varnishadm. Make sure 'sudo' is available. %s", e)
                output = None
            if err:
                self.log.error('Error getting service check from varnishadm: %s', err)

            if output:
                self._parse_varnishadm(output, custom_tags)
Example #27
    def _check(self, instance):
        if not self.binary:
            raise BinaryUnavailable("Cannot find executable: {}".format(self.expected_bin))

        ip_address = self._get_instance_addr(instance)
        metrics = instance.get('metrics', [])
        community_string = instance.get('community_string', 'public')
        timeout = int(instance.get('timeout', self.DEFAULT_TIMEOUT))
        retries = int(instance.get('retries', self.DEFAULT_RETRIES))

        hostname = instance.get('metric_host', None)

        # Build up our dataset
        data = defaultdict(dict)
        types = {}
        for metric in metrics:
            mib = metric['MIB']
            table = metric['table']
            cmd = [self.binary, '-c{}'.format(community_string), '-v2c', '-t', str(timeout), '-r', str(retries)]
            if self.mib_dirs:
                cmd.extend(['-M', self.mib_dirs])
            cmd.extend([ip_address, '{}:{}'.format(mib, table)])

            try:
                output = get_subprocess_output(cmd, self.log)[0]
            except Exception as e:
                error = "Fail to collect metrics for {0} - {1}".format(instance['name'], e)
                self.log.warning(error)
                return [(self.SC_NAME, Status.CRITICAL, error)]

            for line in output.splitlines():
                if not line:
                    continue
                match = self.output_re.match(line)
                if match is not None:
                    symbol = match.group('symbol')
                    index = int(match.group('index'))
                    value = match.group('value')
                    typ = match.group('type')
                    types[symbol] = typ
                    if typ == 'INTEGER':
                        try:
                            value = int(value)
                        except ValueError:
                            pass
                    elif value == '':
                        value = None
                    data[symbol][index] = value
                else:
                    # TODO: remove this
                    self.log.warning('Problem parsing output of snmp walk: %s', line)

        # Get any base configured tags and add our primary tag
        tags = instance.get('tags', []) + ['snmp_device:{}'.format(ip_address)]

        # It seems kind of weird, but from what I can tell the snmp check allows
        # you to add symbols to a metric that were retrieved by another metric,
        # both for values and tags. So you can add a symbol in the 1st metric
        # that pulls data from the 2nd. Same applies to tag lookups. Seems like
        # symbols should have been at the instance level rather than
        # per-metric... That way the behavior would match up with the schema, but oh
        # well.

        # Time to emit metrics
        for metric in metrics:

            # Build a list of dynamic tags per-index
            dynamic_tags = defaultdict(list)
            for metric_tag in metric.get('metric_tags', []):
                if 'column' in metric_tag:
                    tag = metric_tag['tag']
                    column = metric_tag['column']
                    regex = metric_tag.get('regex', None)
                    if regex is not None:
                        # pre-compile our regex
                        regex = re.compile(regex)
                    for i, v in data[column].items():
                        if v is None:
                            # No value for the column, ignore
                            continue
                        elif types[column] == 'INTEGER':
                            # enum/bool etc, use the human readable name
                            v = v.split('(')[0]

                        if regex is not None:
                            # There's a regex for this tag
                            match = regex.match(v)
                            if match is not None:
                                # It matches so we'll apply it, group(1) becomes
                                # the value
                                v = match.group(1)
                                dynamic_tags[i].append('{}:{}'.format(tag, v))
                                additional_tags = metric_tag.get('additional_tags', [])
                                # and we add any additional tags
                                dynamic_tags[i].extend(additional_tags)
                        else:
                            # This is a standard tag, just use the value
                            dynamic_tags[i].append('{}:{}'.format(tag, v))
                else:
                    self.log.debug('unsupported metric_tag: %s', metric_tag)
                    continue

            symbols = metric.get('symbols', [])
            # For each of the symbols we'll be recording as a metric
            for symbol in symbols:
                # For each value for that symbol
                for i, value in data[symbol].items():
                    if value is None:
                        # skip empty
                        continue
                    # metric key
                    key = '{}.{}'.format(SOURCE_TYPE_NAME, symbol)
                    value = int(value)

                    typ = types[symbol]
                    if typ in self.COUNTER_TYPES:
                        self.rate(key, value, tags + dynamic_tags[i], hostname=hostname)
                    elif typ in self.GAUGE_TYPES:
                        self.gauge(key, value, tags + dynamic_tags[i], hostname=hostname)
                    else:
                        raise Exception('unsupported metric symbol type: {}'.format(typ))

        return [(self.SC_NAME, Status.UP, None)]
Example #28
    def _get_lighthouse_report(command, logger, raise_on_empty=False):
        json, err_msg, exit_code = get_subprocess_output(command, logger, raise_on_empty_output=raise_on_empty)
        return json, err_msg, exit_code
Example #29
    def _check_bsd(self, instance):
        netstat_flags = ['-i', '-b']

        custom_tags = instance.get('tags', [])

        # FreeBSD's netstat truncates device names unless you pass '-W'
        if Platform.is_freebsd():
            netstat_flags.append('-W')

        try:
            output, _, _ = get_subprocess_output(["netstat"] + netstat_flags,
                                                 self.log)
            lines = output.splitlines()
            # Name  Mtu   Network       Address            Ipkts Ierrs     Ibytes    Opkts Oerrs     Obytes  Coll
            # lo0   16384 <Link#1>                        318258     0  428252203   318258     0  428252203     0
            # lo0   16384 localhost   fe80:1::1           318258     -  428252203   318258     -  428252203     -
            # lo0   16384 127           localhost         318258     -  428252203   318258     -  428252203     -
            # lo0   16384 localhost   ::1                 318258     -  428252203   318258     -  428252203     -
            # gif0* 1280  <Link#2>                             0     0          0        0     0          0     0
            # stf0* 1280  <Link#3>                             0     0          0        0     0          0     0
            # en0   1500  <Link#4>    04:0c:ce:db:4e:fa 20801309     0 13835457425 15149389     0 11508790198     0
            # en0   1500  seneca.loca fe80:4::60c:ceff: 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  192.168.1     192.168.1.63    20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # p2p0  2304  <Link#5>    06:0c:ce:db:4e:fa        0     0          0        0     0          0     0
            # ham0  1404  <Link#6>    7a:79:05:4d:bf:f5    30100     0    6815204    18742     0    8494811     0
            # ham0  1404  5             5.77.191.245       30100     -    6815204    18742     -    8494811     -
            # ham0  1404  seneca.loca fe80:6::7879:5ff:    30100     -    6815204    18742     -    8494811     -
            # ham0  1404  2620:9b::54 2620:9b::54d:bff5    30100     -    6815204    18742     -    8494811     -

            headers = lines[0].split()

            # Given the irregular structure of the table above, better to parse from the end of each line
            # Verify headers first
            #          -7       -6       -5        -4       -3       -2        -1
            for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes",
                      "Coll"):
                if h not in headers:
                    self.log.error("%s not found in %s; cannot parse", h,
                                   headers)
                    return False

            current = None
            for l in lines[1:]:
                # Another header row, abort now, this is IPv6 land
                if "Name" in l:
                    break

                x = l.split()
                if len(x) == 0:
                    break

                iface = x[0]
                if iface.endswith("*"):
                    iface = iface[:-1]
                if iface == current:
                    # skip multiple lines of same interface
                    continue
                else:
                    current = iface

                # Filter inactive interfaces
                if self._parse_value(x[-5]) or self._parse_value(x[-2]):
                    iface = current
                    metrics = {
                        'bytes_rcvd': self._parse_value(x[-5]),
                        'bytes_sent': self._parse_value(x[-2]),
                        'packets_in.count': self._parse_value(x[-7]),
                        'packets_in.error': self._parse_value(x[-6]),
                        'packets_out.count': self._parse_value(x[-4]),
                        'packets_out.error': self._parse_value(x[-3]),
                    }
                    self._submit_devicemetrics(iface, metrics, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting connection stats.")

        try:
            netstat, _, _ = get_subprocess_output(
                ["netstat", "-s", "-p", "tcp"], self.log)
            # 3651535 packets sent
            #         972097 data packets (615753248 bytes)
            #         5009 data packets (2832232 bytes) retransmitted
            #         0 resends initiated by MTU discovery
            #         2086952 ack-only packets (471 delayed)
            #         0 URG only packets
            #         0 window probe packets
            #         310851 window update packets
            #         336829 control packets
            #         0 data packets sent after flow control
            #         3058232 checksummed in software
            #         3058232 segments (571218834 bytes) over IPv4
            #         0 segments (0 bytes) over IPv6
            # 4807551 packets received
            #         1143534 acks (for 616095538 bytes)
            #         165400 duplicate acks
            #         ...

            self._submit_regexed_values(netstat, BSD_TCP_METRICS, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting TCP stats.")

        proc_location = self.agentConfig.get('procfs_path',
                                             '/proc').rstrip('/')

        net_proc_base_location = self._get_net_proc_base_location(
            proc_location)

        if self._is_collect_cx_state_runnable(net_proc_base_location):
            try:
                self.log.debug("Using `netstat` to collect connection state")
                output_TCP, _, _ = get_subprocess_output(
                    ["netstat", "-n", "-a", "-p", "tcp"], self.log)
                output_UDP, _, _ = get_subprocess_output(
                    ["netstat", "-n", "-a", "-p", "udp"], self.log)
                lines = output_TCP.splitlines() + output_UDP.splitlines()
                # Active Internet connections (w/o servers)
                # Proto Recv-Q Send-Q Local Address           Foreign Address         State
                # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
                # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
                # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
                # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
                # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
                # udp        0      0 0.0.0.0:123             0.0.0.0:*
                # udp6       0      0 :::41458                :::*

                metrics = self._parse_linux_cx_state(
                    lines[2:], self.tcp_states['netstat'], 5)
                for metric, value in iteritems(metrics):
                    self.gauge(metric, value, tags=custom_tags)
            except SubprocessOutputEmptyError:
                self.log.exception("Error collecting connection states.")
Example #30
    def check(self, instance):
        # Allow to specify a complete command for nodetool such as `docker exec container nodetool`
        nodetool_cmd = instance.get("nodetool", self.nodetool_cmd).split()
        host = instance.get("host", DEFAULT_HOST)
        port = instance.get("port", DEFAULT_PORT)
        keyspaces = instance.get("keyspaces", [])
        username = instance.get("username", "")
        password = instance.get("password", "")
        ssl = instance.get("ssl", False)
        tags = instance.get("tags", [])

        # Flag to send service checks only once and not for every keyspace
        send_service_checks = True

        if not keyspaces:
            self.log.info(
                "No keyspaces set in the configuration: no metrics will be sent"
            )

        for keyspace in keyspaces:
            # Build the nodetool command
            cmd = nodetool_cmd + ['-h', host, '-p', str(port)]
            if username and password:
                cmd += ['-u', username, '-pw', password]
            # add ssl if requested
            if ssl:
                cmd += ['--ssl']
            cmd += ['status', '--', keyspace]

            # Execute the command
            out, err, code = get_subprocess_output(cmd,
                                                   self.log,
                                                   False,
                                                   log_debug=False)
            if err or 'Error:' in out or code != 0:
                self.log.error('Error executing nodetool status: %s', err
                               or out)
                continue
            nodes = self._process_nodetool_output(out)

            percent_up_by_dc = defaultdict(float)
            percent_total_by_dc = defaultdict(float)
            # Send the stats per node and compute the stats per datacenter
            for node in nodes:

                node_tags = [
                    'node_address:%s' % node['address'],
                    'node_id:%s' % node['id'],
                    'datacenter:%s' % node['datacenter'],
                    'rack:%s' % node['rack'],
                ]

                # nodetool prints `?` when it can't compute the value of `owns` for certain keyspaces (e.g. system)
                # don't send metric in this case
                if node['owns'] != '?':
                    owns = float(node['owns'])
                    if node['status'] == 'U':
                        percent_up_by_dc[node['datacenter']] += owns
                    percent_total_by_dc[node['datacenter']] += owns
                    self.gauge('cassandra.nodetool.status.owns',
                               owns,
                               tags=tags + node_tags +
                               ['keyspace:%s' % keyspace])

                # Send service check only once for each node
                if send_service_checks:
                    status = AgentCheck.OK if node[
                        'status'] == 'U' else AgentCheck.CRITICAL
                    self.service_check('cassandra.nodetool.node_up', status,
                                       tags + node_tags)

                self.gauge('cassandra.nodetool.status.status',
                           1 if node['status'] == 'U' else 0,
                           tags=tags + node_tags)
                self.gauge(
                    'cassandra.nodetool.status.load',
                    float(node['load']) * TO_BYTES[node['load_unit']],
                    tags=tags + node_tags,
                )

            # All service checks have been sent, don't resend
            send_service_checks = False

            # Send the stats per datacenter
            for datacenter, percent_up in percent_up_by_dc.items():
                self.gauge(
                    'cassandra.nodetool.status.replication_availability',
                    percent_up,
                    tags=tags +
                    ['keyspace:%s' % keyspace,
                     'datacenter:%s' % datacenter],
                )
            for datacenter, percent_total in percent_total_by_dc.items():
                self.gauge(
                    'cassandra.nodetool.status.replication_factor',
                    int(round(percent_total / 100)),
                    tags=tags +
                    ['keyspace:%s' % keyspace,
                     'datacenter:%s' % datacenter],
                )