Example #1
    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # "ad" stands for access denied
        # We cache the PIDs that get this error and don't iterate on them
        # more often than `access_denied_cache_duration`
        # This cache covers all PIDs so it's global, but it is refreshed per instance
        self.last_ad_cache_ts = {}
        self.ad_cache = set()
        self.access_denied_cache_duration = int(
            init_config.get('access_denied_cache_duration',
                            DEFAULT_AD_CACHE_DURATION))

        # By default, cache the PID list for a while
        # Sometimes that's unwanted because it can interfere with no-data monitoring
        # This cache is indexed per instance
        self.last_pid_cache_ts = {}
        self.pid_cache = {}
        self.pid_cache_duration = int(
            init_config.get('pid_cache_duration', DEFAULT_PID_CACHE_DURATION))

        self._conflicting_procfs = False
        self._deprecated_init_procfs = False
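        # `procfs_path` may be set both in init_config and in the agent config;
        # flag a conflict if they disagree, otherwise honor the (deprecated)
        # init_config setting by pointing psutil at it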
        if Platform.is_linux():
            procfs_path = init_config.get('procfs_path')
            if procfs_path:
                if 'procfs_path' in agentConfig and procfs_path != agentConfig.get(
                        'procfs_path').rstrip('/'):
                    self._conflicting_procfs = True
                else:
                    self._deprecated_init_procfs = True
                    psutil.PROCFS_PATH = procfs_path

        # Process cache, indexed by instance
        self.process_cache = defaultdict(dict)
Example #2
    def check(self, instance):
        if instance is None:
            instance = {}

        self._excluded_ifaces = instance.get('excluded_interfaces', [])
        self._collect_cx_state = instance.get(
            'collect_connection_state', False)
        self._collect_rate_metrics = instance.get(
            'collect_rate_metrics', True)
        self._collect_count_metrics = instance.get(
            'collect_count_metrics', False)

        # This decides whether we should split or combine connection states,
        # along with a few other things
        self._setup_metrics(instance)

        self._exclude_iface_re = None
        exclude_re = instance.get('excluded_interface_re', None)
        if exclude_re:
            self.log.debug("Excluding network devices matching: %s" % exclude_re)
            self._exclude_iface_re = re.compile(exclude_re)

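        # dispatch to the platform-specific collector; on Windows we fall
        # back to the psutil-based implementation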
        if Platform.is_linux():
            self._check_linux(instance)
        elif Platform.is_bsd():
            self._check_bsd(instance)
        elif Platform.is_solaris():
            self._check_solaris(instance)
        elif Platform.is_windows():
            self._check_psutil(instance)
Example #3
def spin_up_haproxy():
    env = os.environ
    env['HAPROXY_CONFIG_DIR'] = os.path.join(common.HERE, 'compose')
    env['HAPROXY_CONFIG'] = os.path.join(common.HERE, 'compose', 'haproxy.cfg')
    env['HAPROXY_CONFIG_OPEN'] = os.path.join(common.HERE, 'compose',
                                              'haproxy-open.cfg')
    env['HAPROXY_SOCKET_DIR'] = common.UNIXSOCKET_DIR
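    # these variables are referenced by the compose file used below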
    if Platform.is_linux() and not os.path.exists(common.UNIXSOCKET_DIR):
        # make the temp directory on linux
        os.makedirs(common.UNIXSOCKET_DIR)
    args = [
        "docker-compose", "-f",
        os.path.join(common.HERE, 'compose', 'haproxy.yaml')
    ]
    subprocess.check_call(args + ["down"], env=env)
    subprocess.check_call(args + ["up", "-d"], env=env)
    wait_for_haproxy()
    # subprocess.check_call(["ls", "-al", "/tmp/"], env=env)
    # subprocess.check_call(["ls", "-al", "/tmp/haproxy"], env=env)
    try:
        if Platform.is_linux():
            # on linux the check needs access to the socket, so fix its ownership
            chown_args = []
            user = getpass.getuser()
            if user != 'root':
                chown_args += ['sudo']
            chown_args += ["chown", user, common.UNIXSOCKET_PATH]
            subprocess.check_call(chown_args, env=env)
    except subprocess.CalledProcessError:
        # a chown failure here is not necessarily fatal
        pass
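    # give the HAProxy containers extra time to settle before the tests run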
    time.sleep(20)
    yield
    subprocess.check_call(args + ["down"], env=env)
    if Platform.is_linux():
        # clean up the temp directory on linux
        try:
            os.removedirs(common.UNIXSOCKET_DIR)
        except OSError:
            pass
Example #4
def test_unixsocket_config(aggregator, spin_up_haproxy):
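    # the Unix-socket configuration is only exercised on Linux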
    if not Platform.is_linux():
        return

    haproxy_check = HAProxy(common.CHECK_NAME, {}, {})
    haproxy_check.check(common.CONFIG_UNIXSOCKET)

    shared_tag = ["instance_url:{0}".format(common.UNIXSOCKET_URL)]

    _test_frontend_metrics(aggregator, shared_tag)
    _test_backend_metrics(aggregator, shared_tag)
    _test_service_checks(aggregator)

    aggregator.assert_all_metrics_covered()
Example #5
    def get_pagefault_stats(self, pid):
        if not Platform.is_linux():
            return None

        def file_to_string(path):
            with open(path, 'r') as f:
                res = f.read()
            return res

        # http://man7.org/linux/man-pages/man5/proc.5.html
        try:
            data = file_to_string('{}/{}/stat'.format(psutil.PROCFS_PATH, pid))
        except Exception:
            self.log.debug('error getting proc stats: file_to_string failed for /%s/%s/stat', psutil.PROCFS_PATH, pid)
            return None
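        # fields 10-13 of /proc/<pid>/stat: minflt, cminflt, majflt, cmajflt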
        return (int(i) for i in data.split()[9:13])
Example #6
def test_relocated_procfs(aggregator):
    from datadog_checks.utils.platform import Platform
    import tempfile
    import shutil
    import uuid

    already_linux = Platform.is_linux()
    unique_process_name = str(uuid.uuid4())
    my_procfs = tempfile.mkdtemp()

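    # recursively materialize a dict as a fake procfs tree of files and directories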
    def _fake_procfs(arg, root=my_procfs):
        for key, val in arg.iteritems():
            path = os.path.join(root, key)
            if isinstance(val, dict):
                os.mkdir(path)
                _fake_procfs(val, path)
            else:
                with open(path, "w") as f:
                    f.write(str(val))

    _fake_procfs({
        '1': {
            'status': ("Name:\t{}\nThreads:\t1\n").format(unique_process_name),
            'stat':
            ('1 ({}) S 0 1 1 ' + ' 0' * 46).format(unique_process_name),
            'cmdline': unique_process_name,
        },
        'stat': ("cpu  13034 0 18596 380856797 2013 2 2962 0 0 0\n"
                 "btime 1448632481\n"),
    })

    config = {
        'init_config': {
            'procfs_path': my_procfs
        },
        'instances': [{
            'name': 'moved_procfs',
            'search_string': [unique_process_name],
            'exact_match': False,
            'ignored_denied_access': True,
            'thresholds': {
                'warning': [1, 10],
                'critical': [1, 100]
            },
        }]
    }
    version = int(psutil.__version__.replace(".", ""))
    process = ProcessCheck(common.CHECK_NAME, config['init_config'], {},
                           config['instances'])

    try:

        def import_mock(name,
                        i_globals={},
                        i_locals={},
                        fromlist=[],
                        level=-1,
                        orig_import=__import__):
            # _psutil_linux and _psutil_posix are the
            #  C bindings; use a mock for those
            if name in ('_psutil_linux', '_psutil_posix') or level >= 1 and\
               ('_psutil_linux' in fromlist or '_psutil_posix' in fromlist):
                m = MagicMock()
                # the import system will ask us for our own name
                m._psutil_linux = m
                m._psutil_posix = m
                # there's a version safety check in psutil/__init__.py;
                # this skips it
                m.version = version
                return m
            return orig_import(name, i_globals, i_locals, fromlist, level)

        # contextlib.nested is deprecated in favor of `with MGR1, MGR2, ...`,
        # but we have too many mocks to fit on one line, and a backslash line
        # continuation is not flake8 compliant, even when semantically required
        # (as here). patch() is unlikely to raise errors that would then be
        # suppressed, so the main downside of contextlib.nested is avoided.
        with contextlib.nested(
                patch('sys.platform', 'linux'),
                patch('socket.AF_PACKET', create=True),
                patch('__builtin__.__import__', side_effect=import_mock)):
            if not already_linux:
                # Reloading psutil fails on linux, but we only
                # need to do so if we didn't start out on a linux platform
                reload(psutil)
            assert Platform.is_linux()
            process.check(config["instances"][0])
    finally:
        shutil.rmtree(my_procfs)
        if not already_linux:
            # restore the original psutil that doesn't have our mocks
            reload(psutil)
        else:
            psutil.PROCFS_PATH = '/proc'

    expected_tags = generate_expected_tags(config['instances'][0])
    expected_tags += ['process:moved_procfs']
    aggregator.assert_service_check('process.up', count=1, tags=expected_tags)
Example #7
def test_complex_config(aggregator, spin_up_mysql):
    mysql_check = MySql(common.CHECK_NAME, {}, {}, instances=[common_config.MYSQL_COMPLEX_CONFIG])
    mysql_check.check(common_config.MYSQL_COMPLEX_CONFIG)

    # Test service check
    aggregator.assert_service_check('mysql.can_connect', status=MySql.OK,
                                    tags=tags.SC_TAGS, count=1)

    aggregator.assert_service_check('mysql.replication.slave_running', status=MySql.OK,
                                    tags=tags.SC_TAGS, at_least=1)

    ver = tuple(
        int(x) for x in mysql_check.mysql_version[mysql_check._get_host_key()])

    testable_metrics = (variables.STATUS_VARS + variables.VARIABLES_VARS +
                        variables.INNODB_VARS + variables.BINLOG_VARS +
                        variables.SYSTEM_METRICS + variables.SCHEMA_VARS +
                        variables.SYNTHETIC_VARS)

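    # performance-schema metrics are only asserted on MySQL >= 5.6 and not on MariaDB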
    if ver >= (5, 6, 0) and environ.get('MYSQL_FLAVOR') != 'mariadb':
        testable_metrics.extend(variables.PERFORMANCE_VARS)

    # Test metrics
    for mname in testable_metrics:
        # user_time and kernel_time are currently only guaranteed on Linux,
        # and cpu_time is not collected on Windows.
        if mname == 'mysql.performance.user_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.kernel_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
            continue

        if mname == 'mysql.performance.query_run_time.avg':
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS+['schema:testdb'],
                                     count=1)
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS+['schema:mysql'],
                                     count=1)
        elif mname == 'mysql.info.schema.size':
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS+['schema:testdb'],
                                     count=1)
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS+['schema:information_schema'],
                                     count=1)
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS+['schema:performance_schema'],
                                     count=1)
        else:
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS,
                                     at_least=0)

    # TODO: test this if it is implemented
    # Assert service metadata
    # version_metadata = mysql_check.service_metadata['version']
    # assert len(version_metadata) == 1

    # test custom query metrics
    aggregator.assert_metric('alice.age', value=25)
    aggregator.assert_metric('bob.age', value=20)

    # test optional metrics
    optional_metrics = (variables.OPTIONAL_REPLICATION_METRICS +
                        variables.OPTIONAL_INNODB_VARS +
                        variables.OPTIONAL_STATUS_VARS +
                        variables.OPTIONAL_STATUS_VARS_5_6_6)
    _test_optional_metrics(aggregator, optional_metrics, 1)

    # Raises when coverage < 100%
    aggregator.assert_all_metrics_covered()
Example #8
def test_complex_config_replica(aggregator, spin_up_mysql):
    mysql_check = MySql(common.CHECK_NAME, {}, {})
    config = copy.deepcopy(common_config.MYSQL_COMPLEX_CONFIG)
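    # reuse the complex config, but point the check at the replica's port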
    config['port'] = common.SLAVE_PORT
    mysql_check.check(config)

    # self.assertMetricTag('mysql.replication.seconds_behind_master', 'channel:default')

    # Test service check
    aggregator.assert_service_check('mysql.can_connect', status=MySql.OK,
                                    tags=tags.SC_TAGS_REPLICA, count=1)

    # Travis MySQL is not running replication; fix this in the flavored test.
    aggregator.assert_service_check('mysql.replication.slave_running', status=MySql.OK,
                                    tags=tags.SC_TAGS_REPLICA, at_least=1)

    ver = tuple(
        int(x) for x in mysql_check.mysql_version[mysql_check._get_host_key()])

    testable_metrics = (variables.STATUS_VARS + variables.VARIABLES_VARS +
                        variables.INNODB_VARS + variables.BINLOG_VARS +
                        variables.SYSTEM_METRICS + variables.SCHEMA_VARS +
                        variables.SYNTHETIC_VARS)

    # Test metrics
    for mname in testable_metrics:
        # user_time and kernel_time are currently only guaranteed on Linux,
        # and cpu_time is not collected on Windows.
        if mname == 'mysql.performance.user_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.kernel_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
            continue

        if mname == 'mysql.performance.query_run_time.avg':
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS+['schema:testdb'], count=1)
        elif mname == 'mysql.info.schema.size':
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS+['schema:testdb'], count=1)
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS+['schema:information_schema'], count=1)
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS+['schema:performance_schema'], count=1)
        else:
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS, at_least=0)

    # test custom query metrics
    aggregator.assert_metric('alice.age', value=25)
    aggregator.assert_metric('bob.age', value=20)

    # test optional metrics
    optional_metrics = (variables.OPTIONAL_REPLICATION_METRICS +
                        variables.OPTIONAL_INNODB_VARS +
                        variables.OPTIONAL_STATUS_VARS +
                        variables.OPTIONAL_STATUS_VARS_5_6_6)
    _test_optional_metrics(aggregator, optional_metrics, 1)

    # Raises when coverage < 100%
    aggregator.assert_all_metrics_covered()
Example #9
    aggregator.assert_all_metrics_covered()


def test_check_ssl(aggregator, check, openldap_server, instance_ssl):
    tags = ["url:{}".format(instance_ssl["url"]), "test:integration"]
    # Should fail certificate verification
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance_ssl)
    # check() raises, so the service check is asserted after leaving the block
    aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags)
    instance_ssl["ssl_verify"] = False
    # Should work now
    check.check(instance_ssl)
    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)


def test_check_connection_failure(aggregator, check, openldap_server, instance):
    instance["url"] = "bad_url"
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    # Should fail to connect to the bad URL
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance)
    # check() raises, so the service check is asserted after leaving the block
    aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags)


@pytest.mark.skipif(not Platform.is_linux(), reason='Windows sockets are not file handles')
def test_check_socket(aggregator, check, openldap_server, instance):
    instance["url"] = "ldapi://{}".format(openldap_server)
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    check.check(instance)
    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)
Example #10
    # Should work now
    check.check(instance_ssl)
    aggregator.assert_service_check("openldap.can_connect",
                                    check.OK,
                                    tags=tags)


@pytest.mark.usefixtures('dd_environment')
def test_check_connection_failure(aggregator, check, instance):
    instance["url"] = "bad_url"
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    # Should fail to connect to the bad URL
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance)
    # check() raises, so the service check is asserted after leaving the block
    aggregator.assert_service_check("openldap.can_connect",
                                    check.CRITICAL,
                                    tags=tags)


@pytest.mark.skipif(not Platform.is_linux(),
                    reason='Windows sockets are not file handles')
@pytest.mark.usefixtures('dd_environment')
def test_check_socket(aggregator, check, instance):
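    # HOST_SOCKET_DIR is expected to be where the test environment exposes
    # the server's ldapi socket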
    host_socket_path = os.path.join(os.environ['HOST_SOCKET_DIR'], 'ldapi')
    instance["url"] = "ldapi://{}".format(host_socket_path)
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    check.check(instance)
    aggregator.assert_service_check("openldap.can_connect",
                                    check.OK,
                                    tags=tags)