Example #1
    def _discover_queues(self, queue_manager, mq_pattern_filter):
        queues = []

        for queue_type in SUPPORTED_QUEUE_TYPES:
            args = {pymqi.CMQC.MQCA_Q_NAME: ensure_bytes(mq_pattern_filter), pymqi.CMQC.MQIA_Q_TYPE: queue_type}
            try:
                pcf = pymqi.PCFExecute(queue_manager)
                response = pcf.MQCMD_INQUIRE_Q(args)
            except pymqi.MQMIError as e:
                self.warning("Error discovering queue: {}".format(e))
            else:
                for queue_info in response:
                    queue = queue_info[pymqi.CMQC.MQCA_Q_NAME]
                    queues.append(ensure_unicode(queue).strip())

        return queues
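
These examples rely on the `ensure_bytes`/`ensure_unicode` pair to convert between `bytes` and text at API boundaries (pymqi expects bytes, while the Agent works with text). A minimal Python 3 sketch of what such helpers typically look like (the actual `datadog_checks.base` implementation may differ in detail):

def ensure_bytes(s):
    # Encode text to UTF-8 bytes; anything else is passed through unchanged.
    if isinstance(s, str):
        s = s.encode('utf-8')
    return s

def ensure_unicode(s):
    # Decode UTF-8 bytes to text; anything else is passed through unchanged.
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    return s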
Example #2
    def submit_metrics(self, child, prefix, tags):
        value = child.get(metrics.METRIC_VALUE_FIELDS[child.tag])
        metric_name = self.normalize(ensure_unicode(child.get('name')),
                                     prefix='{}.{}'.format(
                                         self.METRIC_PREFIX, prefix),
                                     fix_case=True)

        tag = child.tag
        if (child.get('unit') in self.custom_queries_units_gauge
                and prefix in self.custom_stats and tag == 'CountStatistic'):
            tag = 'TimeStatistic'
        self.metric_type_mapping[tag](metric_name, value, tags=tags)

        # Also submit JVM metrics as gauges under a dedicated metric name
        if prefix == "jvm":
            jvm_metric_name = "{}_gauge".format(metric_name)
            self.gauge(jvm_metric_name, value, tags=tags)
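
`metric_type_mapping` is not shown in this example. Judging by how it is indexed with the XML tag, it is presumably a dict mapping PMI statistic tags to the check's submission methods; a hypothetical sketch (only CountStatistic and TimeStatistic appear in the source, the other entries and the method choices are assumptions):

# As it might be defined in the check's __init__ (names assumed, not from the source):
metric_type_mapping = {
    'CountStatistic': self.monotonic_count,  # cumulative counters
    'TimeStatistic': self.gauge,             # durations, submitted as gauges
    'RangeStatistic': self.gauge,
    'AverageStatistic': self.gauge,
}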
Example #3
    def _get_data(self, instance):
        host = instance.get('host')
        port = int(instance.get('port', 2222))  # 2222 is default
        tags = instance.get('tags', [])
        if tags is None:
            tags = []

        service_check_tags = ['host:{}'.format(host), 'port:{}'.format(port)] + tags
        service_check_tags = list(set(service_check_tags))

        try:
            addrs = socket.getaddrinfo(host, port, 0, 0, socket.IPPROTO_TCP)
        except socket.gaierror as e:
            self.log.warning("unable to retrieve address info for %s:%s - %s",
                             host, port, e)
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags)
            return None

        response = ""
        for addr in addrs:
            try:
                if addr[1] == socket.SOCK_STREAM:
                    client = socket.socket(*addr[0:3])
                    client.connect(addr[-1])

                    self.log.debug(u"Querying: {0}:{1}".format(host, port))
                    while True:
                        data = ensure_unicode(client.recv(1024))
                        if not data:
                            break
                        response = ''.join([response, data])

                    # Close and stop only after a successful stream connection;
                    # otherwise `client` may not exist yet.
                    client.close()
                    break
            except socket.error as e:
                self.log.warning("unable to connect to %s - %s", addr[-1], e)

        status = AgentCheck.OK if response else AgentCheck.CRITICAL
        self.service_check(self.SERVICE_CHECK_NAME,
                           status,
                           tags=service_check_tags)

        return response
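
`socket.getaddrinfo` returns a list of `(family, type, proto, canonname, sockaddr)` tuples, which is what makes the `socket.socket(*addr[0:3])` construction and the `addr[-1]` connect target above work. A standalone illustration:

import socket

# Each entry is (family, type, proto, canonname, sockaddr); passing
# IPPROTO_TCP restricts the results to TCP (SOCK_STREAM) entries.
for family, socktype, proto, _canonname, sockaddr in socket.getaddrinfo(
        'localhost', 2222, 0, 0, socket.IPPROTO_TCP):
    print(family, socktype, proto, sockaddr)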
Example #4
def test_version():
    """
    If the docker image is in a different repository, we check that the
    version requested in the VARNISH_VERSION env var is the one running inside the container.
    """
    varnishstat = common.get_varnish_stat_path()

    # Version info is printed to stderr
    output = subprocess.check_output(shlex.split(varnishstat) + ["-V"],
                                     stderr=subprocess.STDOUT)
    res = re.search(r"varnish-(\d+\.\d\.\d)", ensure_unicode(output))
    if res is None:
        raise Exception("Could not retrieve varnish version from docker")

    version = res.groups()[0]
    assert version == os.environ.get('VARNISH_VERSION',
                                     common.VARNISH_DEFAULT_VERSION)
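
One quirk worth noting: the capture group `(\d+\.\d\.\d)` allows only single-digit minor and patch components, so a hypothetical `varnish-6.0.10` would be captured as `6.0.1`:

import re

assert re.search(r"varnish-(\d+\.\d\.\d)", "varnish-6.0.1").group(1) == "6.0.1"
# A double-digit patch version is silently truncated by the pattern:
assert re.search(r"varnish-(\d+\.\d\.\d)", "varnish-6.0.10").group(1) == "6.0.1"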
Example #5
def poll_mock(mock_http_response):
    registry = CollectorRegistry()
    g1 = Gauge('metric1',
               'processor usage', ['matched_label', 'node', 'flavor'],
               registry=registry)
    g1.labels(matched_label="foobar", node="host1", flavor="test").set(99.9)
    g2 = Gauge('metric2',
               'memory usage', ['matched_label', 'node', 'timestamp'],
               registry=registry)
    g2.labels(matched_label="foobar", node="host2", timestamp="123").set(12.2)
    c1 = Counter('counter1', 'hits', ['node'], registry=registry)
    c1.labels(node="host2").inc(42)
    g3 = Gauge('metric3',
               'memory usage', ['matched_label', 'node', 'timestamp'],
               registry=registry)
    g3.labels(matched_label="foobar", node="host2",
              timestamp="456").set(float('inf'))

    mock_http_response(ensure_unicode(generate_latest(registry)),
                       normalize_content=False)
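
`generate_latest` serializes the registry in the Prometheus text exposition format and returns `bytes`, which is why the result is passed through `ensure_unicode` before being handed to the mock. A quick standalone demonstration:

from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
Gauge('up', 'instance is up', registry=registry).set(1)
payload = generate_latest(registry)  # bytes in the text exposition format
print(payload.decode('utf-8'))       # '# HELP up instance is up\n# TYPE up gauge\nup 1.0\n'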
Example #6
    def _process_mor_objects_queue(self, instance):
        """
        Pops `batch_morlist_size` items from the mor objects queue and run asynchronously
        the _process_mor_objects_queue_async method to fill the Mor cache.
        """
        i_key = self._instance_key(instance)
        self.mor_cache.init_instance(i_key)

        if not self.mor_objects_queue.contains(i_key):
            self.log.debug("Objects queue is not initialized yet for instance {}, skipping processing".format(i_key))
            return

        for resource_type in RESOURCE_TYPE_METRICS:
            # Batch size can prevent querying large payloads at once if the environment is too large
            # If batch size is set to 0, process everything at once
            batch_size = self.batch_morlist_size or self.mor_objects_queue.size(i_key, resource_type)
            while self.mor_objects_queue.size(i_key, resource_type):
                mors = []
                for _ in range(batch_size):
                    mor = self.mor_objects_queue.pop(i_key, resource_type)
                    if mor is None:
                        self.log.debug(
                            "No more objects of type '{}' left in the queue".format(ensure_unicode(resource_type))
                        )
                        break

                    mor_name = str(mor['mor'])
                    mor['interval'] = REAL_TIME_INTERVAL if mor['mor_type'] in REALTIME_RESOURCES else None
                    # Always update the cache to account for Mors that might have changed parent
                    # in the meantime (e.g. a migrated VM).
                    self.mor_cache.set_mor(i_key, mor_name, mor)

                    # Only do this for non real-time resources i.e. datacenter, datastore and cluster
                    # For hosts and VMs, we can rely on a precomputed list of metrics
                    realtime_only = is_affirmative(instance.get("collect_realtime_only", True))
                    if mor["mor_type"] not in REALTIME_RESOURCES and not realtime_only:
                        mors.append(mor)

                # We will actually schedule jobs for non realtime resources only.
                if mors:
                    self.pool.apply_async(self._process_mor_objects_queue_async, args=(instance, mors))
Example #7
    def test_check(self, aggregator):
        instance = self.INSTANCES['main']
        c = NfsStatCheck(self.CHECK_NAME, self.INIT_CONFIG, {}, [instance])

        with open(os.path.join(FIXTURE_DIR, 'nfsiostat'), 'rb') as f:
            mock_output = ensure_unicode(f.read())

        with mock.patch('datadog_checks.nfsstat.nfsstat.get_subprocess_output',
                        return_value=(mock_output, '', 0)):
            c.check(instance)

        tags = list(instance['tags'])
        tags.extend([
            'nfs_server:192.168.34.1', 'nfs_export:/exports/nfs/datadog/two',
            'nfs_mount:/mnt/datadog/two'
        ])

        for metric in metrics:
            aggregator.assert_metric(metric, tags=tags)

        assert aggregator.metrics_asserted_pct == 100.0
Example #8
def poll_mock():
    registry = CollectorRegistry()
    g1 = Gauge('metric1', 'processor usage', ['matched_label', 'node', 'flavor'], registry=registry)
    g1.labels(matched_label="foobar", node="host1", flavor="test").set(99.9)
    g2 = Gauge('metric2', 'memory usage', ['matched_label', 'node', 'timestamp'], registry=registry)
    g2.labels(matched_label="foobar", node="host2", timestamp="123").set(12.2)
    c1 = Counter('counter1', 'hits', ['node'], registry=registry)
    c1.labels(node="host2").inc(42)
    g3 = Gauge('metric3', 'memory usage', ['matched_label', 'node', 'timestamp'], registry=registry)
    g3.labels(matched_label="foobar", node="host2", timestamp="456").set(float('inf'))

    poll_mock_patch = mock.patch(
        'requests.get',
        return_value=mock.MagicMock(
            status_code=200,
            iter_lines=lambda **kwargs: ensure_unicode(generate_latest(registry)).split("\n"),
            headers={'Content-Type': "text/plain"},
        ),
    )
    with poll_mock_patch:
        yield
Example #9
    def _end_element(self, name, tags):
        if name == "stat":
            m_name = ensure_unicode(self.normalize(self._current_metric))
            if self._current_type in ("a", "c"):
                self.rate(m_name, long(self._current_value), tags=tags)
            elif self._current_type in ("i", "g"):
                self.gauge(m_name, long(self._current_value), tags=tags)
                if 'n_purges' in m_name:
                    self.rate('varnish.n_purgesps',
                              long(self._current_value),
                              tags=tags)
            else:
                # Unsupported data type, ignore
                self._reset()
                return  # don't save

            # reset for next stat element
            self._reset()
        elif name in ("ident", "name") or (name == "type"
                                           and self._current_str != "MAIN"):
            self._current_metric += "." + self._current_str
Example #10
    def _get_parent_tags(self, mor, all_objects):
        properties = all_objects.get(mor, {})
        parent = properties.get('parent')
        if parent:
            tags = []
            parent_name = ensure_unicode(all_objects.get(parent, {}).get('name', 'unknown'))
            if isinstance(parent, vim.HostSystem):
                tags.append('vsphere_host:{}'.format(parent_name))
            elif isinstance(parent, vim.Folder):
                tags.append('vsphere_folder:{}'.format(parent_name))
            elif isinstance(parent, vim.ComputeResource):
                if isinstance(parent, vim.ClusterComputeResource):
                    tags.append('vsphere_cluster:{}'.format(parent_name))
                tags.append('vsphere_compute:{}'.format(parent_name))
            elif isinstance(parent, vim.Datacenter):
                tags.append('vsphere_datacenter:{}'.format(parent_name))

            parent_tags = self._get_parent_tags(parent, all_objects)
            parent_tags.extend(tags)
            return parent_tags
        return []
Example #11
    def _submit_channel_status(self,
                               queue_manager,
                               search_channel_name,
                               tags,
                               config,
                               channels_to_skip=None):
        """Submit channel status
        :param search_channel_name might contain wildcard characters
        """
        channels_to_skip = channels_to_skip or []
        search_channel_tags = tags + ["channel:{}".format(search_channel_name)]
        try:
            args = {
                pymqi.CMQCFC.MQCACH_CHANNEL_NAME:
                ensure_bytes(search_channel_name)
            }
            pcf = pymqi.PCFExecute(queue_manager)
            response = pcf.MQCMD_INQUIRE_CHANNEL_STATUS(args)
            self.service_check(self.CHANNEL_SERVICE_CHECK, AgentCheck.OK,
                               search_channel_tags)
        except pymqi.MQMIError as e:
            self.log.warning("Error getting CHANNEL stats {}".format(e))
            self.service_check(self.CHANNEL_SERVICE_CHECK, AgentCheck.CRITICAL,
                               search_channel_tags)
        else:
            for channel_info in response:
                channel_name = ensure_unicode(
                    channel_info[pymqi.CMQCFC.MQCACH_CHANNEL_NAME]).strip()
                if channel_name in channels_to_skip:
                    continue
                channel_tags = tags + ["channel:{}".format(channel_name)]

                channel_status = channel_info[
                    pymqi.CMQCFC.MQIACH_CHANNEL_STATUS]

                self._submit_channel_count(channel_name, channel_status,
                                           channel_tags)
                self._submit_status_check(channel_name, channel_status,
                                          channel_tags, config)
Example #12
    def convert_and_filter_stats(self, stats):
        """ Converts raw query stats to native python types

        Drops string results as well.

        :param stats: raw query stats
        :type stats: dict(str, str)
        :return: converted results and tags
        :rtype: tuple[dict(str, float), list[str]]
        """
        results = {}
        tags = set(self.additional_tags)
        for k, v in stats.items():
            if k in self.excluded:
                continue
            found_re = False
            for r in self.excluded_re:
                if r.match(k):
                    found_re = True
                    break
            if found_re:
                continue

            try:  # try a number conversion
                value = float(v.strip())
                results[k] = value
            except Exception:  # this is a string value instead
                if k == 'ups.status':
                    if v.lower().startswith(('ol', 'on')):
                        results[k] = 1.0
                    else:
                        results[k] = 0.0
                if k in self.string_tags:
                    tags.add('{}:{}'.format(
                        k,
                        ensure_unicode(
                            self.convert_to_underscore_separated(v))))
        return results, tags
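
For reference, a hypothetical input/output pair (the values are invented for illustration): numeric strings become floats, and an `ups.status` starting with 'ol' (on line) or 'on' maps to 1.0:

# stats = {'ups.status': 'OL CHRG', 'battery.charge': '100', 'ups.model': 'Back-UPS ES 850'}
# results, tags = self.convert_and_filter_stats(stats)
# results -> {'ups.status': 1.0, 'battery.charge': 100.0}
# tags    -> the additional_tags plus, if 'ups.model' is listed in string_tags,
#            an underscore-separated tag along the lines of 'ups.model:back_ups_es_850'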
Example #13
    def report_as_service_check(self, sc_name, status, instance, msg=None):
        instance_name = ensure_unicode(self.normalize(instance['name']))
        host = instance.get('host', None)
        port = instance.get('port', None)
        custom_tags = instance.get('tags', [])

        if status == Status.UP:
            msg = None

        tags = custom_tags + [
            'target_host:{}'.format(host), 'port:{}'.format(port),
            'instance:{}'.format(instance_name)
        ]

        self.service_check(self.SERVICE_CHECK_NAME,
                           NetworkCheck.STATUS_TO_SERVICE_CHECK[status],
                           tags=tags,
                           message=msg)
        # Report as a metric as well
        self.gauge("network.tcp.can_connect",
                   1 if status == Status.UP else 0,
                   tags=tags)
Example #14
    def _get_parent_tags(self, mor, all_objects):
        tags = []
        properties = all_objects.get(mor, {})
        parent = properties.get("parent")
        if parent:
            parent_name = ensure_unicode(all_objects.get(parent, {}).get("name", "unknown"))
            tag = []
            if isinstance(parent, vim.HostSystem):
                tag.append('vsphere_host:{}'.format(parent_name))
            elif isinstance(parent, vim.Folder):
                tag.append('vsphere_folder:{}'.format(parent_name))
            elif isinstance(parent, vim.ComputeResource):
                if isinstance(parent, vim.ClusterComputeResource):
                    tag.append('vsphere_cluster:{}'.format(parent_name))
                tag.append('vsphere_compute:{}'.format(parent_name))
            elif isinstance(parent, vim.Datacenter):
                tag.append('vsphere_datacenter:{}'.format(parent_name))

            tags = self._get_parent_tags(parent, all_objects)
            if tag:
                tags.extend(tag)

        return tags
Example #15
    def _get_server_instance(self, instance):
        i_key = self._instance_key(instance)
        tags = instance.get('tags', [])

        service_check_tags = [
            'vcenter_server:{}'.format(self._instance_key(instance)),
            'vcenter_host:{}'.format(ensure_unicode(instance.get('host'))),
        ] + tags
        service_check_tags = list(set(service_check_tags))

        with self.server_instances_lock:
            if i_key not in self.server_instances:
                self.server_instances[i_key] = self._smart_connect(instance, service_check_tags)

            # Test if the connection is working
            try:
                self.server_instances[i_key].CurrentTime()
                self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)
            except Exception:
                # Try to reconnect. If the connection is definitely broken,
                # this will send a CRITICAL service check and raise
                self.server_instances[i_key] = self._smart_connect(instance, service_check_tags)

            return self.server_instances[i_key]
Example #16
    def _get_all_objs(self,
                      server_instance,
                      regexes,
                      include_only_marked,
                      tags,
                      use_guest_hostname=False):
        """
        Explore vCenter infrastructure to discover hosts, virtual machines, etc.
        and compute their associated tags.
        Start at the vCenter `rootFolder`, so as to collect every object.

        Example topology:
            ```
            rootFolder
                - datacenter1
                    - compute_resource1 == cluster
                        - host1
                        - host2
                        - host3
                    - compute_resource2
                        - host5
                            - vm1
                            - vm2
            ```

        If it's a node we want to query metrics for, it will be enqueued at the
        instance level and will be processed by a subsequent job.
        """
        start = time.time()
        obj_list = defaultdict(list)

        # Collect objects and their attributes
        all_objects = self._collect_mors_and_attributes(server_instance)

        # Add rootFolder since it is not explored by the propertyCollector
        rootFolder = server_instance.content.rootFolder
        all_objects[rootFolder] = {"name": rootFolder.name, "parent": None}

        for obj, properties in all_objects.items():
            instance_tags = []
            if not self._is_excluded(obj, properties, regexes,
                                     include_only_marked) and isinstance(
                                         obj, RESOURCE_TYPE_METRICS):
                if use_guest_hostname:
                    hostname = properties.get(
                        "guest.hostName", properties.get("name", "unknown"))
                else:
                    hostname = properties.get("name", "unknown")
                if properties.get("parent"):
                    instance_tags.extend(
                        self._get_parent_tags(obj, all_objects))

                if isinstance(obj, vim.VirtualMachine):
                    vsphere_type = 'vsphere_type:vm'
                    vimtype = vim.VirtualMachine
                    mor_type = "vm"
                    power_state = properties.get("runtime.powerState")
                    if power_state != vim.VirtualMachinePowerState.poweredOn:
                        self.log.debug("Skipping VM in state %s",
                                       ensure_unicode(power_state))
                        continue
                    host_mor = properties.get("runtime.host")
                    host_props = all_objects.get(host_mor, {})
                    host = "unknown"
                    if host_mor and host_props:
                        host = ensure_unicode(host_props.get(
                            "name", "unknown"))
                        if self._is_excluded(host_mor, host_props, regexes,
                                             include_only_marked):
                            self.log.debug(
                                "Skipping VM because host %s is excluded by rule %s.",
                                host, regexes.get('host_include'))
                            continue
                    instance_tags.append('vsphere_host:{}'.format(host))
                elif isinstance(obj, vim.HostSystem):
                    vsphere_type = 'vsphere_type:host'
                    vimtype = vim.HostSystem
                    mor_type = "host"
                elif isinstance(obj, vim.Datastore):
                    vsphere_type = 'vsphere_type:datastore'
                    instance_tags.append('vsphere_datastore:{}'.format(
                        ensure_unicode(properties.get("name", "unknown"))))
                    hostname = None
                    vimtype = vim.Datastore
                    mor_type = "datastore"
                elif isinstance(obj, vim.Datacenter):
                    vsphere_type = 'vsphere_type:datacenter'
                    instance_tags.append("vsphere_datacenter:{}".format(
                        ensure_unicode(properties.get("name", "unknown"))))
                    hostname = None
                    vimtype = vim.Datacenter
                    mor_type = "datacenter"
                elif isinstance(obj, vim.ClusterComputeResource):
                    vsphere_type = 'vsphere_type:cluster'
                    instance_tags.append("vsphere_cluster:{}".format(
                        ensure_unicode(properties.get("name", "unknown"))))
                    hostname = None
                    vimtype = vim.ClusterComputeResource
                    mor_type = "cluster"
                else:
                    vsphere_type = None

                if vsphere_type:
                    instance_tags.append(vsphere_type)

                obj_list[vimtype].append({
                    "mor_type": mor_type,
                    "mor": obj,
                    "hostname": hostname,
                    "tags": tags + instance_tags
                })

        self.log.debug("All objects with attributes cached in %s seconds.",
                       time.time() - start)
        return obj_list
Example #17
    def _collect_mors_and_attributes(self, server_instance):
        resources = list(RESOURCE_TYPE_METRICS)
        resources.extend(RESOURCE_TYPE_NO_METRIC)

        content = server_instance.content
        view_ref = content.viewManager.CreateContainerView(
            content.rootFolder, resources, True)

        # Object used to query MORs as well as the attributes we require in one API call
        # See https://code.vmware.com/apis/358/vsphere#/doc/vmodl.query.PropertyCollector.html
        collector = content.propertyCollector

        # Specify the root object from where we collect the rest of the objects
        obj_spec = vmodl.query.PropertyCollector.ObjectSpec()
        obj_spec.obj = view_ref
        obj_spec.skip = True

        # Specify the attribute of the root object to traverse to obtain all the attributes
        traversal_spec = vmodl.query.PropertyCollector.TraversalSpec()
        traversal_spec.path = "view"
        traversal_spec.skip = False
        traversal_spec.type = view_ref.__class__
        obj_spec.selectSet = [traversal_spec]

        property_specs = []
        # Specify which attributes we want to retrieve per object
        for resource in resources:
            property_spec = vmodl.query.PropertyCollector.PropertySpec()
            property_spec.type = resource
            property_spec.pathSet = ["name", "parent", "customValue"]
            if resource == vim.VirtualMachine:
                property_spec.pathSet.append("runtime.powerState")
                property_spec.pathSet.append("runtime.host")
                property_spec.pathSet.append("guest.hostName")
            property_specs.append(property_spec)

        # Create our filter spec from the above specs
        filter_spec = vmodl.query.PropertyCollector.FilterSpec()
        filter_spec.objectSet = [obj_spec]
        filter_spec.propSet = property_specs

        retr_opts = vmodl.query.PropertyCollector.RetrieveOptions()
        # To limit the number of objects retrieved per call.
        # If batch_collector_size is 0, collect maximum number of objects.
        retr_opts.maxObjects = self.batch_collector_size or None

        # Collect the objects and their properties
        res = collector.RetrievePropertiesEx([filter_spec], retr_opts)
        objects = res.objects
        # Results can be paginated
        while res.token is not None:
            res = collector.ContinueRetrievePropertiesEx(res.token)
            objects.extend(res.objects)

        mor_attrs = {}
        error_counter = 0
        for obj in objects:
            if obj.missingSet and error_counter < 10:
                for prop in obj.missingSet:
                    error_counter += 1
                    self.log.error(
                        "Unable to retrieve property %s for object %s: %s",
                        ensure_unicode(prop.path),
                        ensure_unicode(obj.obj),
                        ensure_unicode(prop.fault),
                    )
                    if error_counter == 10:
                        self.log.error(
                            "Too many errors during object collection, stop logging"
                        )
                        break
            mor_attrs[obj.obj] = (
                {prop.name: prop.val for prop in obj.propSet} if obj.propSet else {}
            )

        return mor_attrs
Example #18
    def _instance_key(instance):
        i_key = ensure_unicode(instance.get('name'))
        if i_key is None:
            raise BadConfigError(
                "Must define a unique 'name' per vCenter instance")
        return i_key
Example #19
    def submit_metrics(self, child, prefix, tags):
        value = child.get(metrics.METRIC_VALUE_FIELDS[child.tag])
        metric_name = self.normalize(
            ensure_unicode(child.get('name')), prefix='{}.{}'.format(self.METRIC_PREFIX, prefix), fix_case=True
        )
        self.metric_type_mapping[child.tag](metric_name, value, tags=tags)
Example #20
    def _check_key_lengths(self, conn, instance, tags):
        """
        Compute the length of the configured keys across all the databases
        """
        key_list = instance.get('keys')

        if key_list is None:
            return

        if not isinstance(key_list, list) or len(key_list) == 0:
            self.warning("keys in redis configuration is either not a list or empty")
            return

        # get all the available databases
        databases = list(conn.info('keyspace'))
        if not databases:
            self.warning("Redis database is empty")
            return

        # convert to integer the output of `keyspace`, from `db0` to `0`
        # and store items in a set
        databases = [int(dbstring[2:]) for dbstring in databases]

        # user might have configured the instance to target one specific db
        if 'db' in instance:
            db = instance['db']
            if db not in databases:
                self.warning("Cannot find database {}".format(instance['db']))
                return
            databases = [db]

        # maps a key to the total length across databases
        lengths_overall = defaultdict(int)

        # don't overwrite the configured instance, use a copy
        tmp_instance = deepcopy(instance)

        for db in databases:
            lengths = defaultdict(lambda: defaultdict(int))
            tmp_instance['db'] = db
            db_conn = self._get_conn(tmp_instance)

            for key_pattern in key_list:
                if re.search(r"(?<!\\)[*?[]", key_pattern):
                    keys = db_conn.scan_iter(match=key_pattern)
                else:
                    keys = [key_pattern]

                for key in keys:
                    text_key = ensure_unicode(key)
                    try:
                        key_type = ensure_unicode(db_conn.type(key))
                    except redis.ResponseError:
                        self.log.info("key {} on remote server; skipping".format(text_key))
                        continue

                    if key_type == 'list':
                        keylen = db_conn.llen(key)
                        lengths[text_key]["length"] += keylen
                        lengths_overall[text_key] += keylen
                    elif key_type == 'set':
                        keylen = db_conn.scard(key)
                        lengths[text_key]["length"] += keylen
                        lengths_overall[text_key] += keylen
                    elif key_type == 'zset':
                        keylen = db_conn.zcard(key)
                        lengths[text_key]["length"] += keylen
                        lengths_overall[text_key] += keylen
                    elif key_type == 'hash':
                        keylen = db_conn.hlen(key)
                        lengths[text_key]["length"] += keylen
                        lengths_overall[text_key] += keylen
                    elif key_type == 'string':
                        # Send 1 if the key exists as a string
                        lengths[text_key]["length"] += 1
                        lengths_overall[text_key] += 1
                    else:
                        # If the type is unknown, it might be because the key doesn't exist,
                        # which can be because the list is empty. So always send 0 in that case.
                        lengths[text_key]["length"] += 0
                        lengths_overall[text_key] += 0

                    # Tagging with key_type since the same key can exist with a
                    # different key_type in another db
                    lengths[text_key]["key_type"] = key_type

            # Send the metrics for each db in the redis instance.
            for key, total in iteritems(lengths):
                # Only send non-zeros if tagged per db.
                if total["length"] > 0:
                    self.gauge(
                        'redis.key.length',
                        total["length"],
                        tags=tags
                        + ['key:{}'.format(key), 'key_type:{}'.format(total["key_type"]), 'redis_db:db{}'.format(db)],
                    )

        # Warn if a key is missing from the entire redis instance.
        # Send 0 if the key is missing/empty from the entire redis instance.
        for key, total in iteritems(lengths_overall):
            if total == 0 and instance.get("warn_on_missing_keys", True):
                self.gauge('redis.key.length', total, tags=tags + ['key:{}'.format(key)])
                self.warning("{0} key not found in redis".format(key))
Example #21
    def _check_slowlog(self, instance, custom_tags):
        """Retrieve length and entries from Redis' SLOWLOG

        This will parse through all entries of the SLOWLOG and select ones
        within the time range between the last seen entries and now

        """
        conn = self._get_conn(instance)

        tags = self._get_tags(custom_tags, instance)

        if not instance.get(MAX_SLOW_ENTRIES_KEY):
            try:
                max_slow_entries = int(
                    conn.config_get(MAX_SLOW_ENTRIES_KEY)
                    [MAX_SLOW_ENTRIES_KEY])
                if max_slow_entries > DEFAULT_MAX_SLOW_ENTRIES:
                    self.warning(
                        "Redis {0} is higher than {1}. Defaulting to {1}. "  # noqa: G001
                        "If you need a higher value, please set {0} in your check config"
                        .format(MAX_SLOW_ENTRIES_KEY,
                                DEFAULT_MAX_SLOW_ENTRIES))
                    max_slow_entries = DEFAULT_MAX_SLOW_ENTRIES
            # No config on AWS Elasticache
            except redis.ResponseError:
                max_slow_entries = DEFAULT_MAX_SLOW_ENTRIES
        else:
            max_slow_entries = int(instance.get(MAX_SLOW_ENTRIES_KEY))

        # Generate a unique id for this instance to be persisted across runs
        ts_key = self._generate_instance_key(instance)

        # Get all slowlog entries

        slowlogs = conn.slowlog_get(max_slow_entries)

        # Find slowlog entries between last timestamp and now using start_time
        slowlogs = [
            s for s in slowlogs
            if s['start_time'] > self.last_timestamp_seen[ts_key]
        ]

        max_ts = 0
        # Slowlog entry looks like:
        #  {'command': 'LPOP somekey',
        #   'duration': 11238,
        #   'id': 496L,
        #   'start_time': 1422529869}
        for slowlog in slowlogs:
            if slowlog['start_time'] > max_ts:
                max_ts = slowlog['start_time']

            slowlog_tags = list(tags)
            command = slowlog['command'].split()
            # When the "Garantia Data" custom Redis is used, redis-py returns
            # an empty `command` field
            # FIXME when https://github.com/andymccurdy/redis-py/pull/622 is released in redis-py
            if command:
                slowlog_tags.append('command:{}'.format(
                    ensure_unicode(command[0])))

            value = slowlog['duration']
            self.histogram('redis.slowlog.micros', value, tags=slowlog_tags)

        # Only advance the watermark when new entries were seen; otherwise a run
        # with no new slowlog entries would reset it to 0 and re-report old ones.
        if max_ts > self.last_timestamp_seen[ts_key]:
            self.last_timestamp_seen[ts_key] = max_ts
Example #22
    def _check(self, instance):
        (
            addr,
            ntlm_domain,
            username,
            password,
            client_cert,
            client_key,
            method,
            data,
            http_response_status_code,
            timeout,
            include_content,
            headers,
            response_time,
            content_match,
            reverse_content_match,
            tags,
            disable_ssl_validation,
            ssl_expire,
            instance_ca_certs,
            weakcipher,
            check_hostname,
            ignore_ssl_warning,
            skip_proxy,
            allow_redirects,
            stream,
        ) = from_instance(instance, self.ca_certs)

        start = time.time()

        def send_status_up(logMsg):
            # TODO: A6 log needs bytes and cannot handle unicode
            self.log.debug(logMsg)
            service_checks.append((self.SC_STATUS, Status.UP, "UP"))

        def send_status_down(loginfo, down_msg):
            # TODO: A6 log needs bytes and cannot handle unicode
            self.log.info(loginfo)
            if include_content:
                down_msg += '\nContent: {}'.format(content[:CONTENT_LENGTH])
            service_checks.append((self.SC_STATUS, Status.DOWN, down_msg))

        # Store tags in a temporary list so that we don't modify the global tags data structure
        tags_list = list(tags)
        tags_list.append('url:{}'.format(addr))
        instance_name = self.normalize(instance['name'])
        tags_list.append("instance:{}".format(instance_name))
        service_checks = []
        r = None
        try:
            parsed_uri = urlparse(addr)
            self.log.debug("Connecting to {}".format(addr))

            suppress_warning = False
            if disable_ssl_validation and parsed_uri.scheme == "https":
                explicit_validation = 'disable_ssl_validation' in instance
                if ignore_ssl_warning:
                    if explicit_validation:
                        suppress_warning = True
                else:
                    # Log if we're skipping SSL validation for HTTPS URLs
                    if explicit_validation:
                        self.log.debug(
                            "Skipping SSL certificate validation for {} based on configuration"
                            .format(addr))

                    # Emit a warning if disable_ssl_validation is not explicitly set and we're not ignoring warnings
                    else:
                        self.warning(
                            "Parameter disable_ssl_validation for {} is not explicitly set, "
                            "defaults to true".format(addr))

            instance_proxy = self.get_instance_proxy(instance, addr)
            self.log.debug("Proxies used for {} - {}".format(
                addr, instance_proxy))

            auth = None
            if password is not None:
                if username is not None:
                    auth = (username, password)
                elif ntlm_domain is not None:
                    auth = HttpNtlmAuth(ntlm_domain, password)

            sess = requests.Session()
            sess.trust_env = False
            if weakcipher:
                base_addr = '{uri.scheme}://{uri.netloc}/'.format(
                    uri=parsed_uri)
                sess.mount(base_addr, WeakCiphersAdapter())
                self.log.debug(
                    "Weak Ciphers will be used for {}. Supported Cipherlist: {}"
                    .format(base_addr,
                            WeakCiphersHTTPSConnection.SUPPORTED_CIPHERS))

            with warnings.catch_warnings():
                # Suppress warnings from urllib3 only if disable_ssl_validation is explicitly set to True
                #  and ignore_ssl_warning is True
                if suppress_warning:
                    warnings.simplefilter('ignore', InsecureRequestWarning)

                # Add 'Content-Type' for non-GET requests when it has not been specified in custom headers
                if method.upper() in DATA_METHODS and not headers.get('Content-Type'):
                    headers['Content-Type'] = 'application/x-www-form-urlencoded'

                r = sess.request(
                    method.upper(),
                    addr,
                    auth=auth,
                    timeout=timeout,
                    headers=headers,
                    proxies=instance_proxy,
                    allow_redirects=allow_redirects,
                    stream=stream,
                    verify=False
                    if disable_ssl_validation else instance_ca_certs,
                    json=data if method.upper() in DATA_METHODS
                    and isinstance(data, dict) else None,
                    data=data if method.upper() in DATA_METHODS
                    and isinstance(data, string_types) else None,
                    cert=(client_cert,
                          client_key) if client_cert and client_key else None,
                )

        except (socket.timeout, requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as e:
            length = int((time.time() - start) * 1000)
            self.log.info(
                "{} is DOWN, error: {}. Connection failed after {} ms".format(
                    addr, str(e), length))
            service_checks.append(
                (self.SC_STATUS, Status.DOWN,
                 "{}. Connection failed after {} ms".format(str(e), length)))

        except socket.error as e:
            length = int((time.time() - start) * 1000)
            self.log.info(
                "{} is DOWN, error: {}. Connection failed after {} ms".format(
                    addr, repr(e), length))
            service_checks.append(
                (self.SC_STATUS, Status.DOWN,
                 "Socket error: {}. Connection failed after {} ms".format(
                     repr(e), length)))

        except Exception as e:
            length = int((time.time() - start) * 1000)
            self.log.error(
                "Unhandled exception {}. Connection failed after {} ms".format(
                    str(e), length))
            raise

        else:
            # Only add the URL tag if it's not already present
            if not any(filter(re.compile('^url:').match, tags_list)):
                tags_list.append('url:{}'.format(addr))

            # Only report this metric if the site is not down
            if response_time and not service_checks:
                # Stop the timer as early as possible
                running_time = time.time() - start
                self.gauge('network.http.response_time',
                           running_time,
                           tags=tags_list)

            content = r.text

            # Check HTTP response status code
            if not (service_checks or re.match(http_response_status_code,
                                               str(r.status_code))):
                if http_response_status_code == DEFAULT_EXPECTED_CODE:
                    expected_code = "1xx or 2xx or 3xx"
                else:
                    expected_code = http_response_status_code

                message = "Incorrect HTTP return code for url {}. Expected {}, got {}.".format(
                    addr, expected_code, str(r.status_code))

                if include_content:
                    message += '\nContent: {}'.format(content[:CONTENT_LENGTH])

                self.log.info(message)

                service_checks.append((self.SC_STATUS, Status.DOWN, message))

            if not service_checks:
                # Host is UP
                # Check content matching is set
                if content_match:
                    if re.search(content_match, content, re.UNICODE):
                        if reverse_content_match:
                            send_status_down(
                                '{} is found in return content with the reverse_content_match option'
                                .format(ensure_unicode(content_match)),
                                'Content "{}" found in response with the reverse_content_match'
                                .format(ensure_unicode(content_match)),
                            )
                        else:
                            send_status_up(
                                "{} is found in return content".format(
                                    ensure_unicode(content_match)))

                    else:
                        if reverse_content_match:
                            send_status_up(
                                "{} is not found in return content with the reverse_content_match option"
                                .format(ensure_unicode(content_match)))
                        else:
                            send_status_down(
                                "{} is not found in return content".format(
                                    ensure_unicode(content_match)),
                                'Content "{}" not found in response.'.format(
                                    ensure_unicode(content_match)),
                            )

                else:
                    send_status_up("{} is UP".format(addr))
        finally:
            if r is not None:
                r.close()

        # Report status metrics as well
        if service_checks:
            can_status = 1 if service_checks[0][1] == "UP" else 0
            self.gauge('network.http.can_connect', can_status, tags=tags_list)

            # cant_connect is useful for top lists
            cant_status = 0 if service_checks[0][1] == "UP" else 1
            self.gauge('network.http.cant_connect',
                       cant_status,
                       tags=tags_list)

        if ssl_expire and parsed_uri.scheme == "https":
            status, days_left, seconds_left, msg = self.check_cert_expiration(
                instance, timeout, instance_ca_certs, check_hostname,
                client_cert, client_key)
            tags_list = list(tags)
            tags_list.append('url:{}'.format(addr))
            tags_list.append("instance:{}".format(instance_name))
            self.gauge('http.ssl.days_left', days_left, tags=tags_list)
            self.gauge('http.ssl.seconds_left', seconds_left, tags=tags_list)

            service_checks.append((self.SC_SSL_CERT, status, msg))

        return service_checks
Example #23
def from_instance(instance, default_ca_certs=None):
    """
    Create a config object from an instance dictionary
    """
    method = instance.get('method', 'get')
    data = instance.get('data', {})
    tags = instance.get('tags', [])
    client_cert = instance.get('tls_cert') or instance.get('client_cert')
    client_key = instance.get('tls_private_key') or instance.get('client_key')
    http_response_status_code = str(
        instance.get('http_response_status_code', DEFAULT_EXPECTED_CODE))
    config_headers = instance.get('headers', {})
    default_headers = is_affirmative(
        instance.get("include_default_headers", True))
    if default_headers:
        headers = agent_headers({})
    else:
        headers = {}
    headers.update(config_headers)
    url = instance.get('url')
    if url is not None:
        url = ensure_unicode(url)
    content_match = instance.get('content_match')
    if content_match is not None:
        content_match = ensure_unicode(content_match)
    reverse_content_match = is_affirmative(
        instance.get('reverse_content_match', False))
    response_time = is_affirmative(instance.get('collect_response_time', True))
    if not url:
        raise ConfigurationError("Bad configuration. You must specify a url")
    if not url.startswith("http"):
        raise ConfigurationError(
            "The url {} must start with the scheme http or https".format(url))
    include_content = is_affirmative(instance.get('include_content', False))
    ssl_expire = is_affirmative(
        instance.get('check_certificate_expiration', True))
    instance_ca_certs = instance.get(
        'tls_ca_cert', instance.get('ca_certs', default_ca_certs))
    weakcipher = is_affirmative(instance.get('weakciphers', False))
    check_hostname = is_affirmative(instance.get('check_hostname', True))
    allow_redirects = is_affirmative(instance.get('allow_redirects', True))
    stream = is_affirmative(instance.get('stream', False))

    return Config(
        url,
        client_cert,
        client_key,
        method,
        data,
        http_response_status_code,
        include_content,
        headers,
        response_time,
        content_match,
        reverse_content_match,
        tags,
        ssl_expire,
        instance_ca_certs,
        weakcipher,
        check_hostname,
        allow_redirects,
        stream,
    )
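
`Config` itself is not shown here. Since callers unpack the return value positionally (compare the `check` method in Example #29), it is presumably a namedtuple whose field order matches the constructor call above; a hypothetical reconstruction:

from collections import namedtuple

# Inferred from the positional return above; the real definition lives
# alongside from_instance and must match this field order exactly.
Config = namedtuple(
    'Config',
    [
        'url', 'client_cert', 'client_key', 'method', 'data',
        'http_response_status_code', 'include_content', 'headers',
        'response_time', 'content_match', 'reverse_content_match', 'tags',
        'ssl_expire', 'instance_ca_certs', 'weakcipher', 'check_hostname',
        'allow_redirects', 'stream',
    ],
)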
Example #24
    def _collect_metrics_async(self, instance, query_specs):
        """ Task that collects the metrics listed in the morlist for one MOR
        """
        # ## <TEST-INSTRUMENTATION>
        t = Timer()
        # ## </TEST-INSTRUMENTATION>
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager
        results = perfManager.QueryPerf(query_specs)
        if results:
            for mor_perfs in results:
                mor_name = str(mor_perfs.entity)
                try:
                    mor = self.mor_cache.get_mor(i_key, mor_name)
                except MorNotFoundError:
                    self.log.error(
                        "Trying to get metrics from object %s deleted from the cache, skipping. "
                        "Consider increasing the parameter `clean_morlist_interval` to avoid that",
                        mor_name,
                    )
                    continue

                for result in mor_perfs.value:
                    counter_id = result.id.counterId
                    if not self.metadata_cache.contains(i_key, counter_id):
                        self.log.debug(
                            "Skipping value for counter %s, because there is no metadata about it",
                            ensure_unicode(counter_id),
                        )
                        continue

                    # Metric types are absolute, delta, and rate
                    metric_name = self.metadata_cache.get_metadata(
                        i_key, result.id.counterId).get('name')

                    if self.in_compatibility_mode(instance):
                        if metric_name not in ALL_METRICS:
                            self.log.debug("Skipping unknown `%s` metric.",
                                           ensure_unicode(metric_name))
                            continue

                    if not result.value:
                        self.log.debug(
                            "Skipping `%s` metric because the value is empty",
                            ensure_unicode(metric_name))
                        continue

                    instance_name = result.id.instance or "none"
                    value = self._transform_value(instance,
                                                  result.id.counterId,
                                                  result.value[0])

                    hostname = mor['hostname']

                    tags = [
                        'instance:{}'.format(ensure_unicode(instance_name))
                    ]
                    if not hostname:  # no host tags available
                        tags.extend(mor['tags'])
                    else:
                        hostname = to_string(hostname)

                    tags.extend(instance.get('tags', []))

                    # vsphere "rates" should be submitted as gauges (rate is
                    # precomputed).
                    self.gauge("vsphere.{}".format(
                        ensure_unicode(metric_name)),
                               value,
                               hostname=hostname,
                               tags=tags)

        # ## <TEST-INSTRUMENTATION>
        custom_tags = instance.get('tags', []) + ['instance:{}'.format(i_key)]
        self.histogram('datadog.agent.vsphere.metric_colection.time',
                       t.total(),
                       tags=custom_tags)
Example #25
    def _process_mor_objects_queue_async(self, instance, mors):
        """
        Process a batch of items popped from the objects queue by querying the available
        metrics for these MORs and then putting them in the Mor cache
        """
        t = time.time()
        i_key = self._instance_key(instance)
        server_instance = self._get_server_instance(instance)
        perfManager = server_instance.content.perfManager

        # For non realtime metrics, we need to specifically ask which counters are available for which entity,
        # so we call perfManager.QueryAvailablePerfMetric for each cluster, datacenter, datastore
        # This should be okay since the number of such entities shouldn't be excessively large
        for mor in mors:
            mor_name = str(mor['mor'])
            available_metrics = {m.counterId for m in perfManager.QueryAvailablePerfMetric(entity=mor["mor"])}
            try:
                self.mor_cache.set_metrics(i_key, mor_name, self._compute_needed_metrics(instance, available_metrics))
            except MorNotFoundError:
                self.log.error("Object '{}' is missing from the cache, skipping. ".format(ensure_unicode(mor_name)))
                continue

        # TEST-INSTRUMENTATION
        self.histogram(
            'datadog.agent.vsphere.morlist_process_atomic.time', time.time() - t, tags=instance.get('tags', [])
        )
Example #26
    def _compute_needed_metrics(self, instance, available_metrics):
        """ Compare the available metrics for one MOR we have computed and intersect them
        with the set of metrics we want to report
        """
        i_key = self._instance_key(instance)
        if self.in_compatibility_mode(instance):
            if instance.get('all_metrics', False):
                return available_metrics

            wanted_metrics = []
            # Get only the basic metrics
            for counter_id in available_metrics:
                # No cache yet, skip it for now
                if not self.metadata_cache.contains(i_key, counter_id):
                    self.log.debug(
                        "No metadata found for counter {}, will not collect it".format(ensure_unicode(counter_id))
                    )
                    continue
                metadata = self.metadata_cache.get_metadata(i_key, counter_id)
                if metadata.get('name') in BASIC_METRICS:
                    wanted_metrics.append(vim.PerformanceManager.MetricId(counterId=counter_id, instance="*"))

            return wanted_metrics
        else:
            # The metadata cache contains only metrics of the desired level, so use it to filter the metrics to keep
            return [
                vim.PerformanceManager.MetricId(counterId=counter_id, instance="*")
                for counter_id in available_metrics
                if self.metadata_cache.contains(i_key, counter_id)
            ]
Example #27
    def collect_metrics(self, instance):
        """
        Calls asynchronously _collect_metrics_async on all MORs, as the
        job queue is processed the Aggregator will receive the metrics.
        """
        i_key = self._instance_key(instance)
        if not self.mor_cache.contains(i_key):
            self.log.debug(
                "Not collecting metrics for instance '%s', nothing to do yet.",
                i_key)
            return

        server_instance = self._get_server_instance(instance)
        max_historical_metrics = DEFAULT_MAX_HIST_METRICS

        if self._should_collect_historical(instance):
            try:
                if 'max_query_metrics' in instance:
                    max_historical_metrics = int(instance['max_query_metrics'])
                    self.log.info("Collecting up to %d metrics",
                                  max_historical_metrics)
                else:
                    vcenter_settings = server_instance.content.setting.QueryOptions(
                        "config.vpxd.stats.maxQueryMetrics")
                    max_historical_metrics = int(vcenter_settings[0].value)
                if max_historical_metrics < 0:
                    max_historical_metrics = float('inf')
            except Exception:
                pass

        # TODO: Remove me once the fix for `max_query_metrics` is here by default
        mors_batch_method = (
            self.mor_cache.mors_batch
            if is_affirmative(instance.get('fix_max_query_metrics'))
            else self.mor_cache.legacy_mors_batch
        )

        vm_count = 0
        custom_tags = instance.get('tags', [])
        tags = [
            "vcenter_server:{}".format(ensure_unicode(instance.get('name')))
        ] + custom_tags

        n_mors = self.mor_cache.instance_size(i_key)
        if not n_mors:
            if self._is_main_instance(instance):
                self.gauge('vsphere.vm.count', vm_count, tags=tags)
            self.log.debug(
                "No Mor objects to process for instance '%s', skip...", i_key)
            return

        self.log.debug("Collecting metrics for %s mors",
                       ensure_unicode(n_mors))

        # Request metrics for several objects at once. We can limit the number of objects with batch_size
        # If batch_size is 0, process everything at once
        batch_size = self.batch_morlist_size or n_mors
        for batch in mors_batch_method(i_key, batch_size,
                                       max_historical_metrics):
            query_specs = []
            for mor in itervalues(batch):
                if mor['mor_type'] == 'vm':
                    vm_count += 1
                if mor['mor_type'] not in REALTIME_RESOURCES and (
                        'metrics' not in mor or not mor['metrics']):
                    continue

                query_spec = vim.PerformanceManager.QuerySpec()
                query_spec.entity = mor["mor"]
                query_spec.intervalId = mor.get("interval")
                query_spec.maxSample = 1
                if mor['mor_type'] in REALTIME_RESOURCES:
                    query_spec.metricId = self.metadata_cache.get_metric_ids(
                        i_key)
                else:
                    query_spec.metricId = mor["metrics"]
                query_specs.append(query_spec)

            if query_specs:
                self.pool.apply_async(self._collect_metrics_async,
                                      args=(instance, query_specs))

        if self._is_main_instance(instance):
            self.gauge('vsphere.vm.count', vm_count, tags=tags)
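
For context, a minimal standalone sketch of how a batch of QuerySpec objects is typically sent to vCenter with pyVmomi. The connection details are placeholders, and the container-view lookup stands in for the mor_cache the check uses above:

    import ssl
    from pyVim.connect import SmartConnect, Disconnect
    from pyVmomi import vim

    # Placeholder credentials -- not taken from the check's config
    context = ssl._create_unverified_context()
    si = SmartConnect(host='vcenter.example.com', user='admin', pwd='secret',
                      sslContext=context)
    content = si.content

    # Grab one VM to query; the check above pulls its MORs from mor_cache instead
    view = content.viewManager.CreateContainerView(
        content.rootFolder, [vim.VirtualMachine], True)
    vm = view.view[0]
    view.DestroyView()

    # Same QuerySpec shape as in collect_metrics: one sample of the realtime interval
    spec = vim.PerformanceManager.QuerySpec(entity=vm, intervalId=20, maxSample=1)

    # QueryPerf accepts a list of specs, which is why the check batches them
    for entity_metrics in content.perfManager.QueryPerf(querySpec=[spec]):
        for series in entity_metrics.value:
            print(series.id.counterId, list(series.value))

    Disconnect(si)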
Example #28
    def _parse_tags(self):
        self.tags = []
        self.tags.append(u'nfs_server:{0}'.format(ensure_unicode(self.nfs_server)))
        self.tags.append(u'nfs_export:{0}'.format(ensure_unicode(self.nfs_export)))
        self.tags.append(u'nfs_mount:{0}'.format(ensure_unicode(self.mount)))
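
A quick illustration of the tag format this produces, using made-up values (the real ones come from the NFS mount's configuration):

    # Placeholder values -- stand-ins for self.nfs_server, self.nfs_export, self.mount
    nfs_server, nfs_export, mount = '192.0.2.10', '/exports/data', '/mnt/data'
    tags = [
        u'nfs_server:{0}'.format(nfs_server),
        u'nfs_export:{0}'.format(nfs_export),
        u'nfs_mount:{0}'.format(mount),
    ]
    # ['nfs_server:192.0.2.10', 'nfs_export:/exports/data', 'nfs_mount:/mnt/data']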
Example #29
    def check(self, instance):
        (
            addr,
            client_cert,
            client_key,
            method,
            data,
            http_response_status_code,
            include_content,
            headers,
            response_time,
            content_match,
            reverse_content_match,
            tags,
            ssl_expire,
            instance_ca_certs,
            weakcipher,
            check_hostname,
            allow_redirects,
            stream,
        ) = from_instance(instance, self.ca_certs)
        timeout = self.http.options['timeout'][0]
        start = time.time()
        # allows default headers to be included based on `include_default_headers` flag
        self.http.options['headers'] = headers

        def send_status_up(log_msg):
            # TODO: A6 log needs bytes and cannot handle unicode
            self.log.debug(log_msg)
            service_checks.append((self.SC_STATUS, AgentCheck.OK, "UP"))

        def send_status_down(log_info, down_msg):
            # TODO: A6 log needs bytes and cannot handle unicode
            self.log.info(log_info)
            down_msg = self._include_content(include_content, down_msg, content)
            service_checks.append((self.SC_STATUS, AgentCheck.CRITICAL, down_msg))

        # Store tags in a temporary list so that we don't modify the global tags data structure
        tags_list = list(tags)
        tags_list.append('url:{}'.format(addr))
        instance_name = self.normalize_tag(instance['name'])
        tags_list.append("instance:{}".format(instance_name))
        service_checks = []
        service_checks_tags = self._get_service_checks_tags(instance)
        r = None
        try:
            parsed_uri = urlparse(addr)
            self.log.debug("Connecting to %s", addr)
            self.http.session.trust_env = False
            if weakcipher:
                base_addr = '{uri.scheme}://{uri.netloc}/'.format(
                    uri=parsed_uri)
                self.http.session.mount(base_addr, WeakCiphersAdapter())
                self.log.debug(
                    "Weak Ciphers will be used for %s. Supported Cipherlist: %s",
                    base_addr,
                    WeakCiphersHTTPSConnection.SUPPORTED_CIPHERS,
                )

            # Add 'Content-Type' for non GET requests when they have not been specified in custom headers
            if method.upper() in DATA_METHODS and not headers.get('Content-Type'):
                self.http.options['headers']['Content-Type'] = 'application/x-www-form-urlencoded'

            r = getattr(self.http, method.lower())(
                addr,
                persist=True,
                allow_redirects=allow_redirects,
                stream=stream,
                json=data if method.upper() in DATA_METHODS and isinstance(data, dict) else None,
                data=data if method.upper() in DATA_METHODS and isinstance(data, string_types) else None,
            )
        except (socket.timeout, requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as e:
            length = int((time.time() - start) * 1000)
            self.log.info(
                "%s is DOWN, error: %s. Connection failed after %s ms", addr,
                e, length)
            service_checks.append(
                (self.SC_STATUS, AgentCheck.CRITICAL,
                 "{}. Connection failed after {} ms".format(str(e), length)))

        except socket.error as e:
            length = int((time.time() - start) * 1000)
            self.log.info(
                "%s is DOWN, error: %s. Connection failed after %s ms", addr,
                repr(e), length)
            service_checks.append((
                self.SC_STATUS,
                AgentCheck.CRITICAL,
                "Socket error: {}. Connection failed after {} ms".format(
                    repr(e), length),
            ))
        except IOError as e:  # Py2 throws IOError on invalid cert path while py3 throws a socket.error
            length = int((time.time() - start) * 1000)
            self.log.info(
                "Host %s could not be reached: %s. Connection failed after %s ms",
                addr, repr(e), length)
            service_checks.append((
                self.SC_STATUS,
                AgentCheck.CRITICAL,
                "Socket error: {}. Connection failed after {} ms".format(
                    repr(e), length),
            ))
        except Exception as e:
            length = int((time.time() - start) * 1000)
            self.log.error(
                "Unhandled exception %s. Connection failed after %s ms", e,
                length)
            raise

        else:
            # Only add the URL tag if it's not already present
            if not any(t.startswith('url:') for t in tags_list):
                tags_list.append('url:{}'.format(addr))

            # Only report this metric if the site is not down
            if response_time and not service_checks:
                self.gauge('network.http.response_time',
                           r.elapsed.total_seconds(),
                           tags=tags_list)

            content = r.text

            # Check HTTP response status code
            if not (service_checks or re.match(http_response_status_code,
                                               str(r.status_code))):
                if http_response_status_code == DEFAULT_EXPECTED_CODE:
                    expected_code = "1xx or 2xx or 3xx"
                else:
                    expected_code = http_response_status_code

                message = "Incorrect HTTP return code for url {}. Expected {}, got {}.".format(
                    addr, expected_code, str(r.status_code))
                message = self._include_content(include_content, message,
                                                content)

                self.log.info(message)

                service_checks.append(
                    (self.SC_STATUS, AgentCheck.CRITICAL, message))

            if not service_checks:
                # Host is UP
                # Check content matching is set
                if content_match:
                    if re.search(content_match, content, re.UNICODE):
                        if reverse_content_match:
                            send_status_down(
                                '{} is found in return content with the reverse_content_match option'.format(
                                    ensure_unicode(content_match)),
                                'Content "{}" found in response with the reverse_content_match option'.format(
                                    ensure_unicode(content_match)),
                            )
                        else:
                            send_status_up("{} is found in return content".format(
                                ensure_unicode(content_match)))
                    else:
                        if reverse_content_match:
                            send_status_up("{} is not found in return content with the "
                                           "reverse_content_match option".format(
                                               ensure_unicode(content_match)))
                        else:
                            send_status_down(
                                "{} is not found in return content".format(
                                    ensure_unicode(content_match)),
                                'Content "{}" not found in response.'.format(
                                    ensure_unicode(content_match)),
                            )

                else:
                    send_status_up("{} is UP".format(addr))
        finally:
            if r is not None:
                r.close()
            # resets the wrapper Session object
            self.http._session.close()
            self.http._session = None

        # Report status metrics as well
        if service_checks:
            can_status = 1 if service_checks[0][1] == AgentCheck.OK else 0
            self.gauge('network.http.can_connect', can_status, tags=tags_list)

            # cant_connect is useful for top lists
            cant_status = 0 if service_checks[0][1] == AgentCheck.OK else 1
            self.gauge('network.http.cant_connect',
                       cant_status,
                       tags=tags_list)

        if ssl_expire and parsed_uri.scheme == "https":
            status, days_left, seconds_left, msg = self.check_cert_expiration(
                instance, timeout, instance_ca_certs, check_hostname,
                client_cert, client_key)
            tags_list = list(tags)
            tags_list.append('url:{}'.format(addr))
            tags_list.append("instance:{}".format(instance_name))
            self.gauge('http.ssl.days_left', days_left, tags=tags_list)
            self.gauge('http.ssl.seconds_left', seconds_left, tags=tags_list)

            service_checks.append((self.SC_SSL_CERT, status, msg))

        for sc_name, status, msg in service_checks:
            self.report_as_service_check(sc_name, status, service_checks_tags, msg)
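
The status-code check above is a regex match against the response code, which is what lets a single expected value cover whole classes of codes. A small standalone illustration, assuming the default pattern is along the lines of (1|2|3)\d\d (the real DEFAULT_EXPECTED_CODE lives in the check's constants):

    import re

    # Assumed pattern -- stand-in for the check's DEFAULT_EXPECTED_CODE
    DEFAULT_EXPECTED_CODE = r'(1|2|3)\d\d'

    for code in (200, 301, 404, 503):
        matched = bool(re.match(DEFAULT_EXPECTED_CODE, str(code)))
        print(code, 'matches' if matched else 'does not match')
    # 200 and 301 match; 404 and 503 do not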
Example #30
    def check(self, instance):
        sites = instance.get('sites')
        if sites is None:
            expected_sites = set()
        else:
            expected_sites = set(sites)
        # _Total should always be in the list of expected sites; we always
        # report _Total
        if "_Total" not in expected_sites:
            expected_sites.add("_Total")

        self.log.debug("expected sites is {}".format(str(expected_sites)))
        key = hash_mutable(instance)
        for inst_name, dd_name, metric_func, counter in self._metrics[key]:
            try:
                try:
                    vals = counter.get_all_values()
                except Exception as e:
                    self.log.error("Failed to get_all_values for %s %s: %s",
                                   inst_name, dd_name, e)
                    continue

                for sitename, val in iteritems(vals):
                    tags = []
                    if key in self._tags:
                        tags = list(self._tags[key])

                    try:
                        if not counter.is_single_instance():
                            # Skip any sites we don't specifically want,
                            # but always report the "_Total" rollup
                            if sites and sitename != "_Total" and sitename not in sites:
                                continue
                            tags.append("site:{0}".format(
                                ensure_unicode(self.normalize(sitename))))
                    except Exception as e:
                        self.log.error("Caught exception %s setting tags", e)
                    except Exception as e:
                        self.log.error(
                            "Caught exception {} setting tags".format(str(e)))

                    try:
                        metric_func(dd_name, val, tags)
                    except Exception as e:
                        self.log.error("metric_func failed for %s %s: %s",
                                       dd_name, val, e)

                    if dd_name == "iis.uptime":
                        uptime = int(val)
                        status = AgentCheck.CRITICAL if uptime == 0 else AgentCheck.OK
                        self.service_check(self.SERVICE_CHECK, status, tags)
                        if sitename in expected_sites:
                            self.log.debug(
                                "Removing {} from expected sites".format(
                                    sitename))
                            expected_sites.remove(sitename)
                        else:
                            self.log.warning("Site %s not in expected_sites",
                                             sitename)

            except Exception as e:
                # don't give up on all of the metrics because one failed
                self.log.error("IIS Failed to get metric data for %s %s: %s",
                               inst_name, dd_name, e)

        for site in expected_sites:
            tags = []
            if key in self._tags:
                tags = list(self._tags[key])
            tags.append("site:{}".format(ensure_unicode(self.normalize(site))))
            self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL, tags)
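
The final loop is plain set bookkeeping: every site that reported an uptime counter was removed from expected_sites, so whatever remains never reported and is flagged CRITICAL. A toy illustration with made-up site names:

    # Placeholder site names -- real ones come from the perf counters and config
    expected_sites = {"_Total", "Default Web Site", "Intranet"}
    reported = ["_Total", "Default Web Site"]

    for sitename in reported:
        expected_sites.discard(sitename)

    # Anything left never reported uptime, so it gets a CRITICAL service check
    print(expected_sites)  # {'Intranet'}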