Example #1
    def discover_api(self):
        # type: () -> Tuple[str, str]
        self.log.info(
            "Discovering Cloud Foundry API version and authentication endpoint"
        )
        try:
            res = self.http.get(self._api_url)
        except RequestException:
            self.log.exception("Error connecting to the API server")
            raise
        try:
            res.raise_for_status()
        except HTTPError:
            self.log.exception("Error querying API information: response: %s",
                               res.text)
            raise
        try:
            payload = res.json()
        except ValueError:
            self.log.exception("Error decoding API information: response: %s",
                               res.text)
            raise

        links = payload.get("links")
        if not links:
            raise CheckException(
                "Unable to inspect API information from payload {}".format(
                    payload))

        api_v3_version = "0.0.0"
        try:
            api_v3_version = links["cloud_controller_v3"]["meta"]["version"]
        except Exception:
            self.log.debug(
                "cloud_controller_v3 information not found, defaulting to v2")

        try:
            uaa_url = links["uaa"]["href"]
        except Exception:
            raise CheckException(
                "Unable to collect API version and/or UAA URL from links {}".format(links))

        api_version = "v2"
        if semver.parse_version_info(api_v3_version) >= MIN_V3_VERSION:
            api_version = "v3"
        self.log.info("Discovered API `%s` and UAA URL `%s`", api_version,
                      uaa_url)
        return api_version, uaa_url
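A minimal standalone sketch of the version gate above; MIN_V3_VERSION is assumed to be a semver.VersionInfo constant, and the 3.27.0 value here is illustrative, not taken from the source:

import semver

# Hypothetical stand-in for the check's real MIN_V3_VERSION constant.
MIN_V3_VERSION = semver.parse_version_info("3.27.0")

def pick_api_version(api_v3_version):
    # "0.0.0" (the fallback when cloud_controller_v3 is missing) always sorts below the minimum.
    if semver.parse_version_info(api_v3_version) >= MIN_V3_VERSION:
        return "v3"
    return "v2"

assert pick_api_version("0.0.0") == "v2"
assert pick_api_version("3.85.0") == "v3"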
Example #2
    def _create_gitlab_runner_prometheus_instance(self, instance, init_config):
        """
        Set up the gitlab_runner instance so it can be used in OpenMetricsBaseCheck
        """
        # Mapping from Prometheus metric names to Datadog ones
        # For now it's a 1:1 mapping
        allowed_metrics = init_config.get('allowed_metrics')
        if allowed_metrics is None:
            raise CheckException("At least one metric must be whitelisted in `allowed_metrics`.")

        # Users may want to only report the version
        # OpenMetricsCheck doesn't allow the metadata_metric_name to be one of the metrics
        if 'ci_runner_version_info' in allowed_metrics:
            allowed_metrics.remove('ci_runner_version_info')

        gitlab_runner_instance = deepcopy(instance)

        # gitlab_runner uses 'prometheus_endpoint' and not 'prometheus_url', so we have to rename the key
        gitlab_runner_instance['prometheus_url'] = instance.get('prometheus_endpoint', None)

        gitlab_runner_instance.update(
            {
                'namespace': 'gitlab_runner',
                'metrics': allowed_metrics,
                # Defaults that were set when gitlab_runner was based on PrometheusCheck
                'send_monotonic_counter': instance.get('send_monotonic_counter', False),
                'health_service_check': instance.get('health_service_check', False),
                'metadata_metric_name': 'ci_runner_version_info',
                'metadata_label_map': {'version': 'version'},
            }
        )

        return gitlab_runner_instance
Example #3
    def _run_socket_commands(parsed_url, commands):
        if parsed_url.scheme == 'tcp':
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            split_loc = parsed_url.netloc.split(':')
            host = split_loc[0]
            port = int(split_loc[1])
            sock.connect((host, port))
        else:
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            sock.connect(parsed_url.path)

        sock.sendall(b';'.join(commands) + b"\r\n")

        response = ""
        output = sock.recv(BUFSIZE)
        while output:
            response += output.decode("ASCII")
            output = sock.recv(BUFSIZE)
        sock.close()

        responses = response.split('\n\n')
        if len(responses) != len(commands) + 1 or responses[-1] != '':
            raise CheckException(
                "Got a different number of responses than expected")

        return tuple(r.splitlines() for r in responses[:len(commands)])
Example #4
    def check(self, instance):
        # Metrics collection
        endpoint = instance.get('prometheus_endpoint')
        if endpoint is None:
            raise CheckException(
                "Unable to find prometheus_endpoint in config file.")

        scraper_config = self.config_map[endpoint]
        custom_tags = instance.get('tags', [])

        try:
            self.process(scraper_config)
            self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME,
                               OpenMetricsBaseCheck.OK,
                               tags=custom_tags)
        except requests.exceptions.ConnectionError as e:
            # Unable to connect to the metrics endpoint
            self.service_check(
                self.PROMETHEUS_SERVICE_CHECK_NAME,
                OpenMetricsBaseCheck.CRITICAL,
                message="Unable to retrieve Prometheus metrics from endpoint {}: {}".format(endpoint, e),
                tags=custom_tags,
            )

        # Service check to check whether the Runner can talk to the Gitlab master
        self._check_connectivity_to_master(instance, custom_tags)
Example #5
    def check(self, instance):
        host, custom_tags, timeout, response_time = self._load_conf(instance)

        custom_tags.append("target_host:{}".format(host))

        try:
            lines = self._exec_ping(timeout, host)
            regex = re.compile(r"time[<=]((\d|\.)*)")
            result = regex.findall(lines)
            if result:
                length = result[0][0]
            else:
                raise CheckException("No time= found ({})".format(lines))

        except CheckException as e:
            self.log.info("%s is DOWN (%s)", host, e)
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               custom_tags,
                               message=str(e))
            self.gauge(self.SERVICE_CHECK_NAME, 0, custom_tags)

            raise

        if response_time:
            self.gauge("network.ping.response_time", length, custom_tags)

        self.log.debug("%s is UP", host)
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, custom_tags)
        self.gauge(self.SERVICE_CHECK_NAME, 1, custom_tags)
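The latency is pulled out of the raw ping output with the `time[<=]` regex; a self-contained sketch of that extraction (the sample output line is illustrative):

import re

sample = "64 bytes from 127.0.0.1: icmp_seq=1 ttl=64 time=0.045 ms"
result = re.compile(r"time[<=]((\d|\.)*)").findall(sample)
# findall returns one tuple per match because of the nested group;
# the first element of the first tuple is the captured time value.
assert result[0][0] == "0.045"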
Example #6
    def __init__(self, name, init_config, instances):
        super(MaprCheck, self).__init__(name, init_config, instances)
        self._conn = None
        self.hostname = self.instance.get('hostname', get_fqdn())
        self.streams_count = self.instance.get('streams_count', 1)
        self.topic_path = "{stream_path}/{stream_id}:{topic_name}".format(
            stream_path=self.instance.get('stream_path', DEFAULT_STREAM_PATH),
            stream_id=get_stream_id_for_topic(self.hostname,
                                              rng=self.streams_count),
            topic_name=self.hostname,
        )
        self.allowed_metrics = [
            re.compile(w) for w in self.instance.get('metric_whitelist', [])
        ]
        self.custom_tags = self.instance.get('tags', [])
        self.has_ever_submitted_metrics = False
        self._disable_legacy_cluster_tag = is_affirmative(
            self.instance.get('disable_legacy_cluster_tag', False))
        self.auth_ticket = self.instance.get(
            'ticket_location', os.environ.get(TICKET_LOCATION_ENV_VAR))

        if not self.auth_ticket:
            self.log.warning(
                "Neither `ticket_location` (in the config.yaml) or the %s environment variable is set. This will"
                "cause authentication issues if your cluster requires authenticated requests.",
                TICKET_LOCATION_ENV_VAR,
            )
        elif not os.access(self.auth_ticket, os.R_OK):
            raise CheckException(
                "MapR authentication ticket located at {} is not readable by the dd-agent "
                "user. Please update the file permissions.".format(self.auth_ticket)
            )
        else:
            os.environ[TICKET_LOCATION_ENV_VAR] = self.auth_ticket
Example #7
    def _create_gitlab_runner_prometheus_instance(self, instance, init_config):
        """
        Set up the gitlab_runner instance so it can be used in OpenMetricsBaseCheck
        """
        # Mapping from Prometheus metric names to Datadog ones
        # For now it's a 1:1 mapping
        allowed_metrics = init_config.get('allowed_metrics')
        if allowed_metrics is None:
            raise CheckException(
                "At least one metric must be whitelisted in `allowed_metrics`."
            )

        gitlab_runner_instance = deepcopy(instance)

        # gitlab_runner uses 'prometheus_endpoint' and not 'prometheus_url', so we have to rename the key
        gitlab_runner_instance['prometheus_url'] = instance.get(
            'prometheus_endpoint', None)

        gitlab_runner_instance.update(
            {
                'namespace': 'gitlab_runner',
                'metrics': allowed_metrics,
                # Defaults that were set when gitlab_runner was based on PrometheusCheck
                'send_monotonic_counter': instance.get('send_monotonic_counter', False),
                'health_service_check': instance.get('health_service_check', False),
            }
        )

        return gitlab_runner_instance
Example #8
    def check(self, instance):
        # Metrics collection
        endpoint = instance.get('prometheus_url',
                                instance.get('prometheus_endpoint'))
        if endpoint is None:
            raise CheckException(
                "Unable to find `prometheus_url` or `prometheus_endpoint` in config file."
            )

        scraper_config = self.config_map[endpoint]

        try:
            self.process(scraper_config)
            self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME,
                               OpenMetricsBaseCheck.OK, self._tags)
        except requests.exceptions.ConnectionError as e:
            # Unable to connect to the metrics endpoint
            self.service_check(
                self.PROMETHEUS_SERVICE_CHECK_NAME,
                OpenMetricsBaseCheck.CRITICAL,
                message="Unable to retrieve Prometheus metrics from endpoint {}: {}".format(endpoint, e),
            )

        # Service check to check Gitlab's health endpoints
        for check_type in self.ALLOWED_SERVICE_CHECKS:
            self._check_health_endpoint(instance, check_type)

        self.submit_version(instance)
Example #9
    def _exec_ping(self, timeout, target_host):
        if platform.system() == "Windows":  # pragma: nocover
            count_option = "-n"
            timeout_option = "-w"
            # The timeout option is in ms on Windows
            # https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/ping
            timeout = timeout * 1000
        elif platform.system() == "Darwin":
            count_option = "-c"
            timeout_option = "-W"  # Also in ms on macOS
            timeout = timeout * 1000
        else:
            # The timeout option is in seconds on Linux, leaving timeout as is
            # https://linux.die.net/man/8/ping
            count_option = "-c"
            timeout_option = "-W"

        self.log.debug("Running: ping %s %s %s %s %s", count_option, "1",
                       timeout_option, timeout, target_host)

        lines, err, retcode = get_subprocess_output(
            ["ping", count_option, "1", timeout_option, str(timeout), target_host],
            self.log,
            raise_on_empty_output=True,
        )
        self.log.debug("ping returned %s - %s - %s", retcode, lines, err)
        if retcode != 0:
            raise CheckException("ping returned {}: {}".format(retcode, err))

        return lines
Example #10
    def check(self, _):
        service_check_tags = ['dir_name:{}'.format(self._config.name)]
        service_check_tags.extend(self._config.tags)
        if not exists(self._config.abs_directory):
            msg = ("Either directory '{}' doesn't exist or the Agent doesn't "
                   "have permissions to access it, skipping.".format(
                       self._config.abs_directory))
            # report missing directory
            self.service_check(name=SERVICE_DIRECTORY_EXISTS,
                               status=self.WARNING,
                               tags=service_check_tags,
                               message=msg)

            # raise exception if `ignore_missing` is False
            if not self._config.ignore_missing:
                raise CheckException(msg)

            self.log.warning(msg)

            # return gracefully, nothing to look for
            return

        self.service_check(name=SERVICE_DIRECTORY_EXISTS,
                           tags=service_check_tags,
                           status=self.OK)
        self._get_stats()
Example #11
    def _get_json(self, url, path, tags):
        try:
            r = self._perform_request(url, path)
        except requests.exceptions.Timeout:
            self.service_check(
                self.SERVICE_CHECK_NAME,
                self.CRITICAL,
                message='Timeout when hitting {}'.format(url),
                tags=tags + ['url:{}'.format(url)],
            )
            raise
        except Exception as e:
            self.service_check(
                self.SERVICE_CHECK_NAME,
                self.CRITICAL,
                message='Error hitting {}. Error: {}'.format(url, str(e)),
                tags=tags + ['url:{}'.format(url)],
            )
            raise

        if r.status_code != 200:
            self.service_check(
                self.SERVICE_CHECK_NAME,
                self.CRITICAL,
                message='Got {} when hitting {}'.format(r.status_code, url),
                tags=tags + ['url:{}'.format(url)],
            )
            raise CheckException('Http status code {} on url {}'.format(
                r.status_code, url))

        return r.json()
Example #12
def get_schema_field(descriptors):
    # type: (List[Tuple[Any, str]]) -> str
    """Return column containing the schema name for that query."""
    for column, name in descriptors:
        if name == 'schema':
            return column
    raise CheckException("The descriptors are missing a schema field")
Example #13
    def check(self, _):
        kubelet_conn_info = get_connection_info()
        endpoint = kubelet_conn_info.get('url')
        if endpoint is None:
            raise CheckException(
                "Unable to detect the kubelet URL automatically: " +
                kubelet_conn_info.get('err', ''))

        self.pod_list_url = endpoint.strip("/") + POD_LIST_PATH
        self.kubelet_credentials = KubeletCredentials(kubelet_conn_info)

        if self.fargate_mode:
            pod_list = self.retrieve_pod_list()
            for pod in pod_list.get('items', []):
                pod_id = pod.get('metadata', {}).get('uid')
                tagger_tags = tagger.tag('kubernetes_pod_uid://%s' % pod_id,
                                         tagger.ORCHESTRATOR) or []
                tagger_tags.extend(self.tags)
                tags = set(tagger_tags)
                # Submit the heartbeat metric for fargate virtual nodes.
                self.gauge(self.NAMESPACE + '.pods.running', 1, tags)
                pod_annotations = pod.get('metadata', {}).get('annotations') or {}
                if CAPACITY_ANNOTATION_KEY not in pod_annotations:
                    continue
                cpu_val, mem_val = extract_resource_values(
                    pod_annotations.get(CAPACITY_ANNOTATION_KEY))
                if cpu_val == 0 or mem_val == 0:
                    continue
                self.gauge(self.NAMESPACE + '.cpu.capacity', cpu_val, tags)
                self.gauge(self.NAMESPACE + '.memory.capacity', mem_val, tags)
Example #14
    def check(self, instance):
        """Main method"""
        endpoints_def = instance.get('endpoints')
        if not endpoints_def:
            raise CheckException('The list of metric endpoints is empty')
        if not isinstance(endpoints_def, (list, tuple)):
            raise CheckException(
                'Incorrect value specified for the list of metric endpoints')

        metric_def = self.init_config.get('metric_definitions', ALL_METRICS)
        for endpoint in endpoints_def:
            metrics = metric_def.get(endpoint)
            if metrics is None:
                raise CheckException(
                    'Unknown metric endpoint: {}'.format(endpoint))
            self.check_endpoint(instance, endpoint, metrics)
Example #15
    def _run_socket_commands(parsed_url, commands):
        if parsed_url.scheme == 'tcp':
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            split_loc = parsed_url.netloc.split(':')
            host = split_loc[0]
            port = int(split_loc[1])
            sock.connect((host, port))
        else:
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            sock.connect(parsed_url.path)

        sock.sendall(b';'.join(commands) + b"\r\n")

        response = ""
        output = sock.recv(BUFSIZE)
        while output:
            response += output.decode("ASCII")
            output = sock.recv(BUFSIZE)
        sock.close()

        responses = [r.strip() for r in response.split('\n\n') if r.strip()]

        if len(responses) != len(commands):
            raise CheckException("Expected {} responses, got {}".format(
                len(commands), len(responses)))

        return tuple(r.splitlines() for r in responses)
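Compared with Example #3, this variant strips empty chunks before counting, so a missing or doubled trailing blank line no longer trips the length check. A hedged invocation sketch (the address and commands are illustrative, and BUFSIZE must be defined by the surrounding module):

from urllib.parse import urlparse

# Hypothetical call against a stats socket that speaks this blank-line-delimited protocol.
parsed = urlparse("tcp://localhost:9001")
info_lines, stat_lines = _run_socket_commands(parsed, [b"show info", b"show stat"])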
Example #16
    def __init__(self, name, init_config, instances):
        super(AerospikeCheck, self).__init__(name, init_config, instances)

        if not aerospike:
            msg = 'The `aerospike` client is not installed: {}'.format(aerospike_exception)
            self.log.error(msg)
            raise CheckException(msg)

        # https://www.aerospike.com/apidocs/python/aerospike.html#aerospike.client
        host = self.instance.get('host', 'localhost')
        port = int(self.instance.get('port', 3000))
        tls_name = self.instance.get('tls_name')
        self._host = (host, port, tls_name) if tls_name else (host, port)
        self._tls_config = self.instance.get('tls_config')
        if self._tls_config:
            self._tls_config['enable'] = True

        # https://www.aerospike.com/apidocs/python/client.html#aerospike.Client.connect
        self._username = self.instance.get('username')
        self._password = self.instance.get('password')

        # In milliseconds, see https://www.aerospike.com/apidocs/python/client.html#aerospike-info-policies
        timeout = int(self.instance.get('timeout', 10)) * 1000
        self._info_policies = {'timeout': timeout}

        self._metrics = set(self.instance.get('metrics', []))
        self._namespace_metrics = set(self.instance.get('namespace_metrics', []))
        self._required_namespaces = self.instance.get('namespaces')
        self._datacenter_metrics = set(self.instance.get('datacenter_metrics', []))
        self._required_datacenters = self.instance.get('datacenters')
        self._rate_metrics = set(self.init_config.get('mappings', []))
        self._tags = self.instance.get('tags', [])

        # We'll connect on the first check run
        self._client = None
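A minimal instance dict exercising the options read above (all values are illustrative):

instance = {
    'host': 'localhost',
    'port': 3000,
    'timeout': 10,  # seconds; converted to milliseconds for the info policies
    'metrics': ['cluster_size'],
    'namespace_metrics': ['objects'],
    'tags': ['env:dev'],
}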
Example #17
    def fetch_all_values(cls, cursor, counters_list, logger, databases=None):
        # Special case: this table is per-database, so the query must be run once for each database
        rows = []
        columns = []

        if databases is None:
            databases = []

        cursor.execute(
            'select DB_NAME()'
        )  # This can return None in some implementations so it cannot be chained
        data = cursor.fetchall()
        current_db = data[0][0]
        logger.debug("%s: current db is %s", cls.__name__, current_db)

        for db in databases:
            # USE statements need to be executed separately from SELECT queries
            ctx = construct_use_statement(db)
            try:
                logger.debug(
                    "%s: changing cursor context via use statement: %s",
                    cls.__name__, ctx)
                cursor.execute(ctx)
                logger.debug("%s: fetch_all executing query: %s", cls.__name__,
                             cls.QUERY_BASE)
                cursor.execute(cls.QUERY_BASE)
                data = cursor.fetchall()
            except Exception as e:
                logger.warning(
                    "Error when trying to query db %s - skipping.  Error: %s",
                    db, e)
                continue

            query_columns = ['database'] + [i[0] for i in cursor.description]
            if columns:
                if columns != query_columns:
                    raise CheckException('Assertion error: {} != {}'.format(
                        columns, query_columns))
            else:
                columns = query_columns

            results = []
            # insert database name as new column for each row
            for row in data:
                r = list(row)
                r.insert(0, db)
                results.append(r)

            rows.extend(results)

            logger.debug("%s: received %d rows and %d columns for db %s",
                         cls.__name__, len(data), len(columns), db)

        # reset back to previous db
        logger.debug("%s: reverting cursor context via use statement to %s",
                     cls.__name__, current_db)
        cursor.execute(construct_use_statement(current_db))

        return rows, columns
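The `construct_use_statement` helper is referenced but not shown; a plausible minimal sketch, assuming it only emits a T-SQL context switch (the bracket quoting is an assumption):

def construct_use_statement(database):
    # Switch the cursor's database context; brackets guard names containing special characters.
    return 'use [{}]'.format(database)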
Example #18
    def check(self, instance):
        if self.kube_apiserver_config is None:
            self.kube_apiserver_config = self.get_scraper_config(instance)

        if not self.kube_apiserver_config['metrics_mapper']:
            url = self.kube_apiserver_config['prometheus_url']
            raise CheckException("You have to collect at least one metric from the endpoint: {}".format(url))
        self.process(self.kube_apiserver_config, metric_transformers=self.metric_transformers)
Example #19
    def check(self, instance):
        url = instance.get('url', '')
        default_timeout = instance.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))
        tags = instance.get('tags', [])

        if not url:
            raise CheckException("Configuration error, please fix conf.yaml")

        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.Timeout:
            raise CheckException('URL: {0} timed out after {1} seconds.'.format(url, timeout))
        except requests.exceptions.ConnectionError as e:
            raise CheckException(e)

        if r.status_code != 200:
            raise CheckException('Invalid Status Code, {0} returned a status of {1}.'.format(url, r.status_code))

        try:
            stats = json.loads(r.text)
        except ValueError:
            raise CheckException('{0} returned an unserializable payload'.format(url))

        for key, val in iteritems(stats):
            if key in self.REPL_STATS:
                self.safe_submit_metric("riak_repl." + key, val, tags=tags)

        if stats['realtime_enabled'] is not None:
            for key, val in iteritems(stats['realtime_queue_stats']):
                if key in self.REALTIME_QUEUE_STATS:
                    self.safe_submit_metric("riak_repl.realtime_queue_stats."
                                            + key, val, tags=tags)

        for c in stats['connected_clusters']:
            cluster = c.replace("-", "_")
            if c not in stats['fullsync_coordinator']:
                continue
            for key, val in iteritems(stats['fullsync_coordinator'][c]):
                if key in self.FULLSYNC_COORDINATOR:
                    self.safe_submit_metric("riak_repl.fullsync_coordinator."
                                            + cluster + "." + key,
                                            val, tags=tags)
Example #20
    def check(self, instance):
        host, port, user, password, timeout, server_name = self._get_config(
            instance)
        tags = instance.get('tags', [])
        tags.append('server_name:{}'.format(server_name))
        service_check_tags = tags + ['url:{}'.format(host)]
        auth = (user, password)

        # Neo specific
        # Create payload using built-in Neo4j queryJmx stored procedure
        payload = {
            "statements": [{
                "statement": "CALL dbms.queryJmx('org.neo4j:*') yield attributes with  "
                             "keys(attributes) as k, attributes unwind k as "
                             "row return row, attributes[row]['value'];"
            }]
        }
        try:
            version = self._get_version(host, port, timeout, auth,
                                        service_check_tags)

            if version > 2:
                check_url = "{}:{}/db/data/transaction/commit".format(
                    host, port)
            else:
                check_url = "{}:{}/v1/service/metrics".format(host, port)
            r = requests.post(check_url,
                              auth=auth,
                              json=payload,
                              timeout=timeout)
        except Exception as e:
            msg = "Unable to fetch Neo4j stats: {}".format(e)
            self._critical_service_check(service_check_tags, msg)
            raise CheckException(msg)

        if r.status_code != 200:
            msg = "Unexpected status of {0} when fetching Neo4j stats, response: {1}"
            msg = msg.format(r.status_code, r.text)
            self._critical_service_check(service_check_tags, msg)
            r.raise_for_status()

        stats = r.json()
        self.service_check(self.SERVICE_CHECK_NAME,
                           AgentCheck.OK,
                           tags=service_check_tags)

        for doc in stats['results'][0]['data']:
            name = doc['row'][0].lower()
            if name in self.keys:
                try:
                    self.gauge(self.display.get(name, ""),
                               doc['row'][1],
                               tags=tags)
                except (TypeError, ValueError):
                    continue
Example #21
    def _check_health_endpoint(self, instance, check_type):
        if check_type not in self.ALLOWED_SERVICE_CHECKS:
            raise CheckException(
                "Health endpoint {} is not a valid endpoint".format(
                    check_type))

        url = instance.get('gitlab_url')

        if url is None:
            # Simply ignore this service check if not configured
            self.log.debug(
                "gitlab_url not configured, service check %s skipped",
                check_type)
            return

        # These define which endpoint is hit and which type of check is actually performed
        # TODO: parse errors and report for single sub-service failure?
        service_check_name = 'gitlab.{}'.format(check_type)
        check_url = '{}/-/{}'.format(url, check_type)

        try:
            self.log.debug("checking %s against %s", check_type, check_url)
            r = self.http.get(check_url)
            if r.status_code != 200:
                self.service_check(
                    service_check_name,
                    OpenMetricsBaseCheck.CRITICAL,
                    message="Got {} when hitting {}".format(
                        r.status_code, check_url),
                    tags=self._tags,
                )
                raise Exception("Http status code {} on check_url {}".format(
                    r.status_code, check_url))
            else:
                r.raise_for_status()

        except requests.exceptions.Timeout:
            # If there's a timeout
            self.service_check(
                service_check_name,
                OpenMetricsBaseCheck.CRITICAL,
                message="Timeout when hitting {}".format(check_url),
                tags=self._tags,
            )
            raise
        except Exception as e:
            self.service_check(
                service_check_name,
                OpenMetricsBaseCheck.CRITICAL,
                message="Error hitting {}. Error: {}".format(check_url, e),
                tags=self._tags,
            )
            raise

        else:
            self.service_check(service_check_name, OpenMetricsBaseCheck.OK,
                               self._tags)
        self.log.debug("gitlab check %s succeeded", check_type)
Example #22
    def addrs(self):
        if self._addrs is None or self._addrs == []:
            try:
                self.resolve_ips()
            except Exception as e:
                self.log.error(str(e))
                msg = "URL: {} could not be resolved".format(self.host)
                raise CheckException(msg)
        return self._addrs
Example #23
    def _process_error(self, error_msg):
        if error_msg.code() == ck.KafkaError.TOPIC_AUTHORIZATION_FAILED:
            if self.auth_ticket:
                raise CheckException(
                    "The user impersonated using the ticket %s does not have the 'consume' permission on topic %s. "
                    "Please update the stream permissions." %
                    (self.auth_ticket, self.topic_path))
            else:
                raise CheckException(
                    "dd-agent user could not consume topic '%s'. Please ensure that:\n"
                    "\t* This is a non-secure cluster, otherwise a user ticket is required.\n"
                    "\t* The dd-agent user has the 'consume' permission on topic %s or "
                    "impersonation is correctly configured." %
                    (self.topic_path, self.topic_path))
        elif error_msg.code() != ck.KafkaError._PARTITION_EOF:
            # Partition EOF is expected anytime we reach the end of one partition in the topic.
            # This is expected at least once per partition per check run.
            raise CheckException(error_msg)
Example #24
    def get_api_json(self, url):
        try:
            key = self.api_key
            headers = {"X-Api-Key": key, "content-type": "application/json"}
            response = self.http.get(url, headers=headers)
        except Timeout as e:
            error_message = "Request timeout: {}, {}".format(url, e)
            self.log.warning(error_message)
            self.service_check(
                "can_connect",
                AgentCheck.CRITICAL,
                message=error_message,
            )
            raise

        except (HTTPError, InvalidURL, ConnectionError) as e:
            error_message = "Request failed: {}, {}".format(url, e)
            self.log.warning(error_message)
            self.service_check(
                "can_connect",
                AgentCheck.CRITICAL,
                message=error_message,
            )
            raise

        except JSONDecodeError as e:
            error_message = "JSON Parse failed: {}, {}".format(url, e)
            self.log.warning(error_message)
            self.service_check(
                "can_connect",
                AgentCheck.CRITICAL,
                message=error_message,
            )
            raise

        except ValueError as e:
            error_message = str(e)
            self.log.warning(error_message)
            self.service_check("can_connect",
                               AgentCheck.CRITICAL,
                               message=error_message)
            raise

        if response.status_code != 200:
            error_message = (
                "Expected status code 200 for url {}, but got status code {}. "
                "Check your config information.".format(url, response.status_code)
            )
            self.log.warning(error_message)
            self.service_check("can_connect",
                               AgentCheck.CRITICAL,
                               message=error_message)
            raise CheckException(error_message)
        else:
            self.service_check("can_connect", AgentCheck.OK)

        return response.json()
Example #25
    def get_connection(self):
        if HanaConnection is None:
            raise CheckException(
                "hdbcli is not installed. Check the integration documentation to install it."
            )
        # https://help.sap.com/viewer/f1b440ded6144a54ada97ff95dac7adf/2.10/en-US/ee592e89dcce4480a99571a4ae7a702f.html
        connection_properties = self.instance.get('connection_properties',
                                                  {}).copy()

        connection_properties.setdefault('address', self._server)
        connection_properties.setdefault('port', self._port)
        connection_properties.setdefault('user', self._username)
        connection_properties.setdefault('password', self._password)

        timeout_milliseconds = int(self._timeout * 1000)
        connection_properties.setdefault('communicationTimeout',
                                         timeout_milliseconds)
        connection_properties.setdefault('nodeConnectTimeout',
                                         timeout_milliseconds)

        if self._use_tls:
            connection_properties.setdefault('encrypt', True)
            connection_properties.setdefault('sslHostNameInCertificate',
                                             self._server)
            connection_properties.setdefault('sslSNIHostname', self._server)

            tls_verify = self.instance.get('tls_verify', True)
            if not tls_verify:
                connection_properties.setdefault('sslValidateCertificate',
                                                 False)

            tls_cert = self.instance.get('tls_cert')
            if tls_cert:
                connection_properties.setdefault('sslKeyStore', tls_cert)

            tls_ca_cert = self.instance.get('tls_ca_cert')
            if tls_ca_cert:
                connection_properties.setdefault('sslTrustStore', tls_ca_cert)
            elif not connection_properties.get('sslUseDefaultTrustStore',
                                               True):
                connection_properties.setdefault('sslTrustStore',
                                                 certifi.where())

        try:
            connection = HanaConnection(**connection_properties)
        except Exception as e:
            error = str(e)
            self.log.error('Unable to connect to SAP HANA: %s', error)
            self.service_check(self.SERVICE_CHECK_CONNECT,
                               self.CRITICAL,
                               message=error,
                               tags=self._tags)
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT,
                               self.OK,
                               tags=self._tags)
            return connection
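Note that on a failed connect this method submits a CRITICAL service check and implicitly returns None instead of raising, so callers have to guard. A hedged usage sketch assuming the standard hdbcli cursor API:

connection = check.get_connection()  # `check` is a hypothetical configured check instance
if connection is not None:
    cursor = connection.cursor()
    cursor.execute('SELECT 1 FROM DUMMY')  # DUMMY is SAP HANA's built-in one-row table
    print(cursor.fetchone())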
Example #26
    def check(self, instance):
        """
        Process all the endpoints associated with this instance.
        All the endpoints themselves are optional, but at least one must be passed.
        """
        processed = False
        # Get the config for the istio_mesh instance
        istio_mesh_endpoint = instance.get('istio_mesh_endpoint')
        if istio_mesh_endpoint:
            istio_mesh_config = self.config_map[istio_mesh_endpoint]

            # Process istio_mesh
            self.process(istio_mesh_config)
            processed = True

        # Get the config for the process_mixer instance
        process_mixer_endpoint = instance.get('mixer_endpoint')
        if process_mixer_endpoint:
            process_mixer_config = self.config_map[process_mixer_endpoint]

            # Process process_mixer
            self.process(process_mixer_config)
            processed = True

        # Get the config for the process_pilot instance
        process_pilot_endpoint = instance.get('pilot_endpoint')
        if process_pilot_endpoint:
            process_pilot_config = self.config_map[process_pilot_endpoint]

            # Process process_pilot
            self.process(process_pilot_config)
            processed = True

        # Get the config for the process_galley instance
        process_galley_endpoint = instance.get('galley_endpoint')
        if process_galley_endpoint:
            process_galley_config = self.config_map[process_galley_endpoint]

            # Process process_galley
            self.process(process_galley_config)
            processed = True

        # Get the config for the process_citadel instance
        process_citadel_endpoint = instance.get('citadel_endpoint')
        if process_citadel_endpoint:
            process_citadel_config = self.config_map[process_citadel_endpoint]

            # Process process_citadel
            self.process(process_citadel_config)
            processed = True

        # Check that at least 1 endpoint is configured
        if not processed:
            raise CheckException(
                "At least one of Mixer, Mesh, Pilot, Galley or Citadel endpoints must be configured"
            )
Example #27
    def check(self, instance):
        try:
            region_name = instance.get('region_name')
            if not region_name:
                region_name = 'us-east-1'

            pricing_client = boto3.client('pricing', region_name=region_name)

            service_codes = get_aws_service_codes(pricing_client)
            rate_codes_dict = get_rate_codes_dict_from_instance(service_codes, instance)

            # Python dictionaries evaluate to false when empty
            if not rate_codes_dict:
                message = 'No rate codes for existing AWS services were defined, please fix conf.yaml'
                self.service_check('aws_pricing.status', self.CRITICAL, message=message)
                raise CheckException(message)

            missing_rate_codes = defaultdict(list)

            for service_code, rate_codes in iteritems(rate_codes_dict):
                for rate_code in rate_codes:
                    price_dimensions = get_aws_prices(pricing_client, service_code, rate_code)

                    if price_dimensions is None:
                        missing_rate_codes[service_code].append(rate_code)
                        continue

                    name = 'aws.pricing.{}'.format(service_code.lower())
                    price = get_price_from_price_dimensions(price_dimensions)
                    tags = get_tags_from_price_dimensions(price_dimensions)

                    self.gauge(name, price, tags)

            # Python dictionaries evaluate to true when not empty
            if not missing_rate_codes:
                self.service_check('aws_pricing.status', self.OK)
            else:
                message = 'Pricing data not found for these service rate codes: {}'.format(dict(missing_rate_codes))
                self.service_check('aws_pricing.status', self.WARNING, message=message)

        except ClientError as client_error:
            self.service_check('aws_pricing.status', self.CRITICAL, message=str(client_error))
            raise CheckException('Pricing Service client error: {}'.format(str(client_error)))
Example #28
    def _load_conf(self, instance):
        # Fetches the conf
        timeout = float(instance.get("timeout", 4))
        response_time = instance.get("collect_response_time", False)
        custom_tags = instance.get("tags", [])

        host = instance.get("host", None)
        if host is None:
            raise CheckException("A valid host must be specified")

        return host, custom_tags, timeout, response_time
Example #29
    def get_clusters(self, base_url):
        clusters_endpoint = common.CLUSTERS_URL.format(base_url=base_url)

        resp = self._make_request(clusters_endpoint)
        if resp is None:
            self._submit_service_checks("can_connect", self.CRITICAL, ["url:{}".format(base_url)])
            raise CheckException(
                "Couldn't connect to URL: {}. Please verify the address is reachable".format(clusters_endpoint)
            )

        self._submit_service_checks("can_connect", self.OK, ["url:{}".format(base_url)])
        return self._get_response_clusters(resp)
Example #30
    def check(self, _):
        if ck is None:
            raise CheckException(
                "confluent_kafka was not imported correctly, make sure the library is installed and that you've "
                "set LD_LIBRARY_PATH correctly. Please refer to datadog documentation for more details. Error is %s"
                % ck_import_error)

        try:
            conn = self.get_connection()
        except Exception:
            self.service_check(
                SERVICE_CHECK, AgentCheck.CRITICAL,
                self.custom_tags + ['topic:{}'.format(self.topic_path)])
            raise
        else:
            self.service_check(
                SERVICE_CHECK, AgentCheck.OK,
                self.custom_tags + ['topic:{}'.format(self.topic_path)])

        submitted_metrics_count = 0

        while True:
            # Collecting one message at a time has no impact on performance because the library
            # batches data. Most calls to `poll` won't initiate an I/O connection.
            msg = conn.poll(timeout=0.5)
            if msg is None:
                # Timed out, no more messages
                break

            if msg.error() is None:
                # Metric received
                submitted_metrics_count += self._process_metric(msg)
            else:
                self._process_error(msg.error())

        if not self.has_ever_submitted_metrics:
            # The integration has never found any metric so far
            if submitted_metrics_count:
                self.has_ever_submitted_metrics = True
                self.log.info(
                    "The integration collected metrics for the first time in topic %s",
                    self.topic_path)
            else:
                self.log.error(
                    "The integration was not yet able to collect any MapR metric in topic %s. If this error continues "
                    "after a few check runs, double-check the existence of the stream and the topic using "
                    "maprcli as well as the permissions on this topic.",
                    self.topic_path,
                )

        if submitted_metrics_count:
            self.gauge(METRICS_SUBMITTED_METRIC_NAME, submitted_metrics_count,
                       self.custom_tags)