def get_connection(self, key, host, port, user, password, dbname, ssl, connect_fct, tags, use_cached=True):
    "Get and memoize connections to instances"
    if key in self.dbs and use_cached:
        return self.dbs[key]
    elif host != "" and user != "":
        try:
            if host == 'localhost' and password == '':
                # Use ident method
                connection = connect_fct("user=%s dbname=%s" % (user, dbname))
            elif port != '':
                connection = connect_fct(host=host, port=port, user=user,
                                         password=password, database=dbname, ssl=ssl)
            elif host.startswith('/'):
                # If the hostname starts with /, it's probably a path
                # to a UNIX socket. This is similar behaviour to psql
                connection = connect_fct(unix_sock=host, user=user,
                                         password=password, database=dbname)
            else:
                connection = connect_fct(host=host, user=user, password=password,
                                         database=dbname, ssl=ssl)
        except Exception as e:
            message = u'Error establishing postgres connection: %s' % (str(e))
            service_check_tags = self._get_service_check_tags(host, port, tags)
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               tags=service_check_tags, message=message)
            raise
    else:
        if not host:
            raise CheckException("Please specify a Postgres host to connect to.")
        elif not user:
            raise CheckException("Please specify a user to connect to Postgres as.")

    self.dbs[key] = connection
    return connection
def check(self, instance):
    service_check_metric_name = 'nifi.instance.http_check'
    timeout = 10

    if 'url' not in instance:
        raise CheckException("No url defined for Nifi instance")

    url = instance.get('url')
    url = "{0}/{1}".format(url, ENDPOINT)
    instance_tags = instance.get('tags', [])

    self.log.info('Connecting to Nifi instance {0}'.format(url))
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.exceptions.Timeout as e:
        self.service_check(service_check_metric_name, self.WARNING,
                           tags=instance_tags, message=str(e))
        return
    except Exception as e:
        self.service_check(service_check_metric_name, self.CRITICAL,
                           tags=instance_tags)
        raise CheckException(e)

    self.service_check(service_check_metric_name, self.OK, tags=instance_tags)

    # Obtain all the key metrics from Nifi to send to DataDog
    for point in NiFiCheck.get_system_metrics(r.json()):
        if type(point.metric) is int:
            self.rate(point.type, point.metric, tags=instance_tags)
        else:
            self.gauge(point.type, point.metric, tags=instance_tags)
        time.sleep(1)
def _get_custom_metrics(self, custom_metrics, key):
    # Pre-processed cached custom_metrics
    if key in self.custom_metrics:
        return self.custom_metrics[key]

    # Otherwise pre-process custom metrics and verify definition
    required_parameters = ("descriptors", "metrics", "query", "relation")

    for m in custom_metrics:
        for param in required_parameters:
            if param not in m:
                raise CheckException(
                    "Missing {0} parameter in custom metric".format(param))

        self.log.debug("Metric: {0}".format(m))

        try:
            for ref, (_, mtype) in m['metrics'].iteritems():
                cap_mtype = mtype.upper()
                if cap_mtype not in ('RATE', 'GAUGE', 'MONOTONIC'):
                    raise CheckException(
                        "Collector method {0} is not known. "
                        "Known methods are RATE, GAUGE, MONOTONIC".format(cap_mtype))

                m['metrics'][ref][1] = getattr(PostgreSql, cap_mtype)
                self.log.debug("Method: %s" % (str(mtype)))
        except Exception as e:
            raise CheckException(
                "Error processing custom metric '{}': {}".format(m, e))

    self.custom_metrics[key] = custom_metrics
    return custom_metrics
def _get_connect_kwargs(self, host, port, user, password, database_url):
    """
    Get the params to pass to psycopg2.connect() based on passed-in vals
    from yaml settings file
    """
    if database_url:
        return {'dsn': database_url}

    if not host:
        raise CheckException(
            "Please specify a PgBouncer host to connect to.")
    if not user:
        raise CheckException(
            "Please specify a user to connect to PgBouncer as.")

    if host in ('localhost', '127.0.0.1') and password == '':
        return {
            # Use ident method
            'dsn': "user={} dbname={}".format(user, self.DB_NAME)
        }

    if port:
        return {'host': host, 'user': user, 'password': password,
                'database': self.DB_NAME, 'port': port}

    return {'host': host, 'user': user, 'password': password,
            'database': self.DB_NAME}
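# --- Illustrative usage sketch (not part of the original check) ---
# The dict returned by _get_connect_kwargs is meant to be unpacked into
# psycopg2.connect(), as the docstring above notes. The helper name and the
# placeholder values below are hypothetical.
import psycopg2


def open_pgbouncer_connection(check):
    connect_kwargs = check._get_connect_kwargs(
        host='localhost', port='', user='pgbouncer', password='', database_url=None)
    # With these placeholder values the ident branch is taken, so the kwargs
    # contain a single 'dsn' entry built from the user and check.DB_NAME.
    return psycopg2.connect(**connect_kwargs)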
def _send_service_check(self, url, response, status, failure_expected=False, tags=None, message=None):
    if status is AgentCheck.CRITICAL and failure_expected:
        status = AgentCheck.OK
        message = "Got %s when hitting %s" % (response.status_code, url)
        raise CheckException(message)
    elif status is AgentCheck.CRITICAL and not failure_expected:
        raise CheckException('Cannot connect to mesos. Error: {0}'.format(message))

    if self.service_check_needed:
        self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=message)
        self.service_check_needed = False
def get_metadata(self, type):
    try:
        response = requests.get(
            "http://169.254.169.254/latest/meta-data/{}".format(type))
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        raise CheckException("HTTP error caught: {}".format(e))
    except requests.exceptions.RequestException as e:
        raise CheckException("Connection error: {}".format(e))

    return response.text
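# --- Illustrative usage sketch (not part of the original check) ---
# get_metadata() fetches a single path under the EC2 instance metadata service
# at 169.254.169.254. 'instance-id' and 'instance-type' are standard metadata
# paths; the helper function itself is hypothetical.
def collect_basic_instance_metadata(check):
    return {
        'instance-id': check.get_metadata('instance-id'),
        'instance-type': check.get_metadata('instance-type'),
    }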
def check(self, instance):
    try:
        region_name = instance.get('region_name')
        if not region_name:
            region_name = 'us-east-1'

        pricing_client = boto3.client('pricing', region_name=region_name)

        service_codes = get_aws_service_codes(pricing_client)
        rate_codes_dict = get_rate_codes_dict_from_instance(service_codes, instance)

        # Python dictionaries evaluate to false when empty
        if not rate_codes_dict:
            message = 'No rate codes for existing AWS services were defined, please fix conf.yaml'
            self.service_check('aws_pricing.status', self.CRITICAL, message=message)
            raise CheckException(message)

        missing_rate_codes = defaultdict(list)

        for service_code, rate_codes in iteritems(rate_codes_dict):
            for rate_code in rate_codes:
                price_dimensions = get_aws_prices(pricing_client, service_code, rate_code)

                if price_dimensions is None:
                    missing_rate_codes[service_code].append(rate_code)
                    continue

                name = 'aws.pricing.{}'.format(service_code.lower())
                price = get_price_from_price_dimensions(price_dimensions)
                tags = get_tags_from_price_dimensions(price_dimensions)

                self.gauge(name, price, tags)

        # Python dictionaries evaluate to true when not empty
        if not missing_rate_codes:
            self.service_check('aws_pricing.status', self.OK)
        else:
            message = 'Pricing data not found for these service rate codes: {}'.format(
                dict(missing_rate_codes))
            self.service_check('aws_pricing.status', self.WARNING, message=message)
    except ClientError as client_error:
        self.service_check('aws_pricing.status', self.CRITICAL, message=str(client_error))
        raise CheckException('Pricing Service client error: {}'.format(str(client_error)))
def check(self, instance):
    url = instance.get('url', '')
    default_timeout = instance.get('default_timeout', 5)
    timeout = float(instance.get('timeout', default_timeout))
    tags = instance.get('tags', [])

    if not url:
        raise CheckException("Configuration error, please fix conf.yaml")

    try:
        r = requests.get(url, timeout=timeout)
    except requests.exceptions.Timeout as e:
        raise CheckException('URL: {0} timed out after {1} seconds.'.format(url, timeout))
    except requests.exceptions.ConnectionError as e:
        raise CheckException(e)

    if r.status_code != 200:
        raise CheckException('Invalid Status Code, {0} returned a status of {1}.'.format(url, r.status_code))

    try:
        stats = json.loads(r.text)
    except ValueError as e:
        raise CheckException('{0} returned an unserializable payload'.format(url))

    for key, val in stats.iteritems():
        if key in self.REPL_STATS:
            self.safe_submit_metric("riak_repl." + key, val, tags=tags)

    if stats['realtime_enabled'] is not None:
        for key, val in stats['realtime_queue_stats'].iteritems():
            if key in self.REALTIME_QUEUE_STATS:
                self.safe_submit_metric("riak_repl.realtime_queue_stats." + key, val, tags=tags)

    for c in stats['connected_clusters']:
        cluster = c.replace("-", "_")
        if c not in stats['fullsync_coordinator']:
            continue
        for key, val in stats['fullsync_coordinator'][c].iteritems():
            if key in self.FULLSYNC_COORDINATOR:
                self.safe_submit_metric("riak_repl.fullsync_coordinator." + cluster + "." + key, val, tags=tags)
def _get_tls_object(self, ssl_params):
    """
    Return a TLS object to establish a secure connection to a server
    """
    if ssl_params is None:
        return None

    if not ssl_params["verify"] and ssl_params["ca_certs"]:
        self.warning("Incorrect configuration: trying to disable server certificate validation, "
                     "while also specifying a capath. No validation will be performed. Fix your "
                     "configuration to remove this warning")

    validate = ssl.CERT_REQUIRED if ssl_params["verify"] else ssl.CERT_NONE

    if ssl_params["ca_certs"] is None or os.path.isfile(ssl_params["ca_certs"]):
        tls = ldap3.core.tls.Tls(
            local_private_key_file=ssl_params["key"],
            local_certificate_file=ssl_params["cert"],
            ca_certs_file=ssl_params["ca_certs"],
            version=ssl.PROTOCOL_SSLv23,
            validate=validate,
        )
    elif os.path.isdir(ssl_params["ca_certs"]):
        tls = ldap3.core.tls.Tls(
            local_private_key_file=ssl_params["key"],
            local_certificate_file=ssl_params["cert"],
            ca_certs_path=ssl_params["ca_certs"],
            version=ssl.PROTOCOL_SSLv23,
            validate=validate,
        )
    else:
        raise CheckException("Invalid path {} for ssl_ca_certs: no such file or directory"
                             .format(ssl_params["ca_certs"]))

    return tls
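# --- Illustrative ssl_params dict (not part of the original check) ---
# These are the keys _get_tls_object reads; the paths are placeholders.
example_ssl_params = {
    "verify": True,                         # CERT_REQUIRED when true, CERT_NONE otherwise
    "ca_certs": "/path/to/ca_bundle.pem",   # file or directory of CA certs; None skips it
    "key": "/path/to/client.key",           # client private key, may be None
    "cert": "/path/to/client.pem",          # client certificate, may be None
}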
def _create_gitlab_runner_prometheus_instance(self, instance, init_config):
    """
    Set up the gitlab_runner instance so it can be used in OpenMetricsBaseCheck
    """
    # Mapping from Prometheus metrics names to Datadog ones
    # For now it's a 1:1 mapping
    allowed_metrics = init_config.get('allowed_metrics')
    if allowed_metrics is None:
        raise CheckException(
            "At least one metric must be whitelisted in `allowed_metrics`."
        )

    gitlab_runner_instance = deepcopy(instance)
    # gitlab_runner uses 'prometheus_endpoint' and not 'prometheus_url', so we have to rename the key
    gitlab_runner_instance['prometheus_url'] = instance.get('prometheus_endpoint', None)

    gitlab_runner_instance.update({
        'namespace': 'gitlab_runner',
        'metrics': allowed_metrics,
        # Defaults that were set when gitlab_runner was based on PrometheusCheck
        'send_monotonic_counter': instance.get('send_monotonic_counter', False),
        'health_service_check': instance.get('health_service_check', False)
    })

    return gitlab_runner_instance
def check(self, instance):
    endpoint = instance.get('kube_state_url')
    if endpoint is None:
        raise CheckException("Unable to find kube_state_url in config file.")

    if 'labels_mapper' in instance:
        if isinstance(instance['labels_mapper'], dict):
            self.labels_mapper = instance['labels_mapper']
        else:
            self.log.warning("labels_mapper should be a dictionary")

    send_buckets = instance.get('send_histograms_buckets', True)
    # By default we send the buckets.
    if send_buckets is not None and str(send_buckets).lower() == 'false':
        send_buckets = False
    else:
        send_buckets = True

    self.custom_tags = instance.get('tags', [])
    if self.custom_tags is None:
        self.custom_tags = []

    # Job counters are monotonic: they increase at every run of the job
    # We want to send the delta via the `monotonic_count` method
    self.job_succeeded_count = defaultdict(int)
    self.job_failed_count = defaultdict(int)

    self.process(endpoint, send_histograms_buckets=send_buckets, instance=instance)

    for job_tags, job_count in self.job_succeeded_count.iteritems():
        self.monotonic_count(self.NAMESPACE + '.job.succeeded', job_count, list(job_tags))
    for job_tags, job_count in self.job_failed_count.iteritems():
        self.monotonic_count(self.NAMESPACE + '.job.failed', job_count, list(job_tags))
def check(self, instance):
    host, custom_tags, timeout, response_time = self._load_conf(instance)
    custom_tags.append("target_host:{}".format(host))

    try:
        lines = self._exec_ping(timeout, host)
        regex = re.compile(r"time=((\d|\.)*)")
        result = regex.findall(lines)
        if result:
            length = result[0][0]
        else:
            raise CheckException("No time= found ({})".format(lines))
    except CheckException as e:
        self.log.info("{} is DOWN ({})".format(host, str(e)))
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                           custom_tags, message=str(e))
        self.gauge(self.SERVICE_CHECK_NAME, 0, custom_tags)
        raise e

    if response_time:
        self.gauge("network.ping.response_time", length, custom_tags)

    self.log.debug("{} is UP".format(host))
    self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, custom_tags)
    self.gauge(self.SERVICE_CHECK_NAME, 1, custom_tags)
def _exec_ping(self, timeout, target_host):
    if platform.system() == "Windows":  # pragma: nocover
        countOption = "-n"
        timeoutOption = "-w"
        # The timeout option is in ms on Windows
        # https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/ping
        timeout = timeout * 1000
    elif platform.system() == "Darwin":
        countOption = "-c"
        timeoutOption = "-W"
        # Also in ms on Mac
        timeout = timeout * 1000
    else:
        # The timeout option is in seconds on Linux, leaving timeout as is
        # https://linux.die.net/man/8/ping
        countOption = "-c"
        timeoutOption = "-W"

    self.log.debug("Running: ping {} {} {} {} {}".format(
        countOption, "1", timeoutOption, str(timeout), target_host))

    lines, err, retcode = get_subprocess_output([
        "ping", countOption, "1", timeoutOption, str(timeout), target_host
    ], self.log, raise_on_empty_output=True)

    self.log.debug("ping returned {} - {} - {}".format(retcode, lines, err))
    if retcode != 0:
        raise CheckException("ping returned {}: {}".format(retcode, err))

    return lines
def check(self, instance):
    #### Metrics collection
    endpoint = instance.get('prometheus_endpoint')
    custom_tags = instance.get('tags', [])
    if endpoint is None:
        raise CheckException("Unable to find prometheus_endpoint in config file.")

    # By default we send the buckets
    send_buckets = _is_affirmative(instance.get('send_histograms_buckets', True))

    try:
        self.process(endpoint, send_histograms_buckets=send_buckets, instance=instance)
        self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME,
                           PrometheusCheck.OK, tags=custom_tags)
    except requests.exceptions.ConnectionError as e:
        # Unable to connect to the metrics endpoint
        self.service_check(
            self.PROMETHEUS_SERVICE_CHECK_NAME,
            PrometheusCheck.CRITICAL,
            message="Unable to retrieve Prometheus metrics from endpoint %s: %s" % (endpoint, e.message),
            tags=custom_tags)

    #### Service check to check whether the Runner can talk to the Gitlab master
    self._check_connectivity_to_master(instance, custom_tags)
def check(self, instance): self.kubelet_conn_info = get_connection_info() endpoint = self.kubelet_conn_info.get('url') if endpoint is None: raise CheckException( "Unable to find metrics_endpoint in config " "file or detect the kubelet URL automatically.") self.metrics_url = instance.get('metrics_endpoint') or urljoin( endpoint, CADVISOR_METRICS_PATH) self.kube_health_url = urljoin(endpoint, KUBELET_HEALTH_PATH) self.node_spec_url = urljoin(endpoint, NODE_SPEC_PATH) self.pod_list_url = urljoin(endpoint, POD_LIST_PATH) # By default we send the buckets. send_buckets = instance.get('send_histograms_buckets', True) if send_buckets is not None and str(send_buckets).lower() == 'false': send_buckets = False else: send_buckets = True try: self.pod_list = self.retrieve_pod_list() except Exception: self.pod_list = None instance_tags = instance.get('tags', []) self._perform_kubelet_check(instance_tags) self._report_node_metrics(instance_tags) self._report_pods_running(self.pod_list, instance_tags) self._report_container_spec_metrics(self.pod_list, instance_tags) self.process(self.metrics_url, send_histograms_buckets=send_buckets, instance=instance)
def check(self, instance): # Metrics collection endpoint = instance.get('prometheus_endpoint') if endpoint is None: raise CheckException( "Unable to find prometheus_endpoint in config file.") scraper_config = self.config_map[endpoint] custom_tags = instance.get('tags', []) try: self.process(scraper_config) self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.OK, tags=custom_tags) except requests.exceptions.ConnectionError as e: # Unable to connect to the metrics endpoint self.service_check( self.PROMETHEUS_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.CRITICAL, message= "Unable to retrieve Prometheus metrics from endpoint {}: {}". format(endpoint, e), tags=custom_tags, ) # Service check to check whether the Runner can talk to the Gitlab master self._check_connectivity_to_master(instance, custom_tags)
def check(self, instance): # Metrics collection endpoint = instance.get('prometheus_endpoint') if endpoint is None: raise CheckException( "Unable to find prometheus_endpoint in config file.") # By default we send the buckets send_buckets = _is_affirmative( instance.get('send_histograms_buckets', True)) custom_tags = instance.get('tags', []) try: self.process(endpoint, send_histograms_buckets=send_buckets, instance=instance) self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME, PrometheusCheck.OK, tags=custom_tags) except requests.exceptions.ConnectionError as e: # Unable to connect to the metrics endpoint self.service_check( self.PROMETHEUS_SERVICE_CHECK_NAME, PrometheusCheck.CRITICAL, message= "Unable to retrieve Prometheus metrics from endpoint {}: {}". format(endpoint, e.message), tags=custom_tags, ) # Service check to check Gitlab's health endpoints for check_type in self.ALLOWED_SERVICE_CHECKS: self._check_health_endpoint(instance, check_type, custom_tags)
def check(self, instance): # Metrics collection endpoint = instance.get('prometheus_endpoint') if endpoint is None: raise CheckException( "Unable to find prometheus_endpoint in config file.") scraper_config = self.config_map[endpoint] custom_tags = instance.get('tags', []) try: self.process(scraper_config) self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.OK, tags=custom_tags) except requests.exceptions.ConnectionError as e: # Unable to connect to the metrics endpoint self.service_check( self.PROMETHEUS_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.CRITICAL, message= "Unable to retrieve Prometheus metrics from endpoint {}: {}". format(endpoint, e), tags=custom_tags, ) # Service check to check Gitlab's health endpoints for check_type in self.ALLOWED_SERVICE_CHECKS: self._check_health_endpoint(instance, check_type, custom_tags) self.submit_version(instance)
def _check_health_endpoint(self, instance, check_type, tags):
    if check_type not in self.ALLOWED_SERVICE_CHECKS:
        raise CheckException("Health endpoint {} is not a valid endpoint".format(check_type))

    url = instance.get('gitlab_url')
    if url is None:
        # Simply ignore this service check if not configured
        self.log.debug("gitlab_url not configured, service check %s skipped", check_type)
        return

    service_check_tags = self._service_check_tags(url)
    service_check_tags.extend(tags)

    # These define which endpoint is hit and which type of check is actually performed
    # TODO: parse errors and report for single sub-service failure?
    service_check_name = 'gitlab.{}'.format(check_type)
    check_url = '{}/-/{}'.format(url, check_type)

    try:
        self.log.debug("checking %s against %s", check_type, check_url)
        r = self.http.get(check_url)
        if r.status_code != 200:
            self.service_check(
                service_check_name,
                OpenMetricsBaseCheck.CRITICAL,
                message="Got {} when hitting {}".format(r.status_code, check_url),
                tags=service_check_tags,
            )
            raise Exception("Http status code {} on check_url {}".format(r.status_code, check_url))
        else:
            r.raise_for_status()
    except requests.exceptions.Timeout:
        # If there's a timeout
        self.service_check(
            service_check_name,
            OpenMetricsBaseCheck.CRITICAL,
            message="Timeout when hitting {}".format(check_url),
            tags=service_check_tags,
        )
        raise
    except Exception as e:
        self.service_check(
            service_check_name,
            OpenMetricsBaseCheck.CRITICAL,
            message="Error hitting {}. Error: {}".format(check_url, e),
            tags=service_check_tags,
        )
        raise
    else:
        self.service_check(service_check_name, OpenMetricsBaseCheck.OK, tags=service_check_tags)

    self.log.debug("gitlab check %s succeeded", check_type)
def _get_json(self, url, timeout, verify, tags=None):
    tags = tags + ["url:%s" % url] if tags else ["url:%s" % url]
    msg = None
    status = None
    try:
        r = requests.get(url, timeout=timeout, verify=verify)
        if r.status_code != 200:
            status = AgentCheck.CRITICAL
            msg = "Got %s when hitting %s" % (r.status_code, url)
        else:
            status = AgentCheck.OK
            msg = "Mesos master instance detected at %s " % url
    except requests.exceptions.Timeout:
        # If there's a timeout
        msg = "%s seconds timeout when hitting %s" % (timeout, url)
        status = AgentCheck.CRITICAL
    except Exception as e:
        msg = str(e)
        status = AgentCheck.CRITICAL
    finally:
        self.log.debug('Request to url : {0}, timeout: {1}, message: {2}'.format(url, timeout, msg))
        if self.service_check_needed:
            self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
            self.service_check_needed = False
        if status is AgentCheck.CRITICAL:
            raise CheckException('Cannot connect to mesos. Error: {0}'.format(msg))

    if r.encoding is None:
        r.encoding = 'UTF8'

    return r.json()
def check(self, instance): """ Process the istio_mesh, process_mixer, pilot, and galley endpoints associated with this instance. All the endpoints themselves are optional, but at least one must be passed. """ processed = False # Get the config for the istio_mesh instance istio_mesh_endpoint = instance.get('istio_mesh_endpoint') if istio_mesh_endpoint: istio_mesh_config = self.config_map[istio_mesh_endpoint] # Process istio_mesh self.process(istio_mesh_config) processed = True # Get the config for the process_mixer instance process_mixer_endpoint = instance.get('mixer_endpoint') if process_mixer_endpoint: process_mixer_config = self.config_map[process_mixer_endpoint] # Process process_mixer self.process(process_mixer_config) processed = True # Get the config for the process_pilot instance process_pilot_endpoint = instance.get('pilot_endpoint') if process_pilot_endpoint: process_pilot_config = self.config_map[process_pilot_endpoint] # Process process_pilot self.process(process_pilot_config) processed = True # Get the config for the process_galley instance process_galley_endpoint = instance.get('galley_endpoint') if process_galley_endpoint: process_galley_config = self.config_map[process_galley_endpoint] # Process process_galley self.process(process_galley_config) processed = True # Get the config for the process_citadel instance process_citadel_endpoint = instance.get('citadel_endpoint') if process_citadel_endpoint: process_citadel_config = self.config_map[process_citadel_endpoint] # Process process_citadel self.process(process_citadel_config) processed = True # Check that at least 1 endpoint is configured if not processed: raise CheckException( "At least one of Mixer, Mesh, Pilot, Galley or Citadel endpoints must be configured" )
def _create_process_mixer_instance(self, instance):
    """
    Grab the mixer scraper from the dict and return it if it exists,
    otherwise create the scraper and add it to the dict
    """
    endpoint = instance.get('mixer_endpoint')
    if endpoint is None:
        raise CheckException("Unable to find mixer_endpoint in config file.")

    process_mixer_instance = deepcopy(instance)
    process_mixer_instance.update(
        {
            'namespace': self.MIXER_NAMESPACE,
            'prometheus_url': endpoint,
            'metrics': [
                {
                    # Pre 1.1 metrics
                    'grpc_server_handled_total': 'grpc.server.handled_total',
                    'grpc_server_handling_seconds': 'grpc.server.handling_seconds',
                    'grpc_server_msg_received_total': 'grpc.server.msg_received_total',
                    'grpc_server_msg_sent_total': 'grpc.server.msg_sent_total',
                    'grpc_server_started_total': 'grpc.server.started_total',
                    'mixer_adapter_dispatch_count': 'adapter.dispatch_count',
                    'mixer_adapter_dispatch_duration': 'adapter.dispatch_duration',
                    'mixer_adapter_old_dispatch_count': 'adapter.old_dispatch_count',
                    'mixer_adapter_old_dispatch_duration': 'adapter.old_dispatch_duration',
                    'mixer_config_resolve_actions': 'config.resolve_actions',
                    'mixer_config_resolve_count': 'config.resolve_count',
                    'mixer_config_resolve_duration': 'config.resolve_duration',
                    'mixer_config_resolve_rules': 'config.resolve_rules',
                    # 1.1 metrics
                    'grpc_io_server_completed_rpcs': 'grpc_io_server.completed_rpcs',
                    'grpc_io_server_received_bytes_per_rpc': 'grpc_io_server.received_bytes_per_rpc',
                    'grpc_io_server_sent_bytes_per_rpc': 'grpc_io_server.sent_bytes_per_rpc',
                    'grpc_io_server_server_latency': 'grpc_io_server.server_latency',
                    'mixer_config_attributes_total': 'config.attributes_total',
                    'mixer_config_handler_configs_total': 'config.handler_configs_total',
                    'mixer_config_instance_configs_total': 'config.instance_configs_total',
                    'mixer_config_rule_configs_total': 'config.rule_configs_total',
                    'mixer_dispatcher_destinations_per_request': 'dispatcher.destinations_per_request',
                    'mixer_dispatcher_instances_per_request': 'dispatcher.instances_per_request',
                    'mixer_handler_daemons_total': 'handler.daemons_total',
                    'mixer_handler_new_handlers_total': 'handler.new_handlers_total',
                    'mixer_mcp_sink_reconnections': 'mcp_sink.reconnections',
                    'mixer_mcp_sink_request_acks_total': 'mcp_sink.request_acks_total',
                    'mixer_runtime_dispatches_total': 'runtime.dispatches_total',
                    'mixer_runtime_dispatch_duration_seconds': 'runtime.dispatch_duration_seconds',
                }
            ],
            # Defaults that were set when istio was based on PrometheusCheck
            'send_monotonic_counter': instance.get('send_monotonic_counter', False),
            'health_service_check': instance.get('health_service_check', False),
        }
    )
    process_mixer_instance['metrics'][0].update(self._get_generic_metrics())

    return process_mixer_instance
def check(self, instance):
    if self.kube_apiserver_config is None:
        kube_apiserver_config = self._create_kube_apiserver_metrics_instance(instance)
        self.kube_apiserver_config = self.get_scraper_config(kube_apiserver_config)

    if not self.kube_apiserver_config['metrics_mapper']:
        url = self.kube_apiserver_config['prometheus_url']
        raise CheckException("You have to collect at least one metric from the endpoint: {}".format(url))

    self.process(self.kube_apiserver_config, metric_transformers=self.metric_transformers)
def check(self, instance): self.kubelet_conn_info = get_connection_info() endpoint = self.kubelet_conn_info.get('url') if endpoint is None: raise CheckException( "Unable to find metrics_endpoint in config " "file or detect the kubelet URL automatically.") self.metrics_url = instance.get( 'metrics_endpoint', urljoin(endpoint, CADVISOR_METRICS_PATH)) self.kube_health_url = urljoin(endpoint, KUBELET_HEALTH_PATH) self.node_spec_url = urljoin(endpoint, NODE_SPEC_PATH) self.pod_list_url = urljoin(endpoint, POD_LIST_PATH) # Legacy cadvisor support try: self.cadvisor_legacy_url = self.detect_cadvisor( endpoint, self.cadvisor_legacy_port) except Exception as e: self.log.debug( 'cAdvisor not found, running in prometheus mode: %s' % str(e)) # By default we send the buckets. send_buckets = instance.get('send_histograms_buckets', True) if send_buckets is not None and str(send_buckets).lower() == 'false': send_buckets = False else: send_buckets = True try: self.pod_list = self.retrieve_pod_list() if self.pod_list.get("items") is None: # Sanitize input: if no pod are running, 'items' is a NoneObject self.pod_list['items'] = [] except Exception: self.pod_list = None self.container_filter = ContainerFilter(self.pod_list) self.instance_tags = instance.get('tags', []) self._perform_kubelet_check(self.instance_tags) self._report_node_metrics(self.instance_tags) self._report_pods_running(self.pod_list, self.instance_tags) self._report_container_spec_metrics(self.pod_list, self.instance_tags) if self.cadvisor_legacy_url: # Legacy cAdvisor self.process_cadvisor(instance, self.cadvisor_legacy_url, self.pod_list, self.container_filter) elif self.metrics_url: # Prometheus self.process(self.metrics_url, send_histograms_buckets=send_buckets, instance=instance) # Free up memory self.pod_list = None self.container_filter = None
def _load_conf(self, instance):
    # Fetches the conf
    timeout = float(instance.get("timeout", 4))
    response_time = instance.get("collect_response_time", False)
    custom_tags = instance.get("tags", [])

    host = instance.get("host", None)
    if host is None:
        raise CheckException("A valid host must be specified")

    return host, custom_tags, timeout, response_time
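# --- Illustrative instance config (not part of the original check) ---
# A Python view of the keys _load_conf reads; all values are placeholders.
example_ping_instance = {
    "host": "example.org",           # required, otherwise CheckException is raised
    "timeout": 4,                    # seconds, converted with float(); defaults to 4
    "collect_response_time": True,   # also submit network.ping.response_time
    "tags": ["env:example"],         # extra tags added to every metric and service check
}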
def __init__(self, name, init_config, agentConfig, instances=None):
    super(GitlabRunnerCheck, self).__init__(name, init_config, agentConfig, instances)

    # Mapping from Prometheus metrics names to Datadog ones
    # For now it's a 1:1 mapping
    # TODO: mark some metrics as rate
    allowed_metrics = init_config.get('allowed_metrics')

    if not allowed_metrics:
        raise CheckException("At least one metric must be whitelisted in `allowed_metrics`.")

    self.metrics_mapper = dict(zip(allowed_metrics, allowed_metrics))
    self.NAMESPACE = 'gitlab_runner'
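# --- Illustrative init_config (not part of the original check) ---
# The constructor above expects `allowed_metrics` in init_config and builds a
# 1:1 metrics_mapper from it. The metric names here are placeholders, not
# actual GitLab Runner metric names.
example_init_config = {
    'allowed_metrics': [
        'example_runner_metric_one',
        'example_runner_metric_two',
    ],
}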
def check(self, instance):
    host = instance.get('host')
    port = instance.get('port', '8080')
    path = instance.get('path', '/health')

    if not host:
        self.warning("Configuration error, please fix traefik.yaml")
        raise CheckException("Configuration error, please fix traefik.yaml")

    try:
        url = 'http://{}:{}{}'.format(host, port, path)
        response = requests.get(url)
        response_status_code = response.status_code

        if response_status_code == 200:
            self.service_check('traefik.health', self.OK)
            payload = response.json()

            if 'total_status_code_count' in payload:
                values = payload['total_status_code_count']
                for status_code in values:
                    self.gauge('traefik.total_status_code_count',
                               values[status_code],
                               ['status_code:' + status_code])
            else:
                self.log.warning('Field total_status_code_count not found in response.')

            if 'total_count' in payload:
                self.gauge('traefik.total_count', payload['total_count'])
            else:
                self.log.warning('Field total_count not found in response.')
        else:
            self.service_check('traefik.health', self.CRITICAL,
                               message="Traefik health check return code is not 200")
    except requests.exceptions.ConnectionError:
        self.service_check('traefik.health', self.CRITICAL,
                           message="Traefik endpoint unreachable")
    except Exception as e:
        self.service_check('traefik.health', self.UNKNOWN,
                           message="UNKNOWN exception: " + str(e))
def check(self, instance):
    self.metric_count = 0
    self.services_up = 0
    instance_tags = instance.get('tags', [])

    consumer = instance.get('consumer')
    if not consumer:
        raise CheckException("The consumer must be specified in the configuration.")

    url = self.URL + '?consumer=' + consumer
    try:
        json = self._get_metrics_json(url)
        if 'services' not in json:
            self.service_check(
                self.METRICS_SERVICE_CHECK, AgentCheck.WARNING, tags=instance_tags,
                message="No services in response from metrics proxy on {}".format(url))
            return

        for service in json['services']:
            service_name = service['name']
            self._report_service_status(instance_tags, service_name, service)
            for metrics in service['metrics']:
                self._emit_metrics(service_name, metrics, instance_tags)

        self.log.info("Forwarded {} metrics to hq for {} services".format(
            self.metric_count, self.services_up))
        self.service_check(
            self.METRICS_SERVICE_CHECK, AgentCheck.OK, tags=instance_tags,
            message="Metrics collected successfully for consumer {}".format(consumer))
    except Timeout as e:
        self._report_metrics_error(
            "Timed out connecting to Vespa's node metrics api: {}".format(e),
            AgentCheck.CRITICAL, instance_tags)
    except (HTTPError, InvalidURL, ConnectionError) as e:
        self._report_metrics_error(
            "Could not connect to Vespa's node metrics api: {}".format(e),
            AgentCheck.CRITICAL, instance_tags)
    except JSONDecodeError as e:
        self._report_metrics_error(
            "Error parsing JSON from Vespa's node metrics api: {}".format(e),
            AgentCheck.CRITICAL, instance_tags)
    except Exception as e:
        self._report_metrics_error("Unexpected error: {}".format(e),
                                   AgentCheck.WARNING, instance_tags)
def check(self, instance):
    socket = instance.get('socket')
    server = instance.get('url')
    options = instance.get('options', {})
    username = instance.get('username')
    password = instance.get('password')

    if not server and not socket:
        raise InvalidConfigError('Either "url" or "socket" must be configured')

    if socket:
        server = 'unix'
        port = socket
        connection_server = "{}".format(port)
    else:
        port = int(instance.get('port', self.DEFAULT_PORT))
        connection_server = "{}:{}".format(server, port)
    custom_tags = instance.get('tags') or []

    mc = None  # client
    tags = ["url:{0}:{1}".format(server, port)] + custom_tags
    service_check_tags = ["host:%s" % server, "port:%s" % port] + custom_tags

    try:
        self.log.debug("Connecting to %s, tags:%s", connection_server, tags)
        mc = bmemcached.Client(connection_server, username, password)

        self._get_metrics(mc, tags, service_check_tags)
        if options:
            # setting specific handlers
            self.OPTIONAL_STATS["items"][2] = Memcache.get_items_stats
            self.OPTIONAL_STATS["slabs"][2] = Memcache.get_slabs_stats
            self._get_optional_metrics(mc, tags, options)
    except BadResponseError as e:
        self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL,
                           tags=service_check_tags,
                           message="Unable to fetch stats from server")
        raise CheckException(
            "Unable to retrieve stats from memcache instance: {}:{}. "
            "Please check your configuration. ({})".format(server, port, e))

    if mc is not None:
        mc.disconnect_all()
        self.log.debug("Disconnected from memcached")
    del mc
def check(self, instance): endpoint = instance.get('prometheus_endpoint') if endpoint is None: raise CheckException("Unable to find prometheus_endpoint in config file.") self.set_prometheus_timeout(instance) send_buckets = instance.get('send_histograms_buckets', True) # By default we send the buckets. if send_buckets is not None and str(send_buckets).lower() == 'false': send_buckets = False else: send_buckets = True self.process(endpoint, send_histograms_buckets=send_buckets, instance=instance)