def discover_api(self):
    # type: () -> Tuple[str, str]
    self.log.info("Discovering Cloud Foundry API version and authentication endpoint")
    try:
        res = self.http.get(self._api_url)
    except RequestException:
        self.log.exception("Error connecting to the API server")
        raise
    try:
        res.raise_for_status()
    except HTTPError:
        self.log.exception("Error querying API information: response: %s", res.text)
        raise
    try:
        payload = res.json()
    except ValueError:
        self.log.exception("Error decoding API information: response: %s", res.text)
        raise
    links = payload.get("links")
    if not links:
        raise CheckException("Unable to inspect API information from payload {}".format(payload))
    api_v3_version = "0.0.0"
    try:
        api_v3_version = links["cloud_controller_v3"]["meta"]["version"]
    except Exception:
        self.log.debug("cloud_controller_v3 information not found, defaulting to v2")
    try:
        uaa_url = links["uaa"]["href"]
    except Exception:
        raise CheckException("Unable to collect API version and/or UAA URL from links {}".format(links))
    api_version = "v2"
    if semver.parse_version_info(api_v3_version) >= MIN_V3_VERSION:
        api_version = "v3"
    self.log.info("Discovered API `%s` and UAA URL `%s`", api_version, uaa_url)
    return api_version, uaa_url

def _create_gitlab_runner_prometheus_instance(self, instance, init_config):
    """
    Set up the gitlab_runner instance so it can be used in OpenMetricsBaseCheck
    """
    # Mapping from Prometheus metrics names to Datadog ones
    # For now it's a 1:1 mapping
    allowed_metrics = init_config.get('allowed_metrics')
    if allowed_metrics is None:
        raise CheckException("At least one metric must be whitelisted in `allowed_metrics`.")

    # Users may want to only report the version
    # OpenMetricsCheck doesn't allow the metadata_metric_name to be one of the metrics
    if 'ci_runner_version_info' in allowed_metrics:
        allowed_metrics.remove('ci_runner_version_info')

    gitlab_runner_instance = deepcopy(instance)
    # gitlab_runner uses 'prometheus_endpoint' and not 'prometheus_url', so we have to rename the key
    gitlab_runner_instance['prometheus_url'] = instance.get('prometheus_endpoint', None)
    gitlab_runner_instance.update(
        {
            'namespace': 'gitlab_runner',
            'metrics': allowed_metrics,
            # Defaults that were set when gitlab_runner was based on PrometheusCheck
            'send_monotonic_counter': instance.get('send_monotonic_counter', False),
            'health_service_check': instance.get('health_service_check', False),
            'metadata_metric_name': 'ci_runner_version_info',
            'metadata_label_map': {'version': 'version'},
        }
    )

    return gitlab_runner_instance

def _run_socket_commands(parsed_url, commands):
    if parsed_url.scheme == 'tcp':
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        splitted_loc = parsed_url.netloc.split(':')
        host = splitted_loc[0]
        port = int(splitted_loc[1])
        sock.connect((host, port))
    else:
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.connect(parsed_url.path)

    sock.send(b';'.join(commands) + b"\r\n")

    response = ""
    output = sock.recv(BUFSIZE)
    while output:
        response += output.decode("ASCII")
        output = sock.recv(BUFSIZE)
    sock.close()

    responses = response.split('\n\n')
    if len(responses) != len(commands) + 1 or responses[-1] != '':
        raise CheckException("Got a different number of responses than expected")

    return tuple(r.splitlines() for r in responses[:len(commands)])

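# Illustrative usage of _run_socket_commands (the address and command names
# below are hypothetical, not a documented endpoint). The daemon is expected
# to answer each semicolon-joined command with a block terminated by a blank
# line ('\n\n'); each element of the returned tuple is the list of response
# lines for the matching command:
#
#     from urllib.parse import urlparse
#     stats_lines, version_lines = _run_socket_commands(
#         urlparse('tcp://localhost:8125'), (b'stats', b'version')
#     )
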
def check(self, instance):
    # Metrics collection
    endpoint = instance.get('prometheus_endpoint')
    if endpoint is None:
        raise CheckException("Unable to find prometheus_endpoint in config file.")

    scraper_config = self.config_map[endpoint]
    custom_tags = instance.get('tags', [])

    try:
        self.process(scraper_config)
        self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.OK, tags=custom_tags)
    except requests.exceptions.ConnectionError as e:
        # Unable to connect to the metrics endpoint
        self.service_check(
            self.PROMETHEUS_SERVICE_CHECK_NAME,
            OpenMetricsBaseCheck.CRITICAL,
            message="Unable to retrieve Prometheus metrics from endpoint {}: {}".format(endpoint, e),
            tags=custom_tags,
        )

    # Service check to check whether the Runner can talk to the Gitlab master
    self._check_connectivity_to_master(instance, custom_tags)

def check(self, instance):
    host, custom_tags, timeout, response_time = self._load_conf(instance)
    custom_tags.append("target_host:{}".format(host))

    try:
        lines = self._exec_ping(timeout, host)
        regex = re.compile(r"time[<=]((\d|\.)*)")
        result = regex.findall(lines)
        if result:
            length = result[0][0]
        else:
            raise CheckException("No time= found ({})".format(lines))
    except CheckException as e:
        self.log.info("%s is DOWN (%s)", host, e)
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, custom_tags, message=str(e))
        self.gauge(self.SERVICE_CHECK_NAME, 0, custom_tags)
        raise e

    if response_time:
        self.gauge("network.ping.response_time", length, custom_tags)

    self.log.debug("%s is UP", host)
    self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, custom_tags)
    self.gauge(self.SERVICE_CHECK_NAME, 1, custom_tags)

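# The regex above extracts the latency from ping output such as
# "64 bytes from 10.0.0.1: icmp_seq=1 ttl=64 time=11.9 ms" (the sample line is
# illustrative). Because the pattern has two groups, findall returns a list of
# tuples, e.g. [('11.9', '9')], so result[0][0] is the full matched number.
# The 'time<' form covers Windows output like "time<1ms".
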
def __init__(self, name, init_config, instances):
    super(MaprCheck, self).__init__(name, init_config, instances)
    self._conn = None
    self.hostname = self.instance.get('hostname', get_fqdn())
    self.streams_count = self.instance.get('streams_count', 1)
    self.topic_path = "{stream_path}/{stream_id}:{topic_name}".format(
        stream_path=self.instance.get('stream_path', DEFAULT_STREAM_PATH),
        stream_id=get_stream_id_for_topic(self.hostname, rng=self.streams_count),
        topic_name=self.hostname,
    )
    self.allowed_metrics = [re.compile(w) for w in self.instance.get('metric_whitelist', [])]
    self.custom_tags = self.instance.get('tags', [])
    self.has_ever_submitted_metrics = False
    self._disable_legacy_cluster_tag = is_affirmative(self.instance.get('disable_legacy_cluster_tag', False))

    self.auth_ticket = self.instance.get('ticket_location', os.environ.get(TICKET_LOCATION_ENV_VAR))
    if not self.auth_ticket:
        self.log.warning(
            "Neither `ticket_location` (in the config.yaml) nor the %s environment variable is set. This will "
            "cause authentication issues if your cluster requires authenticated requests.",
            TICKET_LOCATION_ENV_VAR,
        )
    elif not os.access(self.auth_ticket, os.R_OK):
        # CheckException does not lazily interpolate arguments the way loggers
        # do, so the message must be formatted eagerly.
        raise CheckException(
            "MapR authentication ticket located at {} is not readable by the dd-agent "
            "user. Please update the file permissions.".format(self.auth_ticket)
        )
    else:
        os.environ[TICKET_LOCATION_ENV_VAR] = self.auth_ticket

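# Example of the resulting topic path (the stream path and hostname are
# hypothetical; get_stream_id_for_topic is assumed to hash the hostname into
# the range [0, streams_count)):
#   /var/mapr/mapr.monitoring/metricstreams/0:myhost.example.com
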
def _create_gitlab_runner_prometheus_instance(self, instance, init_config):
    """
    Set up the gitlab_runner instance so it can be used in OpenMetricsBaseCheck
    """
    # Mapping from Prometheus metrics names to Datadog ones
    # For now it's a 1:1 mapping
    allowed_metrics = init_config.get('allowed_metrics')
    if allowed_metrics is None:
        raise CheckException("At least one metric must be whitelisted in `allowed_metrics`.")

    gitlab_runner_instance = deepcopy(instance)
    # gitlab_runner uses 'prometheus_endpoint' and not 'prometheus_url', so we have to rename the key
    gitlab_runner_instance['prometheus_url'] = instance.get('prometheus_endpoint', None)
    gitlab_runner_instance.update(
        {
            'namespace': 'gitlab_runner',
            'metrics': allowed_metrics,
            # Defaults that were set when gitlab_runner was based on PrometheusCheck
            'send_monotonic_counter': instance.get('send_monotonic_counter', False),
            'health_service_check': instance.get('health_service_check', False),
        }
    )

    return gitlab_runner_instance

def check(self, instance):
    # Metrics collection
    endpoint = instance.get('prometheus_url', instance.get('prometheus_endpoint'))
    if endpoint is None:
        raise CheckException("Unable to find `prometheus_url` or `prometheus_endpoint` in config file.")

    scraper_config = self.config_map[endpoint]

    try:
        self.process(scraper_config)
        self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME, OpenMetricsBaseCheck.OK, self._tags)
    except requests.exceptions.ConnectionError as e:
        # Unable to connect to the metrics endpoint
        self.service_check(
            self.PROMETHEUS_SERVICE_CHECK_NAME,
            OpenMetricsBaseCheck.CRITICAL,
            message="Unable to retrieve Prometheus metrics from endpoint {}: {}".format(endpoint, e),
        )

    # Service check to check Gitlab's health endpoints
    for check_type in self.ALLOWED_SERVICE_CHECKS:
        self._check_health_endpoint(instance, check_type)

    self.submit_version(instance)

def _exec_ping(self, timeout, target_host):
    if platform.system() == "Windows":  # pragma: nocover
        countOption = "-n"
        timeoutOption = "-w"
        # The timeout option is in ms on Windows
        # https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/ping
        timeout = timeout * 1000
    elif platform.system() == "Darwin":
        countOption = "-c"
        timeoutOption = "-W"
        # Also in ms on Mac
        timeout = timeout * 1000
    else:
        # The timeout option is in seconds on Linux, leaving timeout as is
        # https://linux.die.net/man/8/ping
        countOption = "-c"
        timeoutOption = "-W"

    self.log.debug("Running: ping %s %s %s %s %s", countOption, "1", timeoutOption, timeout, target_host)

    lines, err, retcode = get_subprocess_output(
        ["ping", countOption, "1", timeoutOption, str(timeout), target_host],
        self.log,
        raise_on_empty_output=True,
    )
    self.log.debug("ping returned %s - %s - %s", retcode, lines, err)
    if retcode != 0:
        raise CheckException("ping returned {}: {}".format(retcode, err))

    return lines

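# For reference, the assembled commands look like this (host and timeout are
# example values, starting from timeout=4):
#   Windows: ping -n 1 -w 4000 example.org   (timeout converted to ms)
#   macOS:   ping -c 1 -W 4000 example.org   (timeout converted to ms)
#   Linux:   ping -c 1 -W 4 example.org      (timeout kept in seconds)
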
def check(self, _):
    service_check_tags = ['dir_name:{}'.format(self._config.name)]
    service_check_tags.extend(self._config.tags)

    if not exists(self._config.abs_directory):
        msg = (
            "Either directory '{}' doesn't exist or the Agent doesn't "
            "have permissions to access it, skipping.".format(self._config.abs_directory)
        )
        # report missing directory
        self.service_check(
            name=SERVICE_DIRECTORY_EXISTS, status=self.WARNING, tags=service_check_tags, message=msg
        )
        # raise exception if `ignore_missing` is False
        if not self._config.ignore_missing:
            raise CheckException(msg)

        self.log.warning(msg)
        # return gracefully, nothing to look for
        return

    self.service_check(name=SERVICE_DIRECTORY_EXISTS, tags=service_check_tags, status=self.OK)
    self._get_stats()

def _get_json(self, url, path, tags):
    try:
        r = self._perform_request(url, path)
    except requests.exceptions.Timeout:
        self.service_check(
            self.SERVICE_CHECK_NAME,
            self.CRITICAL,
            message='Timeout when hitting {}'.format(url),
            tags=tags + ['url:{}'.format(url)],
        )
        raise
    except Exception as e:
        self.service_check(
            self.SERVICE_CHECK_NAME,
            self.CRITICAL,
            message='Error hitting {}. Error: {}'.format(url, str(e)),
            tags=tags + ['url:{}'.format(url)],
        )
        raise

    if r.status_code != 200:
        self.service_check(
            self.SERVICE_CHECK_NAME,
            self.CRITICAL,
            message='Got {} when hitting {}'.format(r.status_code, url),
            tags=tags + ['url:{}'.format(url)],
        )
        raise CheckException('Http status code {} on url {}'.format(r.status_code, url))

    return r.json()

def get_schema_field(descriptors):
    # type: (List[Tuple[Any, str]]) -> str
    """Return column containing the schema name for that query."""
    for column, name in descriptors:
        if name == 'schema':
            return column
    raise CheckException("The descriptors are missing a schema field")

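# Minimal illustration with hypothetical descriptors: each entry pairs a
# column object with its name, and the helper returns the column whose name
# is 'schema'.
#
#     descriptors = [('col0', 'schema'), ('col1', 'count')]
#     get_schema_field(descriptors)  # -> 'col0'
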
def check(self, _):
    kubelet_conn_info = get_connection_info()
    endpoint = kubelet_conn_info.get('url')
    if endpoint is None:
        raise CheckException(
            "Unable to detect the kubelet URL automatically: " + kubelet_conn_info.get('err', '')
        )

    self.pod_list_url = endpoint.strip("/") + POD_LIST_PATH
    self.kubelet_credentials = KubeletCredentials(kubelet_conn_info)

    if self.fargate_mode:
        pod_list = self.retrieve_pod_list()
        for pod in pod_list.get('items', []):
            pod_id = pod.get('metadata', {}).get('uid')
            tagger_tags = tagger.tag('kubernetes_pod_uid://%s' % pod_id, tagger.ORCHESTRATOR) or []
            tagger_tags.extend(self.tags)
            tags = set(tagger_tags)

            # Submit the heartbeat metric for fargate virtual nodes.
            self.gauge(self.NAMESPACE + '.pods.running', 1, tags)

            # Fall back to an empty dict so pods without annotations don't
            # raise a TypeError on the membership test below.
            pod_annotations = pod.get('metadata', {}).get('annotations') or {}
            if CAPACITY_ANNOTATION_KEY not in pod_annotations:
                continue

            cpu_val, mem_val = extract_resource_values(pod_annotations.get(CAPACITY_ANNOTATION_KEY))
            if cpu_val == 0 or mem_val == 0:
                continue

            self.gauge(self.NAMESPACE + '.cpu.capacity', cpu_val, tags)
            self.gauge(self.NAMESPACE + '.memory.capacity', mem_val, tags)

def check(self, instance):
    """Main method"""
    endpoints_def = instance.get('endpoints')
    if not endpoints_def:
        raise CheckException('The list of metric endpoints is empty')
    if not isinstance(endpoints_def, (list, tuple)):
        raise CheckException('Incorrect value specified for the list of metric endpoints')

    metric_def = self.init_config.get('metric_definitions', ALL_METRICS)

    for endpoint in endpoints_def:
        metrics = metric_def.get(endpoint)
        if metrics is None:
            raise CheckException('Unknown metric endpoint: {}'.format(endpoint))
        self.check_endpoint(instance, endpoint, metrics)

def _run_socket_commands(parsed_url, commands):
    if parsed_url.scheme == 'tcp':
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        splitted_loc = parsed_url.netloc.split(':')
        host = splitted_loc[0]
        port = int(splitted_loc[1])
        sock.connect((host, port))
    else:
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.connect(parsed_url.path)

    sock.send(b';'.join(commands) + b"\r\n")

    response = ""
    output = sock.recv(BUFSIZE)
    while output:
        response += output.decode("ASCII")
        output = sock.recv(BUFSIZE)
    sock.close()

    responses = [r.strip() for r in response.split('\n\n') if r.strip()]
    if len(responses) != len(commands):
        raise CheckException("Expected {} responses, got {}".format(len(commands), len(responses)))

    return tuple(r.splitlines() for r in responses)

def __init__(self, name, init_config, instances):
    super(AerospikeCheck, self).__init__(name, init_config, instances)

    if not aerospike:
        msg = 'The `aerospike` client is not installed: {}'.format(aerospike_exception)
        self.log.error(msg)
        raise CheckException(msg)

    # https://www.aerospike.com/apidocs/python/aerospike.html#aerospike.client
    host = self.instance.get('host', 'localhost')
    port = int(self.instance.get('port', 3000))
    tls_name = self.instance.get('tls_name')
    self._host = (host, port, tls_name) if tls_name else (host, port)
    self._tls_config = self.instance.get('tls_config')
    if self._tls_config:
        self._tls_config['enable'] = True

    # https://www.aerospike.com/apidocs/python/client.html#aerospike.Client.connect
    self._username = self.instance.get('username')
    self._password = self.instance.get('password')

    # In milliseconds, see https://www.aerospike.com/apidocs/python/client.html#aerospike-info-policies
    timeout = int(self.instance.get('timeout', 10)) * 1000
    self._info_policies = {'timeout': timeout}

    self._metrics = set(self.instance.get('metrics', []))
    self._namespace_metrics = set(self.instance.get('namespace_metrics', []))
    self._required_namespaces = self.instance.get('namespaces')
    self._datacenter_metrics = set(self.instance.get('datacenter_metrics', []))
    self._required_datacenters = self.instance.get('datacenters')
    self._rate_metrics = set(self.init_config.get('mappings', []))
    self._tags = self.instance.get('tags', [])

    # We'll connect on the first check run
    self._client = None

def fetch_all_values(cls, cursor, counters_list, logger, databases=None):
    # special case since this table is specific to databases, need to run query for each database instance
    rows = []
    columns = []

    if databases is None:
        databases = []

    # This can return None in some implementations, so it cannot be chained
    cursor.execute('select DB_NAME()')
    data = cursor.fetchall()
    current_db = data[0][0]
    logger.debug("%s: current db is %s", cls.__name__, current_db)

    for db in databases:
        # use statements need to be executed separate from select queries
        ctx = construct_use_statement(db)
        try:
            logger.debug("%s: changing cursor context via use statement: %s", cls.__name__, ctx)
            cursor.execute(ctx)
            logger.debug("%s: fetch_all executing query: %s", cls.__name__, cls.QUERY_BASE)
            cursor.execute(cls.QUERY_BASE)
            data = cursor.fetchall()
        except Exception as e:
            logger.warning("Error when trying to query db %s - skipping. Error: %s", db, e)
            continue

        query_columns = ['database'] + [i[0] for i in cursor.description]
        if columns:
            if columns != query_columns:
                raise CheckException('Assertion error: {} != {}'.format(columns, query_columns))
        else:
            columns = query_columns

        results = []
        # insert database name as new column for each row
        for row in data:
            r = list(row)
            r.insert(0, db)
            results.append(r)

        rows.extend(results)
        logger.debug("%s: received %d rows and %d columns for db %s", cls.__name__, len(data), len(columns), db)

    # reset back to previous db
    logger.debug("%s: reverting cursor context via use statement to %s", cls.__name__, current_db)
    cursor.execute(construct_use_statement(current_db))

    return rows, columns

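# Sketch of the resulting shape (hypothetical values), assuming
# construct_use_statement renders something like "use [db1]" and QUERY_BASE
# selects two counter columns:
#   columns -> ['database', 'counter_name', 'cntr_value']
#   rows    -> [['db1', 'Log Growths', 0], ['db2', 'Log Growths', 3], ...]
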
def check(self, instance):
    if self.kube_apiserver_config is None:
        self.kube_apiserver_config = self.get_scraper_config(instance)

    if not self.kube_apiserver_config['metrics_mapper']:
        url = self.kube_apiserver_config['prometheus_url']
        raise CheckException("You have to collect at least one metric from the endpoint: {}".format(url))

    self.process(self.kube_apiserver_config, metric_transformers=self.metric_transformers)

def check(self, instance):
    url = instance.get('url', '')
    default_timeout = instance.get('default_timeout', 5)
    timeout = float(instance.get('timeout', default_timeout))
    tags = instance.get('tags', [])

    if not url:
        raise CheckException("Configuration error, please fix conf.yaml")

    try:
        r = requests.get(url, timeout=timeout)
    except requests.exceptions.Timeout:
        raise CheckException('URL: {0} timed out after {1} seconds.'.format(url, timeout))
    except requests.exceptions.ConnectionError as e:
        raise CheckException(e)

    if r.status_code != 200:
        raise CheckException('Invalid Status Code, {0} returned a status of {1}.'.format(url, r.status_code))

    try:
        stats = json.loads(r.text)
    except ValueError:
        raise CheckException('{0} returned an unserializable payload'.format(url))

    for key, val in iteritems(stats):
        if key in self.REPL_STATS:
            self.safe_submit_metric("riak_repl." + key, val, tags=tags)

    if stats['realtime_enabled'] is not None:
        for key, val in iteritems(stats['realtime_queue_stats']):
            if key in self.REALTIME_QUEUE_STATS:
                self.safe_submit_metric("riak_repl.realtime_queue_stats." + key, val, tags=tags)

    for c in stats['connected_clusters']:
        cluster = c.replace("-", "_")
        if c not in stats['fullsync_coordinator']:
            continue
        for key, val in iteritems(stats['fullsync_coordinator'][c]):
            if key in self.FULLSYNC_COORDINATOR:
                self.safe_submit_metric("riak_repl.fullsync_coordinator." + cluster + "." + key, val, tags=tags)

def check(self, instance):
    host, port, user, password, timeout, server_name = self._get_config(instance)
    tags = instance.get('tags', [])
    tags.append('server_name:{}'.format(server_name))
    service_check_tags = tags + ['url:{}'.format(host)]
    auth = (user, password)

    # Neo specific
    # Create payload using built-in Neo4j queryJmx stored procedure
    payload = {
        "statements": [
            {
                "statement": "CALL dbms.queryJmx('org.neo4j:*') yield attributes with "
                "keys(attributes) as k, attributes unwind k as "
                "row return row, attributes[row]['value'];"
            }
        ]
    }

    try:
        version = self._get_version(host, port, timeout, auth, service_check_tags)
        if version > 2:
            check_url = "{}:{}/db/data/transaction/commit".format(host, port)
        else:
            check_url = "{}:{}/v1/service/metrics".format(host, port)
        r = requests.post(check_url, auth=auth, json=payload, timeout=timeout)
    except Exception as e:
        msg = "Unable to fetch Neo4j stats: {}".format(e)
        self._critical_service_check(service_check_tags, msg)
        raise CheckException(msg)

    if r.status_code != 200:
        msg = "Unexpected status of {0} when fetching Neo4j stats, response: {1}".format(r.status_code, r.text)
        self._critical_service_check(service_check_tags, msg)
        r.raise_for_status()

    stats = r.json()
    self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)

    for doc in stats['results'][0]['data']:
        name = doc['row'][0].lower()
        if name in self.keys:
            try:
                self.gauge(self.display.get(name, ""), doc['row'][1], tags=tags)
            except (TypeError, ValueError):
                continue

def _check_health_endpoint(self, instance, check_type):
    if check_type not in self.ALLOWED_SERVICE_CHECKS:
        raise CheckException("Health endpoint {} is not a valid endpoint".format(check_type))

    url = instance.get('gitlab_url')
    if url is None:
        # Simply ignore this service check if not configured
        self.log.debug("gitlab_url not configured, service check %s skipped", check_type)
        return

    # These define which endpoint is hit and which type of check is actually performed
    # TODO: parse errors and report for single sub-service failure?
    service_check_name = 'gitlab.{}'.format(check_type)
    check_url = '{}/-/{}'.format(url, check_type)

    try:
        self.log.debug("checking %s against %s", check_type, check_url)
        r = self.http.get(check_url)
        if r.status_code != 200:
            self.service_check(
                service_check_name,
                OpenMetricsBaseCheck.CRITICAL,
                message="Got {} when hitting {}".format(r.status_code, check_url),
                tags=self._tags,
            )
            raise Exception("Http status code {} on check_url {}".format(r.status_code, check_url))
        else:
            r.raise_for_status()
    except requests.exceptions.Timeout:
        # If there's a timeout
        self.service_check(
            service_check_name,
            OpenMetricsBaseCheck.CRITICAL,
            message="Timeout when hitting {}".format(check_url),
            tags=self._tags,
        )
        raise
    except Exception as e:
        self.service_check(
            service_check_name,
            OpenMetricsBaseCheck.CRITICAL,
            message="Error hitting {}. Error: {}".format(check_url, e),
            tags=self._tags,
        )
        raise
    else:
        self.service_check(service_check_name, OpenMetricsBaseCheck.OK, self._tags)
        self.log.debug("gitlab check %s succeeded", check_type)

def addrs(self):
    if self._addrs is None or self._addrs == []:
        try:
            self.resolve_ips()
        except Exception as e:
            self.log.error(str(e))
            msg = "URL: {} could not be resolved".format(self.host)
            raise CheckException(msg)
    return self._addrs

def _process_error(self, error_msg):
    if error_msg.code() == ck.KafkaError.TOPIC_AUTHORIZATION_FAILED:
        if self.auth_ticket:
            raise CheckException(
                "The user impersonated using the ticket %s does not have the 'consume' permission on topic %s. "
                "Please update the stream permissions." % (self.auth_ticket, self.topic_path)
            )
        else:
            raise CheckException(
                "dd-agent user could not consume topic '%s'. Please ensure that:\n"
                "\t* This is a non-secure cluster, otherwise a user ticket is required.\n"
                "\t* The dd-agent user has the 'consume' permission on topic %s or "
                "impersonation is correctly configured." % (self.topic_path, self.topic_path)
            )
    elif error_msg.code() != ck.KafkaError._PARTITION_EOF:
        # Partition EOF is expected anytime we reach the end of one partition in the topic.
        # This is expected at least once per partition per check run.
        raise CheckException(error_msg)

def get_api_json(self, url):
    try:
        key = self.api_key
        headers = {"X-Api-Key": key, "content-type": "application/json"}
        response = self.http.get(url, headers=headers)
    except Timeout as e:
        error_message = "Request timeout: {}, {}".format(url, e)
        self.log.warning(error_message)
        self.service_check("can_connect", AgentCheck.CRITICAL, message=error_message)
        raise
    except (HTTPError, InvalidURL, ConnectionError) as e:
        error_message = "Request failed: {}, {}".format(url, e)
        self.log.warning(error_message)
        self.service_check("can_connect", AgentCheck.CRITICAL, message=error_message)
        raise
    except JSONDecodeError as e:
        error_message = "JSON Parse failed: {}, {}".format(url, e)
        self.log.warning(error_message)
        self.service_check("can_connect", AgentCheck.CRITICAL, message=error_message)
        raise
    except ValueError as e:
        error_message = str(e)
        self.log.warning(error_message)
        self.service_check("can_connect", AgentCheck.CRITICAL, message=error_message)
        raise

    if response.status_code != 200:
        error_message = "Expected status code 200 for url {}, but got status code {}; check your config information".format(
            url, response.status_code
        )
        self.log.warning(error_message)
        self.service_check("can_connect", AgentCheck.CRITICAL, message=error_message)
        raise CheckException(error_message)

    self.service_check("can_connect", AgentCheck.OK)
    return response.json()

def get_connection(self):
    if HanaConnection is None:
        raise CheckException("hdbcli is not installed. Check the integration documentation to install it.")

    # https://help.sap.com/viewer/f1b440ded6144a54ada97ff95dac7adf/2.10/en-US/ee592e89dcce4480a99571a4ae7a702f.html
    connection_properties = self.instance.get('connection_properties', {}).copy()

    connection_properties.setdefault('address', self._server)
    connection_properties.setdefault('port', self._port)
    connection_properties.setdefault('user', self._username)
    connection_properties.setdefault('password', self._password)

    timeout_milliseconds = int(self._timeout * 1000)
    connection_properties.setdefault('communicationTimeout', timeout_milliseconds)
    connection_properties.setdefault('nodeConnectTimeout', timeout_milliseconds)

    if self._use_tls:
        connection_properties.setdefault('encrypt', True)
        connection_properties.setdefault('sslHostNameInCertificate', self._server)
        connection_properties.setdefault('sslSNIHostname', self._server)

        tls_verify = self.instance.get('tls_verify', True)
        if not tls_verify:
            connection_properties.setdefault('sslValidateCertificate', False)

        tls_cert = self.instance.get('tls_cert')
        if tls_cert:
            connection_properties.setdefault('sslKeyStore', tls_cert)

        tls_ca_cert = self.instance.get('tls_ca_cert')
        if tls_ca_cert:
            connection_properties.setdefault('sslTrustStore', tls_ca_cert)
        elif not connection_properties.get('sslUseDefaultTrustStore', True):
            connection_properties.setdefault('sslTrustStore', certifi.where())

    try:
        connection = HanaConnection(**connection_properties)
    except Exception as e:
        error = str(e)
        self.log.error('Unable to connect to SAP HANA: %s', error)
        self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, message=error, tags=self._tags)
    else:
        self.service_check(self.SERVICE_CHECK_CONNECT, self.OK, tags=self._tags)
        return connection

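# Example instance configuration exercising these defaults (illustrative
# key names and values, assumed from the attributes read above; anything set
# under `connection_properties` wins over the check-level defaults because of
# the setdefault calls):
#   instances:
#     - server: hana.example.com
#       port: 39015
#       use_tls: true
#       tls_verify: false
#       connection_properties:
#         databaseName: HXE
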
def check(self, instance):
    """
    Process all the endpoints associated with this instance.
    All the endpoints themselves are optional, but at least one must be passed.
    """
    processed = False

    # Get the config for the istio_mesh instance
    istio_mesh_endpoint = instance.get('istio_mesh_endpoint')
    if istio_mesh_endpoint:
        istio_mesh_config = self.config_map[istio_mesh_endpoint]
        # Process istio_mesh
        self.process(istio_mesh_config)
        processed = True

    # Get the config for the process_mixer instance
    process_mixer_endpoint = instance.get('mixer_endpoint')
    if process_mixer_endpoint:
        process_mixer_config = self.config_map[process_mixer_endpoint]
        # Process process_mixer
        self.process(process_mixer_config)
        processed = True

    # Get the config for the process_pilot instance
    process_pilot_endpoint = instance.get('pilot_endpoint')
    if process_pilot_endpoint:
        process_pilot_config = self.config_map[process_pilot_endpoint]
        # Process process_pilot
        self.process(process_pilot_config)
        processed = True

    # Get the config for the process_galley instance
    process_galley_endpoint = instance.get('galley_endpoint')
    if process_galley_endpoint:
        process_galley_config = self.config_map[process_galley_endpoint]
        # Process process_galley
        self.process(process_galley_config)
        processed = True

    # Get the config for the process_citadel instance
    process_citadel_endpoint = instance.get('citadel_endpoint')
    if process_citadel_endpoint:
        process_citadel_config = self.config_map[process_citadel_endpoint]
        # Process process_citadel
        self.process(process_citadel_config)
        processed = True

    # Check that at least 1 endpoint is configured
    if not processed:
        raise CheckException("At least one of Mixer, Mesh, Pilot, Galley or Citadel endpoints must be configured")

def check(self, instance):
    try:
        region_name = instance.get('region_name')
        if not region_name:
            region_name = 'us-east-1'

        pricing_client = boto3.client('pricing', region_name=region_name)
        service_codes = get_aws_service_codes(pricing_client)
        rate_codes_dict = get_rate_codes_dict_from_instance(service_codes, instance)

        # Python dictionaries evaluate to false when empty
        if not rate_codes_dict:
            message = 'No rate codes for existing AWS services were defined, please fix conf.yaml'
            self.service_check('aws_pricing.status', self.CRITICAL, message=message)
            raise CheckException(message)

        missing_rate_codes = defaultdict(list)

        for service_code, rate_codes in iteritems(rate_codes_dict):
            for rate_code in rate_codes:
                price_dimensions = get_aws_prices(pricing_client, service_code, rate_code)
                if price_dimensions is None:
                    missing_rate_codes[service_code].append(rate_code)
                    continue

                name = 'aws.pricing.{}'.format(service_code.lower())
                price = get_price_from_price_dimensions(price_dimensions)
                tags = get_tags_from_price_dimensions(price_dimensions)
                self.gauge(name, price, tags)

        # Python dictionaries evaluate to true when not empty
        if not missing_rate_codes:
            self.service_check('aws_pricing.status', self.OK)
        else:
            message = 'Pricing data not found for these service rate codes: {}'.format(dict(missing_rate_codes))
            self.service_check('aws_pricing.status', self.WARNING, message=message)
    except ClientError as client_error:
        self.service_check('aws_pricing.status', self.CRITICAL, message=str(client_error))
        raise CheckException('Pricing Service client error: {}'.format(str(client_error)))

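# Illustrative shape of the per-instance rate code mapping consumed above
# (service and rate codes are hypothetical placeholders, not real offers):
#   rate_codes_dict == {
#       'AmazonEC2': ['ABCDEFGHIJKL.MNOPQRSTUV.WXYZ012345'],
#       'AmazonS3': ['0123456789AB.CDEFGHIJKL.MNOPQRSTUV'],
#   }
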
def _load_conf(self, instance):
    # Fetches the conf
    timeout = float(instance.get("timeout", 4))
    response_time = instance.get("collect_response_time", False)
    custom_tags = instance.get("tags", [])
    host = instance.get("host", None)
    if host is None:
        raise CheckException("A valid host must be specified")

    return host, custom_tags, timeout, response_time

def get_clusters(self, base_url):
    clusters_endpoint = common.CLUSTERS_URL.format(base_url=base_url)

    resp = self._make_request(clusters_endpoint)
    if resp is None:
        self._submit_service_checks("can_connect", self.CRITICAL, ["url:{}".format(base_url)])
        raise CheckException(
            "Couldn't connect to URL: {}. Please verify the address is reachable".format(clusters_endpoint)
        )

    self._submit_service_checks("can_connect", self.OK, ["url:{}".format(base_url)])
    return self._get_response_clusters(resp)

def check(self, _):
    if ck is None:
        raise CheckException(
            "confluent_kafka was not imported correctly, make sure the library is installed and that you've "
            "set LD_LIBRARY_PATH correctly. Please refer to datadog documentation for more details. Error is %s"
            % ck_import_error
        )

    try:
        conn = self.get_connection()
    except Exception:
        self.service_check(
            SERVICE_CHECK, AgentCheck.CRITICAL, self.custom_tags + ['topic:{}'.format(self.topic_path)]
        )
        raise
    else:
        self.service_check(SERVICE_CHECK, AgentCheck.OK, self.custom_tags + ['topic:{}'.format(self.topic_path)])

    submitted_metrics_count = 0

    while True:
        # Collecting one message at a time has no impact on performance because the library
        # batches data. Most calls to `poll` won't initiate an I/O connection.
        msg = conn.poll(timeout=0.5)
        if msg is None:
            # Timed out, no more messages
            break

        if msg.error() is None:
            # Metric received
            submitted_metrics_count += self._process_metric(msg)
        else:
            self._process_error(msg.error())

    if not self.has_ever_submitted_metrics:
        # The integration has never found any metric so far
        if submitted_metrics_count:
            self.has_ever_submitted_metrics = True
            self.log.info("The integration collected metrics for the first time in topic %s", self.topic_path)
        else:
            self.log.error(
                "The integration was not yet able to collect any MapR metric in topic %s. If this error continues "
                "after a few check runs, double-check the existence of the stream and the topic using "
                "maprcli as well as the permissions on this topic.",
                self.topic_path,
            )

    if submitted_metrics_count:
        self.gauge(METRICS_SUBMITTED_METRIC_NAME, submitted_metrics_count, self.custom_tags)
