def check(self, instance):
    instance_name = instance.get("name")
    if instance_name is None:
        raise Exception("Each instance must have a unique name")

    server = instance.get("server")
    if server is None:
        raise Exception("Each instance must have a server")

    # Check the server URL for HTTP or HTTPS designation,
    # fall back to http:// if no scheme present (allows for backwards compatibility).
    server = self._normalize_server_url(server)

    build_conf = instance.get("build_configuration")
    if build_conf is None:
        raise Exception("Each instance must have a build configuration")

    host = instance.get("host_affected") or self.hostname
    tags = instance.get("tags")
    is_deployment = _is_affirmative(instance.get("is_deployment", False))
    basic_http_authentication = _is_affirmative(instance.get("basic_http_authentication", False))

    self._initialize_if_required(instance_name, server, build_conf, basic_http_authentication)

    # Look for new successful builds
    if basic_http_authentication:
        new_build_url = self.NEW_BUILD_URL_AUTHENTICATED.format(
            server=server, build_conf=build_conf, since_build=self.last_build_ids[instance_name]
        )
    else:
        new_build_url = self.NEW_BUILD_URL.format(
            server=server, build_conf=build_conf, since_build=self.last_build_ids[instance_name]
        )

    try:
        resp = self.http.get(new_build_url)
        resp.raise_for_status()

        new_builds = resp.json()

        if new_builds["count"] == 0:
            self.log.debug("No new builds found.")
        else:
            self._build_and_send_event(new_builds["build"][0], instance_name, is_deployment, host, tags)
    except requests.exceptions.HTTPError:
        self.log.exception("Couldn't fetch last build, got code %s", resp.status_code)
        raise
    except Exception:
        self.log.exception("Couldn't fetch last build, unhandled exception")
        raise
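# A minimal sketch of the `_normalize_server_url` helper called above, assuming
# its only job is what the accompanying comment describes: keep an explicit
# http:// or https:// scheme and fall back to http:// otherwise. The real
# helper may do more (e.g. strip trailing slashes).
def _normalize_server_url(self, server):
    # Leave explicit schemes untouched; prefix bare hosts for backwards compatibility.
    if not server.startswith(("http://", "https://")):
        server = "http://{}".format(server)
    return server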
def check(self, _):
    url = self.instance.get("url")
    custom_tags = self.instance.get('tags', [])
    max_queues = int(self.instance.get("max_queues", MAX_ELEMENTS))
    max_topics = int(self.instance.get("max_topics", MAX_ELEMENTS))
    max_subscribers = int(self.instance.get("max_subscribers", MAX_ELEMENTS))
    detailed_queues = self.instance.get("detailed_queues", [])
    detailed_topics = self.instance.get("detailed_topics", [])
    detailed_subscribers = self.instance.get("detailed_subscribers", [])
    suppress_errors = _is_affirmative(self.instance.get("suppress_errors", False))
    tags = custom_tags + ["url:{0}".format(url)]

    self.log.debug("Processing ActiveMQ data for %s", url)

    data = self._fetch_data(url, QUEUE_URL, suppress_errors)
    if data:
        self._process_data(data, "queue", tags, max_queues, detailed_queues)

    data = self._fetch_data(url, TOPIC_URL, suppress_errors)
    if data:
        self._process_data(data, "topic", tags, max_topics, detailed_topics)

    data = self._fetch_data(url, SUBSCRIBER_URL, suppress_errors)
    if data:
        self._process_subscriber_data(data, tags, max_subscribers, detailed_subscribers)
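# A hypothetical instance configuration exercising every option read by the
# ActiveMQ check above; the values are illustrative, not defaults.
instance = {
    "url": "http://localhost:8161",
    "tags": ["env:dev"],
    "max_queues": 100,
    "max_topics": 100,
    "max_subscribers": 100,
    "detailed_queues": ["orders"],
    "detailed_topics": ["events"],
    "detailed_subscribers": ["audit"],
    "suppress_errors": False,
}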
def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
    use_sudo = _is_affirmative(instance.get('use_sudo', False))
    if use_sudo:
        test_sudo = os.system('setsid sudo -l < /dev/null')
        if test_sudo != 0:
            raise Exception('The dd-agent user does not have sudo access')
        ceph_args = 'sudo {}'.format(ceph_cmd)
    else:
        ceph_args = ceph_cmd

    ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)

    raw = {}
    for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail'):
        try:
            args = '{} {} -fjson'.format(ceph_args, cmd)
            output, _, _ = get_subprocess_output(args.split(), self.log)
            res = json.loads(output)
        except Exception as e:
            self.log.warning('Unable to parse data from cmd=%s: %s', cmd, e)
            continue

        name = cmd.replace(' ', '_')
        raw[name] = res

    return raw
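# A minimal sketch of the `_is_affirmative` helper these checks rely on,
# assuming it follows the usual Datadog convention of accepting booleans,
# non-zero numbers, and affirmative strings; the real helper may differ.
def _is_affirmative(value):
    if value is None:
        return False
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return value != 0
    # Accept common "yes" spellings from YAML configs.
    return str(value).lower() in ('yes', 'true', '1', 'y', 'on')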
def build_resource_filters(raw_filters):
    # type: (List[Dict[str, Any]]) -> Dict[str, List[ResourceFilter]]
    created_filters = {'included': [], 'excluded': []}  # type: Dict[str, List[ResourceFilter]]
    for f in raw_filters:
        included = _is_affirmative(f.get('include', True))
        if f.get('pattern') is None or f.get('resource_type') is None:
            raise ConfigurationError('A resource filter requires at least a pattern and a resource_type')
        if f['resource_type'] not in ALLOWED_RESOURCES_FOR_FILTERS:
            raise ConfigurationError('Unknown resource_type: {}'.format(f['resource_type']))
        regex = re.compile(f['pattern'])
        if included:
            created_filters['included'].append(ResourceFilter(f['resource_type'], regex, True, f.get('group')))
        else:
            created_filters['excluded'].append(ResourceFilter(f['resource_type'], regex, False, f.get('group')))
    return created_filters
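# A usage sketch for `build_resource_filters`, assuming 'node' is an entry in
# ALLOWED_RESOURCES_FOR_FILTERS; the patterns are illustrative.
raw_filters = [
    {'pattern': '^prod-', 'resource_type': 'node'},  # included by default
    {'pattern': '^staging-', 'resource_type': 'node', 'include': False},
]
filters = build_resource_filters(raw_filters)
# filters['included'] now holds one ResourceFilter, filters['excluded'] the other.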
def __init__(self, instance):
    # type: (Dict[str, Any]) -> None
    self.url = instance.get('url', '')  # type: str
    if self.url == '':
        raise ConfigurationError("url is a required configuration.")
    self.tags = instance.get('tags', [])
    self.enable_health_service_checks = _is_affirmative(instance.get('enable_health_service_checks', False))
    self.resource_filters = self.build_resource_filters(instance.get('resource_filters', []))
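# A hypothetical instance dict covering the options this constructor reads;
# only `url` is required, the rest fall back to the defaults shown above. The
# filter entry reuses the shape expected by `build_resource_filters` and
# assumes 'node' is an allowed resource_type.
instance = {
    'url': 'https://kubernetes.default:443',
    'tags': ['cluster:demo'],
    'enable_health_service_checks': True,
    'resource_filters': [
        {'pattern': '^internal-', 'resource_type': 'node', 'include': False},
    ],
}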
def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
    use_sudo = _is_affirmative(instance.get('use_sudo', False))
    if use_sudo:
        test_sudo = os.system('setsid sudo -l < /dev/null')
        if test_sudo != 0:
            raise CheckException('The dd-agent user does not have sudo access')
        ceph_args = 'sudo {}'.format(ceph_cmd)
    else:
        ceph_args = ceph_cmd

    ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)

    raw = {}
    for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail'):
        try:
            args = '{} {} -fjson'.format(ceph_args, cmd)
            output, _, _ = get_subprocess_output(args.split(), self.log)
            res = json.loads(output)
        except Exception as e:
            self.log.warning('Unable to parse data from cmd=%s: %s', cmd, e)
            continue

        name = cmd.replace(' ', '_')
        raw[name] = res

    mon_map = raw.get('status', {}).get('monmap')
    if mon_map is None:
        raise RuntimeError("Could not detect Ceph release series")

    if 'min_mon_release_name' in mon_map and mon_map['min_mon_release_name'] == 'octopus':
        self.log.debug("Detected octopus version of ceph...")
        self._octopus = True
    else:
        self._octopus = False

    return raw
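# The release detection above only inspects raw['status']['monmap']. A payload
# trimmed to the single field the check reads might look like this
# (illustrative; real `ceph status -f json` output carries many more keys):
raw = {
    'status': {
        'monmap': {
            'min_mon_release_name': 'octopus',
        },
    },
}
mon_map = raw.get('status', {}).get('monmap')
assert 'min_mon_release_name' in mon_map and mon_map['min_mon_release_name'] == 'octopus'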
def check(self, instance):
    name = instance.get('name', None)
    tags = instance.get('tags', [])
    exact_match = _is_affirmative(instance.get('exact_match', True))
    search_string = instance.get('search_string', None)
    ignore_ad = _is_affirmative(instance.get('ignore_denied_access', True))
    pid = instance.get('pid')
    pid_file = instance.get('pid_file')
    collect_children = _is_affirmative(instance.get('collect_children', False))
    user = instance.get('user', False)
    try_sudo = instance.get('try_sudo', False)

    if self._conflicting_procfs:
        self.warning(
            'The `procfs_path` defined in `process.yaml` is different from the one defined in '
            '`datadog.conf`. This is currently not supported by the Agent. Defaulting to the '
            'value defined in `datadog.conf`: %s',
            psutil.PROCFS_PATH,
        )
    elif self._deprecated_init_procfs:
        self.warning(
            'DEPRECATION NOTICE: Specifying `procfs_path` in `process.yaml` is deprecated. '
            'Please specify it in `datadog.conf` instead'
        )

    if not isinstance(search_string, list) and pid is None and pid_file is None:
        raise ValueError('"search_string" or "pid" or "pid_file" parameter is required')

    # FIXME 8.x remove me
    if search_string is not None:
        if "All" in search_string:
            self.warning(
                'Deprecated: Having "All" in your search_string will greatly reduce the '
                'performance of the check and will be removed in a future version of the agent.'
            )

    if name is None:
        raise KeyError('The "name" of process groups is mandatory')

    if search_string is not None:
        pids = self.find_pids(name, search_string, exact_match, ignore_ad=ignore_ad)
    elif pid is not None:
        # we use Process(pid) as a means to search, if pid not found
        # psutil.NoSuchProcess is raised.
        pids = self._get_pid_set(pid)
    elif pid_file is not None:
        try:
            with open(pid_file, 'r') as file_pid:
                pid_line = file_pid.readline().strip()
                pids = self._get_pid_set(int(pid_line))
        except IOError as e:
            # pid file doesn't exist, assuming the process is not running
            self.log.debug('Unable to find pid file: %s', e)
            pids = set()
    else:
        raise ValueError('The "search_string" or "pid" options are required for process identification')

    if collect_children:
        pids.update(self._get_child_processes(pids))

    if user:
        pids = self._filter_by_user(user, pids)

    proc_state = self.get_process_state(name, pids, try_sudo)

    # FIXME 8.x remove the `name` tag
    tags.extend(['process_name:{}'.format(name), name])

    self.log.debug('ProcessCheck: process %s analysed', name)
    self.gauge('system.processes.number', len(pids), tags=tags)

    if len(pids) == 0:
        self.warning("No matching process '%s' was found", name)
        # reset the process caches now, something changed
        self.last_pid_cache_ts[name] = 0
        self.process_list_cache.reset()

    for attr, mname in iteritems(ATTR_TO_METRIC):
        vals = [x for x in proc_state[attr] if x is not None]
        # skip []
        if vals:
            sum_vals = sum(vals)
            if attr == 'run_time':
                self.gauge('system.processes.{}.avg'.format(mname), sum_vals / len(vals), tags=tags)
                self.gauge('system.processes.{}.max'.format(mname), max(vals), tags=tags)
                self.gauge('system.processes.{}.min'.format(mname), min(vals), tags=tags)
            # FIXME 8.x: change this prefix?
            else:
                self.gauge('system.processes.{}'.format(mname), sum_vals, tags=tags)
                if mname in ['ioread_bytes', 'iowrite_bytes']:
                    self.monotonic_count('system.processes.{}_count'.format(mname), sum_vals, tags=tags)

    for attr, mname in iteritems(ATTR_TO_METRIC_RATE):
        vals = [x for x in proc_state[attr] if x is not None]
        if vals:
            self.rate('system.processes.{}'.format(mname), sum(vals), tags=tags)

    self._process_service_check(name, len(pids), instance.get('thresholds', None), tags)
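# A hypothetical instance configuration for the process check above. The
# `thresholds` mapping is passed straight to `_process_service_check`; the
# [min, max] ranges shown here are illustrative.
instance = {
    'name': 'ssh',
    'search_string': ['sshd'],
    'exact_match': True,
    'ignore_denied_access': True,
    'collect_children': False,
    'thresholds': {'warning': [1, 4], 'critical': [1, 5]},
    'tags': ['service:ssh'],
}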
def check(self, _):
    aci_url = self.instance.get('aci_url')
    aci_urls = self.instance.get('aci_urls', [])
    if aci_url:
        aci_urls.append(aci_url)

    if not aci_urls:
        raise ConfigurationError("The Cisco ACI check requires at least one url")

    username = self.instance['username']
    pwd = self.instance.get('pwd')
    instance_hash = hash_mutable(self.instance)

    appcenter = _is_affirmative(self.instance.get('appcenter'))

    cert_key = self.instance.get('cert_key')
    if not cert_key and self.instance.get('cert_key_path'):
        with open(self.instance.get('cert_key_path'), 'rb') as f:
            cert_key = f.read()

    cert_name = self.instance.get('cert_name')
    if not cert_name:
        cert_name = username

    cert_key_password = self.instance.get('cert_key_password')

    if instance_hash in self._api_cache:
        api = self._api_cache.get(instance_hash)
    else:
        api = Api(
            aci_urls,
            self.http,
            username,
            password=pwd,
            cert_name=cert_name,
            cert_key=cert_key,
            log=self.log,
            appcenter=appcenter,
            cert_key_password=cert_key_password,
        )
        self._api_cache[instance_hash] = api

    service_check_tags = []
    for url in aci_urls:
        service_check_tags.append("url:{}".format(url))
    service_check_tags.extend(self.check_tags)
    service_check_tags.extend(self.instance.get('tags', []))

    try:
        api.login()
    except Exception as e:
        self.log.error("Cannot login to the Cisco ACI: %s", e)
        self.service_check(
            SERVICE_CHECK_NAME,
            AgentCheck.CRITICAL,
            message="aci login returned a status of {}".format(e),
            tags=service_check_tags,
        )
        raise

    self.tagger.api = api

    try:
        tenant = Tenant(self, api, self.instance, instance_hash)
        tenant.collect()
    except Exception as e:
        self.log.error('tenant collection failed: %s', e)
        self.service_check(
            SERVICE_CHECK_NAME,
            AgentCheck.CRITICAL,
            message="aci tenant operations failed, returning a status of {}".format(e),
            tags=service_check_tags,
        )
        api.close()
        raise

    try:
        fabric = Fabric(self, api, self.instance)
        fabric.collect()
    except Exception as e:
        self.log.error('fabric collection failed: %s', e)
        self.service_check(
            SERVICE_CHECK_NAME,
            AgentCheck.CRITICAL,
            message="aci fabric operations failed, returning a status of {}".format(e),
            tags=service_check_tags,
        )
        api.close()
        raise

    try:
        capacity = Capacity(api, self.instance, check_tags=self.check_tags, gauge=self.gauge, log=self.log)
        capacity.collect()
    except Exception as e:
        self.log.error('capacity collection failed: %s', e)
        self.service_check(
            SERVICE_CHECK_NAME,
            AgentCheck.CRITICAL,
            message="aci capacity operations failed, returning a status of {}".format(e),
            tags=service_check_tags,
        )
        api.close()
        raise

    self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)
    self.set_external_tags(self.get_external_host_tags())
    api.close()
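# A hypothetical Cisco ACI instance; credentials and hostnames are
# illustrative. Per the branching above, authentication uses either `pwd` or a
# certificate (`cert_key` inline, or `cert_key_path` read from disk), and
# `cert_name` defaults to the username when omitted.
instance = {
    'aci_urls': ['https://apic1.example.com'],
    'username': 'datadog',
    'pwd': '<PASSWORD>',  # or supply cert_key / cert_key_path instead
    'appcenter': False,
    'tags': ['site:dc1'],
}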