def check_health_v1(self, config, tags):
    """Query ``/sys/health`` and submit the unsealed/initialized service checks.

    ``tags`` is extended in place with ``cluster_name`` and ``vault_version``
    tags when the endpoint reports those values.
    """
    response = self.access_api(config['api_url'] + '/sys/health', config, tags)
    health_data = response.json()

    # Enrich the tag list with whatever identifying metadata the API returned.
    for tag_key, field in (('cluster_name', 'cluster_name'), ('vault_version', 'version')):
        value = health_data.get(field)
        if value:
            tags.append('{}:{}'.format(tag_key, value))

    sealed = is_affirmative(health_data.get('sealed'))
    self.service_check(
        self.SERVICE_CHECK_UNSEALED,
        AgentCheck.CRITICAL if sealed else AgentCheck.OK,
        tags=tags
    )

    initialized = is_affirmative(health_data.get('initialized'))
    self.service_check(
        self.SERVICE_CHECK_INITIALIZED,
        AgentCheck.OK if initialized else AgentCheck.CRITICAL,
        tags=tags
    )
def check(self, instance):
    """Read the directory options from ``instance`` and collect its stats.

    Raises a plain ``Exception`` when `directory` is not configured, or when
    the directory does not exist and `ignore_missing` is not enabled. With
    `ignore_missing` enabled a missing directory is logged and skipped.
    """
    if 'directory' not in instance:
        raise Exception('DirectoryCheck: missing `directory` in config')
    directory = instance['directory']

    def flag(option):
        # Boolean options all default to disabled.
        return is_affirmative(instance.get(option, False))

    abs_directory = abspath(directory)
    name = instance.get('name', directory)
    pattern = instance.get('pattern')

    # Excluded directory names are OR-ed together into a single regex.
    exclude_dirs = instance.get('exclude_dirs', [])
    exclude_dirs_pattern = None
    if exclude_dirs:
        exclude_dirs_pattern = re_compile('|'.join(exclude_dirs))

    recursive = flag('recursive')
    dirtagname = instance.get('dirtagname', 'name')
    filetagname = instance.get('filetagname', 'filename')
    filegauges = flag('filegauges')
    countonly = flag('countonly')
    ignore_missing = flag('ignore_missing')
    custom_tags = instance.get('tags', [])

    if not exists(abs_directory):
        missing_msg = 'DirectoryCheck: the directory `{}` does not exist. Skipping.'.format(abs_directory)
        if not ignore_missing:
            raise Exception(missing_msg)
        self.log.info(missing_msg)
        return

    self._get_stats(
        abs_directory, name, dirtagname, filetagname, filegauges,
        pattern, exclude_dirs_pattern, recursive, countonly, custom_tags
    )
def check(self, instance):
    """Validate the instance configuration and collect directory statistics.

    Reads the directory options from ``instance`` and forwards them to
    ``self._get_stats``.

    Raises:
        ConfigurationError: if `directory` is missing from the instance, or if
            the directory does not exist (or is not accessible) and
            `ignore_missing` is not enabled.
    """
    try:
        directory = instance['directory']
    except KeyError:
        raise ConfigurationError('DirectoryCheck: missing `directory` in config')

    abs_directory = abspath(directory)
    name = instance.get('name', directory)
    pattern = instance.get('pattern')
    exclude_dirs = instance.get('exclude_dirs', [])
    # Excluded directory names are OR-ed together into a single regex.
    exclude_dirs_pattern = re_compile('|'.join(exclude_dirs)) if exclude_dirs else None
    dirs_patterns_full = is_affirmative(instance.get('dirs_patterns_full', False))
    recursive = is_affirmative(instance.get('recursive', False))
    dirtagname = instance.get('dirtagname', 'name')
    filetagname = instance.get('filetagname', 'filename')
    filegauges = is_affirmative(instance.get('filegauges', False))
    countonly = is_affirmative(instance.get('countonly', False))
    ignore_missing = is_affirmative(instance.get('ignore_missing', False))
    custom_tags = instance.get('tags', [])

    if not exists(abs_directory):
        msg = (
            "Either directory '{}' doesn't exist or the Agent doesn't "
            "have permissions to access it, skipping.".format(abs_directory)
        )
        if not ignore_missing:
            raise ConfigurationError(msg)

        self.log.warning(msg)
        # Actually skip, as the message promises: there is nothing to walk,
        # so do not fall through to the stats collection.
        return

    self._get_stats(
        abs_directory, name, dirtagname, filetagname, filegauges, pattern,
        exclude_dirs_pattern, dirs_patterns_full, recursive, countonly, custom_tags
    )
def __init__(self, instance):
    """Populate the connection, queue and SSL settings from ``instance``.

    Every option other than `channel`, `username` and `password` falls back
    to a built-in default when absent.
    """
    option = instance.get

    # Connection endpoint
    self.channel = option('channel')
    self.queue_manager_name = option('queue_manager', 'default')
    self.host = option('host', 'localhost')
    self.port = option('port', '1414')
    # Endpoint rendered as "host(port)".
    self.host_and_port = "{}({})".format(self.host, self.port)

    # Credentials
    self.username = option('username')
    self.password = option('password')

    # Queue selection
    self.queues = option('queues', [])
    self.queue_patterns = option('queue_patterns', [])
    self.custom_tags = option('tags', [])
    self.auto_discover_queues = is_affirmative(option('auto_discover_queues', False))

    # SSL settings
    self.ssl = is_affirmative(option('ssl_auth', False))
    self.ssl_cipher_spec = option('ssl_cipher_spec', 'TLS_RSA_WITH_AES_256_CBC_SHA')
    self.key_repository_location = option(
        'ssl_key_repository_location', '/var/mqm/ssl-db/client/KeyringClient'
    )

    self.mq_installation_dir = option('mq_installation_dir', '/opt/mqm/')
def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialize the check state from ``init_config``.

    Raises ``IncompleteConfig`` when `keystone_server_url` is not set.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    def compile_rules(option):
        # Each rule option is a list of regex strings; keep the compiled set.
        return set(re.compile(pattern) for pattern in init_config.get(option, []))

    self._ssl_verify = is_affirmative(init_config.get("ssl_verify", True))
    self.keystone_server_url = init_config.get("keystone_server_url")
    self.hypervisor_name_cache = {}

    # Falsy values (missing, None, 0) fall back to the module defaults.
    self.paginated_server_limit = init_config.get('paginated_server_limit') or DEFAULT_PAGINATED_SERVER_LIMIT
    self.request_timeout = init_config.get('request_timeout') or DEFAULT_API_REQUEST_TIMEOUT

    if not self.keystone_server_url:
        raise IncompleteConfig()

    # Current authentication scopes
    self._parent_scope = None
    self._current_scope = None

    # Cache some things between runs for values that change rarely
    self._aggregate_list = None

    # Mapping of check instances to associated OpenStack project scopes
    self.instance_map = {}

    # Mapping of Nova-managed servers to tags
    self.external_host_tags = {}

    self.exclude_network_id_rules = compile_rules('exclude_network_ids')
    self.exclude_server_id_rules = compile_rules('exclude_server_ids')
    self.include_project_name_rules = compile_rules('whitelist_project_names')
    self.exclude_project_name_rules = compile_rules('blacklist_project_names')

    use_agent_proxy = is_affirmative(init_config.get('use_agent_proxy', True))
    self.proxy_config = self.proxies if use_agent_proxy else None

    self.backoff = BackOffRetry(self)

    # ISO8601 date time: used to filter the call to get the list of nova servers
    self.changes_since_time = {}

    # Ex: server_details_by_id = {
    #   UUID: {UUID: <value>, etc}
    #   1: {id: 1, name: hostA},
    #   2: {id: 2, name: hostB}
    # }
    self.server_details_by_id = {}
def _check_for_leader_change(self, instance, instance_state):
    """Emit a ``consul.new_leader`` event when the cluster leader changes.

    The `new_leader_checks` and `self_leader_check` options are read from the
    instance first and from ``init_config`` second. With `new_leader_checks`
    every leadership change produces an event; with `self_leader_check` an
    event is produced only when this agent is the new leader.

    ``instance_state.last_known_leader`` is updated with the current leader
    whenever leader information is available.
    """
    # Local import keeps this block self-contained.
    import time

    perform_new_leader_checks = is_affirmative(
        instance.get('new_leader_checks', self.init_config.get('new_leader_checks', False))
    )
    perform_self_leader_check = is_affirmative(
        instance.get('self_leader_check', self.init_config.get('self_leader_check', False))
    )

    if perform_new_leader_checks and perform_self_leader_check:
        # `Logger.warn` is a deprecated alias of `Logger.warning`.
        self.log.warning('Both perform_self_leader_check and perform_new_leader_checks are set, '
                         'ignoring perform_new_leader_checks')
    elif not perform_new_leader_checks and not perform_self_leader_check:
        # Nothing to do here
        return

    leader = self._get_cluster_leader(instance)

    if not leader:
        # A few things could be happening here.
        #   1. Consul Agent is Down
        #   2. The cluster is in the midst of a leader election
        #   3. The Datadog agent is not able to reach the Consul instance (network partition et al.)
        self.log.warning('Consul Leader information is not available!')
        return

    if not instance_state.last_known_leader:
        # We have no state preserved, store some and return
        instance_state.last_known_leader = leader
        return

    agent = self._get_agent_url(instance, instance_state)
    agent_dc = self._get_agent_datacenter(instance, instance_state)

    if leader != instance_state.last_known_leader:
        # There was a leadership change
        if perform_new_leader_checks or (perform_self_leader_check and agent == leader):
            # We either emit all leadership changes or emit when we become the leader and that just happened
            self.log.info(('Leader change from {0} to {1}. Sending new leader event').format(
                instance_state.last_known_leader, leader))

            self.event({
                # `strftime("%s")` is a platform-specific extension that is not
                # available everywhere; `time.time()` yields the same epoch
                # seconds portably.
                "timestamp": int(time.time()),
                "event_type": "consul.new_leader",
                "source_type_name": self.SOURCE_TYPE_NAME,
                "msg_title": "New Consul Leader Elected in consul_datacenter:{0}".format(agent_dc),
                "aggregation_key": "consul.new_leader",
                "msg_text": "The Node at {0} is the new leader of the consul datacenter {1}".format(
                    leader,
                    agent_dc
                ),
                "tags": ["prev_consul_leader:{0}".format(instance_state.last_known_leader),
                         "curr_consul_leader:{0}".format(leader),
                         "consul_datacenter:{0}".format(agent_dc)]
            })

    instance_state.last_known_leader = leader
def get_config(self, instance):
    """Return the cached configuration for ``instance``, building it on first use.

    The API version is taken from the last character of `api_url`. When that
    version is not in ``self.api_versions`` a warning is logged and both the
    URL and the selected API functions fall back to
    ``self.DEFAULT_API_VERSION``.

    Returns:
        The configuration dict, or ``None`` (after logging an error) when
        `api_url` is missing or empty.
    """
    instance_id = hash_mutable(instance)
    config = self.config.get(instance_id)
    if config is None:
        config = {}

        api_url = instance.get('api_url')
        if not api_url:
            self.log.error('Vault configuration setting `api_url` is required')
            return

        api_version = api_url[-1]
        if api_version not in self.api_versions:
            self.log.warning(
                'Unknown Vault API version `{}`, using version '
                '`{}`'.format(api_version, self.DEFAULT_API_VERSION))
            # Fall back for real: previously the default version *string*
            # was indexed with 'functions', which raised TypeError.
            api_url = api_url[:-1] + self.DEFAULT_API_VERSION
            api_version = self.DEFAULT_API_VERSION

        config['api_url'] = api_url
        config['api'] = self.api_versions[api_version]['functions']

        client_token = instance.get('client_token')
        config['headers'] = {'X-Vault-Token': client_token} if client_token else None

        username = instance.get('username')
        password = instance.get('password')
        # Basic auth is only used when both halves of the credential are set.
        config['auth'] = (username, password) if username and password else None

        config['ssl_verify'] = is_affirmative(instance.get('ssl_verify', True))
        config['ssl_ignore_warning'] = is_affirmative(instance.get('ssl_ignore_warning', False))
        config['proxies'] = self.get_instance_proxy(instance, config['api_url'])
        config['timeout'] = int(instance.get('timeout', 20))
        config['tags'] = instance.get('tags', [])

        # Keep track of the previous cluster leader to detect changes.
        config['leader'] = None
        config['detect_leader'] = is_affirmative(instance.get('detect_leader'))

        self.config[instance_id] = config

    return config
def check_leader_v1(self, config, tags):
    """Query ``/sys/leader``, report ``vault.is_leader`` and detect leader changes.

    ``tags`` is extended in place with an ``is_leader`` tag. When
    `detect_leader` is enabled and a leader address is reported, an event is
    sent if the address differs from the one remembered in ``config['leader']``,
    and the remembered address is then updated.
    """
    response = self.access_api(config['api_url'] + '/sys/leader', config, tags)
    leader_data = response.json()

    is_leader = is_affirmative(leader_data.get('is_self'))
    leader_label = 'true' if is_leader else 'false'
    tags.append('is_leader:{}'.format(leader_label))
    self.gauge('vault.is_leader', int(is_leader), tags=tags)

    current_leader = leader_data.get('leader_address')
    previous_leader = config['leader']

    # Nothing to track unless detection is on and the API named a leader.
    if not (config['detect_leader'] and current_leader):
        return

    leader_changed = previous_leader is not None and current_leader != previous_leader
    if leader_changed:
        description = 'Leader changed from `{}` to `{}`.'.format(previous_leader, current_leader)
        self.event({
            'timestamp': timestamp(),
            'event_type': self.EVENT_LEADER_CHANGE,
            'msg_title': 'Leader change',
            'msg_text': description,
            'alert_type': 'info',
            'source_type_name': self.CHECK_NAME,
            'host': self.hostname,
            'tags': tags,
        })

    config['leader'] = current_leader
def check(self, instance):
    """Connect to the configured database, collect stats and report success.

    If collection raises ``ShouldRestartException`` the connection is
    re-established without the cache and collection is attempted once more.
    """
    host = instance.get('host', '')
    port = instance.get('port', '')
    user = instance.get('username', '')
    password = instance.get('password', '')
    database_url = instance.get('database_url')
    use_cached = is_affirmative(instance.get('use_cached', True))

    # De-duplicate the configured tags; an explicit null means "no tags".
    configured_tags = instance.get('tags', [])
    tags = [] if configured_tags is None else list(set(configured_tags))

    # Connections are keyed by the URL when one is given, else by host:port.
    key = database_url if database_url else '%s:%s' % (host, port)

    def connect_and_collect(cached):
        db = self._get_connection(
            key, host, port, user, password, tags=tags, database_url=database_url, use_cached=cached
        )
        self._collect_stats(db, tags)

    try:
        connect_and_collect(use_cached)
    except ShouldRestartException:
        self.log.info("Resetting the connection")
        connect_and_collect(False)

    redacted_dsn = self._get_redacted_dsn(host, port, user, database_url)
    self.service_check(
        self.SERVICE_CHECK_NAME,
        AgentCheck.OK,
        tags=self._get_service_checks_tags(host, port, database_url, tags),
        message=u'Established connection to {}'.format(redacted_dsn)
    )
def _cache_morlist_raw(self, instance):
    """
    Schedule the asynchronous discovery that fills the Mor objects queue.

    Does nothing while the queue for this instance still holds objects of any
    resource type. Otherwise submits ``_cache_morlist_raw_async`` to the pool
    and records the refresh time.
    """
    i_key = self._instance_key(instance)
    self.log.debug("Caching the morlist for vcenter instance %s" % i_key)

    # Bail out as soon as one resource type still has queued objects.
    queue = self.mor_objects_queue
    for resource_type in RESOURCE_TYPE_METRICS:
        if not (queue.contains(i_key) and queue.size(i_key, resource_type)):
            continue
        last = self.cache_config.get_last(CacheConfig.Morlist, i_key)
        elapsed = time.time() - last
        self.log.debug("Skipping morlist collection: the objects queue for the "
                       "resource type '{}' is still being processed "
                       "(latest refresh was {}s ago)".format(resource_type, elapsed))
        return

    tags = ["vcenter_server:%s" % instance.get('name')]
    regexes = {
        'host_include': instance.get('host_include_only_regex'),
        'vm_include': instance.get('vm_include_only_regex'),
    }
    include_only_marked = is_affirmative(instance.get('include_only_marked', False))

    # Discover hosts and virtual machines
    async_args = (instance, tags, regexes, include_only_marked)
    self.pool.apply_async(self._cache_morlist_raw_async, args=async_args)

    self.cache_config.set_last(CacheConfig.Morlist, i_key, time.time())
def create(logger, proxies, instance_config):
    """Build and connect the API client selected by ``instance_config``.

    When `openstack_cloud_name` is set an ``OpenstackSDKApi`` is created and
    connected from the OpenStack configuration file; otherwise a
    ``SimpleApi`` is created against `keystone_server_url` and connected
    with the configured `user`.
    """
    setting = instance_config.get

    ssl_verify = is_affirmative(setting("ssl_verify", True))
    paginated_limit = setting('paginated_limit', DEFAULT_PAGINATED_LIMIT)
    request_timeout = setting('request_timeout', DEFAULT_API_REQUEST_TIMEOUT)
    user = setting("user")
    openstack_config_file_path = setting("openstack_config_file_path")
    openstack_cloud_name = setting("openstack_cloud_name")

    # A named cloud means authentication comes straight from the OpenStack
    # configuration file.
    if openstack_cloud_name is not None:
        sdk_api = OpenstackSDKApi(logger)
        sdk_api.connect(openstack_config_file_path, openstack_cloud_name)
        return sdk_api

    simple_api = SimpleApi(
        logger,
        setting("keystone_server_url"),
        timeout=request_timeout,
        ssl_verify=ssl_verify,
        proxies=proxies,
        limit=paginated_limit,
    )
    simple_api.connect(user)
    return simple_api
def get_network_stats(self, tags):
    """
    Collect stats for every monitored network.

    With `check_all_networks` enabled (the default) all network ids are
    fetched and those matching an exclusion rule are dropped; otherwise the
    `network_ids` list from init_config is used. A warning is emitted when
    no network is left to monitor.
    """
    # FIXME: (aaditya) Check all networks defaults to true
    # until we can reliably assign agents to networks to monitor
    check_all = is_affirmative(self.init_config.get('check_all_networks', True))

    if not check_all:
        network_ids = self.init_config.get('network_ids', [])
    else:
        def is_excluded(network_id):
            return any(re.match(rule, network_id) for rule in self.exclude_network_id_rules)

        # Filter out excluded networks
        candidates = set(self.get_all_network_ids())
        network_ids = [network_id for network_id in candidates if not is_excluded(network_id)]

    if not network_ids:
        self.warning(
            "Your check is not configured to monitor any networks.\n" +
            "Please list `network_ids` under your init_config"
        )

    for network_id in network_ids:
        self.get_stats_for_single_network(network_id, tags)
def check(self, instance):
    """Scrape the endpoint for ``instance`` and check the leader-election status.

    Builds the metric-transformer mapping from the default and extra rate
    limiters and queues plus the renamed workqueue metrics, processes the
    scraper configuration with it, and then (unless `leader_election` is
    disabled) checks the election status with the instance's tags.
    """
    # Get the configuration for this specific instance
    scraper_config = self.get_scraper_config(instance)

    # Populate the metric transformers dict
    transformers = {}
    # NOTE: `DEFAUT_RATE_LIMITERS` is spelled as the attribute is referenced here.
    limiters = self.DEFAUT_RATE_LIMITERS + instance.get("extra_limiters", [])
    for limiter in limiters:
        transformers[limiter + "_rate_limiter_use"] = self.rate_limiter_use

    queues = self.DEFAULT_QUEUES + instance.get("extra_queues", [])
    for queue in queues:
        for metric, func in iteritems(self.QUEUE_METRICS_TRANSFORMERS):
            transformers[queue + metric] = func

    # Support new metrics (introduced in v1.14.0)
    for metric_name in self.WORKQUEUE_METRICS_RENAMING:
        transformers[metric_name] = self.workqueue_transformer

    self.ignore_deprecated_metrics = instance.get("ignore_deprecated", self.DEFAULT_IGNORE_DEPRECATED)
    if self.ignore_deprecated_metrics:
        self._filter_metric = self._ignore_deprecated_metric

    self.process(scraper_config, metric_transformers=transformers)

    # Check the leader-election status
    if is_affirmative(instance.get('leader_election', True)):
        # Work on a copy: LEADER_ELECTION_CONFIG is shared state, and writing
        # this instance's tags into it would leak them to every other user
        # of that dict.
        leader_config = dict(self.LEADER_ELECTION_CONFIG)
        leader_config["tags"] = instance.get("tags", [])
        self.check_election_status(leader_config)
def get_config(self, instance):
    """Return the cached configuration for ``instance``, creating it if needed.

    The API version is the last character of `api_url`; an unknown version is
    replaced (in the URL as well) by ``self.DEFAULT_API_VERSION`` after a
    warning. Returns ``None`` after logging an error when a ``KeyError`` is
    raised while resolving the URL or the API functions.
    """
    instance_id = hash_mutable(instance)

    cached = self.config.get(instance_id)
    if cached is not None:
        return cached

    fresh = {}
    try:
        url = instance['api_url']
        version = url[-1]

        if version not in self.api_versions:
            fallback = self.DEFAULT_API_VERSION
            self.log.warning('Unknown Vault API version `%s`, using version `%s`', version, fallback)
            url = url[:-1] + fallback
            version = fallback

        fresh['api_url'] = url
        fresh['api'] = self.api_versions[version]['functions']
    except KeyError:
        self.log.error('Vault configuration setting `api_url` is required')
        return

    fresh['tags'] = instance.get('tags', [])

    # Keep track of the previous cluster leader to detect changes.
    fresh['leader'] = None
    fresh['detect_leader'] = is_affirmative(instance.get('detect_leader'))

    self.config[instance_id] = fresh
    return fresh
def create(logger, proxies, instance_config):
    """Build and connect the API client selected by ``instance_config``.

    An ``OpenstackSDKApi`` is used when `openstack_cloud_name` is set;
    otherwise a ``SimpleApi`` is created against `keystone_server_url`.
    """
    read = instance_config.get

    keystone_server_url = read("keystone_server_url")
    ssl_verify = is_affirmative(read("ssl_verify", True))
    paginated_limit = read('paginated_limit')
    request_timeout = read('request_timeout')
    user = read("user")
    config_file_path = read("openstack_config_file_path")
    cloud_name = read("openstack_cloud_name")

    # With an OpenStack configuration, authentication is made directly from
    # the OpenStack configuration file.
    if cloud_name is not None:
        client = OpenstackSDKApi(logger)
        client.connect(config_file_path, cloud_name)
        return client

    client = SimpleApi(
        logger,
        keystone_server_url,
        timeout=request_timeout,
        ssl_verify=ssl_verify,
        proxies=proxies,
        limit=paginated_limit
    )
    client.connect(user)
    return client
def _get_conn(self, instance):
    """Return the redis client for ``instance``, creating it when needed.

    A new client is built when none is cached for the instance key or when
    `disable_connection_cache` is enabled. ``instance`` receives a default
    `socket_timeout` of 5 seconds if it has none.
    """
    cache_disabled = is_affirmative(instance.get('disable_connection_cache', False))
    key = self._generate_instance_key(instance)

    if not cache_disabled and key in self.connections:
        return self.connections[key]

    try:
        # Only send useful parameters to the redis client constructor
        accepted_params = (
            'host', 'port', 'db', 'password', 'socket_timeout', 'connection_pool', 'charset',
            'errors', 'unix_socket_path', 'ssl', 'ssl_certfile', 'ssl_keyfile', 'ssl_ca_certs',
            'ssl_cert_reqs',
        )

        # Set a default timeout (in seconds) if no timeout is specified in the instance config
        instance.setdefault('socket_timeout', 5)

        connection_params = {}
        for param in accepted_params:
            if param in instance:
                connection_params[param] = instance[param]

        # If caching is disabled, we overwrite the dictionary value so the old connection
        # will be closed as soon as the corresponding Python object gets garbage collected
        self.connections[key] = redis.Redis(**connection_params)
    except TypeError:
        msg = "You need a redis library that supports authenticated connections. Try sudo easy_install redis."
        raise Exception(msg)

    return self.connections[key]
def __init__(self, instance):
    """Populate the connection, queue, channel and SSL settings from ``instance``.

    Logs a warning when more than one of `auto_discover_queues`,
    `queue_patterns` and `queue_regex` is in use.
    """
    option = instance.get

    # Connection endpoint
    self.channel = option('channel')
    self.queue_manager_name = option('queue_manager', 'default')
    self.host = option('host', 'localhost')
    self.port = option('port', '1414')
    # Endpoint rendered as "host(port)".
    self.host_and_port = "{}({})".format(self.host, self.port)

    # Credentials
    self.username = option('username')
    self.password = option('password')

    # Queue selection
    self.queues = option('queues', [])
    self.queue_patterns = option('queue_patterns', [])
    self.queue_regex = [re.compile(expression) for expression in option('queue_regex', [])]
    self.auto_discover_queues = is_affirmative(option('auto_discover_queues', False))

    discovery_modes = (self.auto_discover_queues, self.queue_patterns, self.queue_regex)
    if sum(1 for mode in discovery_modes if mode) > 1:
        log.warning(
            "Configurations auto_discover_queues, queue_patterns and queue_regex are not intended to be used "
            "together."
        )

    # Channels
    self.channels = option('channels', [])
    self.channel_status_mapping = self.get_channel_status_mapping(option('channel_status_mapping'))

    self.custom_tags = option('tags', [])

    # SSL settings
    self.ssl = is_affirmative(option('ssl_auth', False))
    self.ssl_cipher_spec = option('ssl_cipher_spec', 'TLS_RSA_WITH_AES_256_CBC_SHA')
    self.ssl_key_repository_location = option(
        'ssl_key_repository_location', '/var/mqm/ssl-db/client/KeyringClient'
    )

    self.mq_installation_dir = option('mq_installation_dir', '/opt/mqm/')

    # The raw mapping must be stored before it is compiled.
    self._queue_tag_re = option('queue_tag_re', {})
    self.queue_tag_re = self._compile_tag_re()
def test_is_affirmative():
    """``config.is_affirmative`` returns strict booleans for the sampled inputs."""
    negative_inputs = (None, 0, "whatever, it could be 'off'")
    affirmative_inputs = (1, 'YES', 'True', 'On', '1')

    for value in negative_inputs:
        assert config.is_affirmative(value) is False

    for value in affirmative_inputs:
        assert config.is_affirmative(value) is True
def __init__(self, name, init_config, agentConfig, instances=None):
    """Set up connection caches, validate connector settings and prepare metrics.

    For every configured instance, checks whether the target database exists.
    When it does and no `stored_procedure` is configured, the list of metrics
    to collect is built. When it does not, the instance's checks are disabled
    if `ignore_missing_database` is enabled; otherwise an error is logged.
    A failure for one instance is logged and does not stop the others.
    """
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    # Cache connections
    self.connections = {}
    self.failed_connections = {}
    self.instances_metrics = {}
    self.instances_per_type_metrics = defaultdict(dict)
    self.existing_databases = None
    self.do_check = {}
    self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram}
    self.adoprovider = self.default_adoprovider

    self.connector = init_config.get('connector', 'adodbapi')
    if self.connector.lower() not in self.valid_connectors:
        self.log.error("Invalid database connector %s, defaulting to adodbapi", self.connector)
        self.connector = 'adodbapi'

    self.adoprovider = init_config.get('adoprovider', self.default_adoprovider)
    if self.adoprovider.upper() not in self.valid_adoproviders:
        self.log.error(
            "Invalid ADODB provider string %s, defaulting to %s", self.adoprovider, self.default_adoprovider
        )
        self.adoprovider = self.default_adoprovider

    # Pre-process the list of metrics to collect
    self.custom_metrics = init_config.get('custom_metrics', [])

    # `instances` defaults to None; treat that as "no instances" instead of
    # failing on iteration.
    for instance in instances or []:
        try:
            instance_key = self._conn_key(instance, self.DEFAULT_DB_KEY)
            self.do_check[instance_key] = True

            # check to see if the database exists before we try any connections to it
            with self.open_managed_db_connections(instance, None, db_name=self.DEFAULT_DATABASE):
                db_exists, context = self._check_db_exists(instance)

            if db_exists:
                if instance.get('stored_procedure') is None:
                    with self.open_managed_db_connections(instance, self.DEFAULT_DB_KEY):
                        self._make_metric_list_to_collect(instance, self.custom_metrics)
            else:
                # How much do we care that the DB doesn't exist?
                if is_affirmative(instance.get("ignore_missing_database", False)):
                    # not much : we expect it. leave checks disabled
                    self.do_check[instance_key] = False
                    self.log.warning("Database %s does not exist. Disabling checks for this instance.", context)
                else:
                    # yes we do. Keep trying
                    self.log.error("Database %s does not exist. Fix issue and restart agent", context)

        except SQLConnectionError:
            self.log.exception("Skipping SQL Server instance")
            continue
        except Exception as e:
            self.log.exception("Initialization exception %s", e)
            continue
def _get_auth_response_from_config(cls, logger, init_config, instance_config, proxy_config=None):
    """Authenticate against Keystone and return the issued subject token.

    TLS verification follows the `ssl_verify` init_config option and is
    enabled when the option is absent.

    Returns:
        The value of the ``X-Subject-Token`` response header (``None`` when
        the header is absent).

    Raises:
        IncompleteConfig: if `keystone_server_url` is not configured.
    """
    keystone_server_url = init_config.get("keystone_server_url")
    if not keystone_server_url:
        raise IncompleteConfig()

    # Verify certificates unless explicitly disabled: this request carries
    # the user's credentials, and the other `ssl_verify` readers in this
    # code default to True as well.
    ssl_verify = is_affirmative(init_config.get("ssl_verify", True))

    identity = cls._get_user_identity(instance_config)
    keystone_api = KeystoneApi(logger, ssl_verify, proxy_config, keystone_server_url, None)
    resp = keystone_api.post_auth_token(identity)
    return resp.headers.get('X-Subject-Token')
def _cache_morlist_raw_async(self, instance, tags, regexes=None, include_only_marked=False):
    """
    Collect all objects for ``instance`` and store them in the Mor objects queue.
    """
    queue_key = self._instance_key(instance)
    server = self._get_server_instance(instance)
    guest_hostname_enabled = is_affirmative(instance.get("use_guest_hostname", False))

    discovered = self._get_all_objs(
        server, regexes, include_only_marked, tags, use_guest_hostname=guest_hostname_enabled
    )
    self.mor_objects_queue.fill(queue_key, dict(discovered))
def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialize the check from the first configured instance.

    Only ``instances[0]`` is used: it is converted with
    ``_create_kubernetes_state_prometheus_instance`` and handed to the parent
    constructor as the sole instance.
    """
    # We do not support more than one instance of kube-state-metrics
    ksm_instance = self._create_kubernetes_state_prometheus_instance(instances[0])

    # First deprecation phase: we keep ksm labels by default
    # Next iteration: remove ksm labels by default
    # Last iteration: remove this option
    self.keep_ksm_labels = is_affirmative(ksm_instance.get('keep_ksm_labels', True))

    super(KubernetesState, self).__init__(name, init_config, agentConfig, instances=[ksm_instance])

    self.condition_to_status_positive = {'true': self.OK, 'false': self.CRITICAL, 'unknown': self.UNKNOWN}
    self.condition_to_status_negative = {'true': self.CRITICAL, 'false': self.OK, 'unknown': self.UNKNOWN}

    # Parameters for the count_objects_by_tags method
    self.object_count_params = {
        'kube_persistentvolume_status_phase': {
            'metric_name': 'persistentvolumes.by_phase',
            'allowed_labels': ['storageclass', 'phase'],
        },
        'kube_service_spec_type': {'metric_name': 'service.count', 'allowed_labels': ['namespace', 'type']},
    }

    # Metrics whose transformer is the method carrying the metric's own name.
    self_named_metrics = (
        'kube_pod_status_phase',
        'kube_pod_container_status_waiting_reason',
        'kube_pod_container_status_terminated_reason',
        'kube_cronjob_next_schedule_time',
        'kube_job_complete',
        'kube_job_failed',
        'kube_job_status_failed',
        'kube_job_status_succeeded',
        'kube_node_status_condition',
        'kube_node_status_ready',
        'kube_node_status_out_of_disk',
        'kube_node_status_memory_pressure',
        'kube_node_status_disk_pressure',
        'kube_node_status_network_unavailable',
        'kube_node_spec_unschedulable',
        'kube_resourcequota',
        'kube_limitrange',
    )
    transformers = {}
    for metric in self_named_metrics:
        transformers[metric] = getattr(self, metric)
    # Metrics that are only counted, grouped by their allowed labels.
    for metric in ('kube_persistentvolume_status_phase', 'kube_service_spec_type'):
        transformers[metric] = self.count_objects_by_tags
    self.METRIC_TRANSFORMERS = transformers

    # Handling cron jobs succeeded/failed counts
    self.failed_cron_job_counts = defaultdict(KubernetesState.CronJobCount)
    self.succeeded_cron_job_counts = defaultdict(KubernetesState.CronJobCount)

    # Logic for Jobs
    self.job_succeeded_count = defaultdict(int)
    self.job_failed_count = defaultdict(int)
def _get_config(self, instance):
    """Extract the connection settings, tags and custom queries from ``instance``.

    Unless `use_global_custom_queries` is disabled, the
    `global_custom_queries` from ``init_config`` are appended after the
    instance's own queries.

    Returns:
        A tuple ``(server, user, password, service, jdbc_driver, tags,
        custom_queries)``.
    """
    server = instance.get('server')
    user = instance.get('user')
    password = instance.get('password')
    service = instance.get('service_name')
    jdbc_driver = instance.get('jdbc_driver_path')
    tags = instance.get('tags') or []

    # Build a fresh list: extending the list stored in `instance` would add
    # the global queries again on every call. `or []` also tolerates an
    # explicit null in the configuration.
    custom_queries = list(instance.get('custom_queries') or [])
    if is_affirmative(instance.get('use_global_custom_queries', True)):
        custom_queries.extend(self.init_config.get('global_custom_queries') or [])

    return server, user, password, service, jdbc_driver, tags, custom_queries
def check(self, instance):
    """Process the configured status and/or ping URL of ``instance``.

    Raises ``BadConfigError`` when neither `status_url` nor `ping_url` is
    set. An error while processing the status page is logged and does not
    prevent the ping from being processed.
    """
    status_url = instance.get('status_url')
    ping_url = instance.get('ping_url')
    use_fastcgi = is_affirmative(instance.get('use_fastcgi', False))
    ping_reply = instance.get('ping_reply')
    user = instance.get('user')
    password = instance.get('password')
    tags = instance.get('tags', [])
    http_host = instance.get('http_host')
    timeout = instance.get('timeout', DEFAULT_TIMEOUT)
    disable_ssl_validation = is_affirmative(instance.get('disable_ssl_validation', False))

    # Credentials are only used when both parts are provided.
    auth = (user, password) if user and password else None

    has_status = status_url is not None
    has_ping = ping_url is not None
    if not has_status and not has_ping:
        raise BadConfigError("No status_url or ping_url specified for this instance")

    pool = None
    if has_status:
        try:
            pool = self._process_status(
                status_url, auth, tags, http_host, timeout, disable_ssl_validation, use_fastcgi
            )
        except Exception as e:
            self.log.error("Error running php_fpm check: {}".format(e))

    if has_ping:
        self._process_ping(
            ping_url, ping_reply, auth, tags, pool, http_host, timeout, disable_ssl_validation, use_fastcgi
        )
def _get_request_url(self, instance, url): ''' Get the request address, build with proxy if necessary ''' parsed = urlparse(url) _url = url if not (parsed.netloc and parsed.scheme) and \ is_affirmative(instance.get('spark_proxy_enabled', False)): master_address = self._get_master_address(instance) _url = urljoin(master_address, parsed.path) self.log.debug('Request URL returned: %s', _url) return _url
def _cache_morlist_raw(self, instance):
    """
    Discover the objects of ``instance`` and fill the Mor objects queue.

    Does nothing while the queue for this instance still holds objects of any
    resource type. Otherwise collects all objects, stores them in the queue
    and records the refresh time.
    """
    i_key = self._instance_key(instance)
    self.log.debug("Caching the morlist for vcenter instance {}".format(i_key))

    # Bail out as soon as one resource type still has queued objects.
    queue = self.mor_objects_queue
    for resource_type in RESOURCE_TYPE_METRICS:
        if not (queue.contains(i_key) and queue.size(i_key, resource_type)):
            continue
        last = self.cache_config.get_last(CacheConfig.Morlist, i_key)
        self.log.debug(
            "Skipping morlist collection: the objects queue for the "
            "resource type '{}' is still being processed "
            "(latest refresh was {}s ago)".format(ensure_unicode(resource_type), time.time() - last)
        )
        return

    server_tag = "vcenter_server:{}".format(ensure_unicode(instance.get('name')))
    tags = [server_tag]
    regexes = {
        'host_include': instance.get('host_include_only_regex'),
        'vm_include': instance.get('vm_include_only_regex'),
    }
    include_only_marked = is_affirmative(instance.get('include_only_marked', False))

    # Discover hosts and virtual machines
    server_instance = self._get_server_instance(instance)
    use_guest_hostname = is_affirmative(instance.get("use_guest_hostname", False))
    discovered = self._get_all_objs(
        server_instance, regexes, include_only_marked, tags, use_guest_hostname=use_guest_hostname
    )
    queue.fill(i_key, dict(discovered))

    self.cache_config.set_last(CacheConfig.Morlist, i_key, time.time())
def from_config(cls, init_config, instance_config, proxy_config=None):
    """Authenticate and build the scope holding one entry per accessible project.

    After obtaining the user's auth token, every project returned by Keystone
    gets its own project-scoped token, from which the Nova and Neutron
    endpoints are resolved. Projects without a name or an id are skipped.

    Returns:
        ``cls(auth_token, project_scope_map)`` where the map is keyed by
        ``(project_name, project_id)``.

    Raises:
        IncompleteConfig: if `keystone_server_url` is not configured.
    """
    keystone_server_url = init_config.get("keystone_server_url")
    if not keystone_server_url:
        raise IncompleteConfig()

    ssl_verify = is_affirmative(init_config.get("ssl_verify", True))

    auth_token = cls._get_auth_response_from_config(init_config, instance_config, proxy_config)

    # list all projects
    projects = KeystoneApi.get_auth_projects(auth_token, keystone_server_url, ssl_verify, proxy_config)

    # for each project, we create an OpenStackProject object that we add to the `project_scope_map` dict
    project_scope_map = {}
    for project in projects:
        project_name = project.get('name')
        project_id = project.get('id')
        if project_name is None or project_id is None:
            # Skip only this entry (and do so before requesting a token for
            # it): one incomplete project must not drop all the following
            # ones.
            continue

        identity = {"methods": ['token'], "token": {"id": auth_token}}
        scope = {'project': {'id': project_id}}
        token_resp = KeystoneApi.post_auth_token(
            keystone_server_url, identity, ssl_verify, scope=scope, proxy=proxy_config
        )
        project_auth_token = token_resp.headers.get('X-Subject-Token')
        nova_endpoint = cls._get_nova_endpoint(token_resp.json())
        neutron_endpoint = cls._get_neutron_endpoint(token_resp.json())

        domain_id = project.get('domain_id')
        project_auth_scope = {
            'project': {
                'name': project_name,
                'id': project_id,
                'domain': {} if domain_id is None else {'id': domain_id},
            }
        }

        project_scope_map[(project_name, project_id)] = OpenStackProject(
            project_auth_token, project_auth_scope, nova_endpoint, neutron_endpoint
        )

    return cls(auth_token, project_scope_map)
def create(logger, proxies, instance_config):
    """Build a ``SimpleApi`` from ``instance_config`` and connect it as `user`."""
    read = instance_config.get

    client = SimpleApi(
        logger,
        read("keystone_server_url"),
        timeout=read('request_timeout'),
        ssl_verify=is_affirmative(read("ssl_verify", True)),
        proxies=proxies,
        limit=read('paginated_limit')
    )
    client.connect(read("user"))
    return client
def _process_mor_objects_queue(self, instance):
    """
    Pops `batch_morlist_size` items from the mor objects queue and run asynchronously
    the _process_mor_objects_queue_async method to fill the Mor cache.

    Every popped Mor is written to the Mor cache. Only Mors whose type is not
    in REALTIME_RESOURCES are scheduled for asynchronous processing, and only
    when `collect_realtime_only` is disabled for the instance.
    """
    i_key = self._instance_key(instance)
    self.mor_cache.init_instance(i_key)

    if not self.mor_objects_queue.contains(i_key):
        # A text literal is required here: bytes objects have no `.format()`
        # on Python 3, so the former `b"..."` literal raised AttributeError.
        self.log.debug(
            "Objects queue is not initialized yet for instance {}, skipping processing".format(i_key)
        )
        return

    # The option does not change while the queue is drained; read it once.
    realtime_only = is_affirmative(instance.get("collect_realtime_only", True))

    for resource_type in RESOURCE_TYPE_METRICS:
        # Batch size can prevent querying large payloads at once if the environment is too large
        # If batch size is set to 0, process everything at once
        batch_size = self.batch_morlist_size or self.mor_objects_queue.size(i_key, resource_type)
        while self.mor_objects_queue.size(i_key, resource_type):
            mors = []
            for _ in xrange(batch_size):
                mor = self.mor_objects_queue.pop(i_key, resource_type)
                if mor is None:
                    self.log.debug("No more objects of type '{}' left in the queue".format(resource_type))
                    break

                mor_name = str(mor['mor'])
                mor['interval'] = REAL_TIME_INTERVAL if mor['mor_type'] in REALTIME_RESOURCES else None
                # Always update the cache to account for Mors that might have changed parent
                # in the meantime (e.g. a migrated VM).
                self.mor_cache.set_mor(i_key, mor_name, mor)

                # Only do this for non real-time resources i.e. datacenter, datastore and cluster
                # For hosts and VMs, we can rely on a precomputed list of metrics
                if mor["mor_type"] not in REALTIME_RESOURCES and not realtime_only:
                    mors.append(mor)

            # We will actually schedule jobs for non realtime resources only.
            if mors:
                self.pool.apply_async(self._process_mor_objects_queue_async, args=(instance, mors))
def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialize the check state from ``init_config``.

    Raises ``IncompleteConfig`` when `keystone_server_url` is not set.
    """
    super(OpenStackControllerCheck, self).__init__(name, init_config, agentConfig, instances)

    keystone_url = init_config.get("keystone_server_url")
    self.keystone_server_url = keystone_url
    if not keystone_url:
        raise IncompleteConfig()

    def compile_unique(option):
        # De-duplicate the configured patterns before compiling them.
        return [re.compile(pattern) for pattern in set(init_config.get(option, []))]

    self.proxy_config = self.get_instance_proxy(init_config, keystone_url)
    self.ssl_verify = is_affirmative(init_config.get("ssl_verify", True))

    # Falsy values (missing, None, 0) fall back to the module defaults.
    self.paginated_server_limit = init_config.get('paginated_server_limit') or DEFAULT_PAGINATED_SERVER_LIMIT
    self.request_timeout = init_config.get('request_timeout') or DEFAULT_API_REQUEST_TIMEOUT

    self.exclude_network_id_rules = compile_unique('exclude_network_ids')
    self.exclude_server_id_rules = compile_unique('exclude_server_ids')
    self.include_project_name_rules = compile_unique('whitelist_project_names')
    self.exclude_project_name_rules = compile_unique('blacklist_project_names')

    # API clients start unset.
    self._keystone_api = None
    self._compute_api = None
    self._neutron_api = None

    self._backoff = BackOffRetry()

    # Mapping of check instances to associated OpenStackScope
    self.instance_scopes_cache = {}
    # Current instance and project authentication scopes
    self.instance_scope = None

    # Cache some things between runs for values that change rarely
    self._aggregate_list = None

    # Mapping of Nova-managed servers to tags
    self.external_host_tags = {}

    # ISO8601 date time: used to filter the call to get the list of nova servers
    self.changes_since_time = {}

    # Ex: server_details_by_id = {
    #   UUID: {UUID: <value>, etc}
    #   1: {id: 1, name: hostA},
    #   2: {id: 2, name: hostB}
    # }
    self.server_details_by_id = {}