def check(self, instance):
    """Run one vSphere check cycle for ``instance``.

    Starts the worker pool, refreshes the metadata/MOR caches when due,
    collects metrics and events, publishes external host tags, then stops
    the pool. Any exception tears the pool down before propagating.
    """
    try:
        self.start_pool()
        # Reset the per-run crash counter; worker threads presumably bump it
        # when they print an exception — TODO confirm against the pool code.
        self.exception_printed = 0
        # First part: make sure our object repository is neat & clean
        if self._should_cache(instance, CacheConfig.Metadata):
            self._cache_metrics_metadata(instance)
        if self._should_cache(instance, CacheConfig.Morlist):
            self._cache_morlist_raw(instance)
        self._process_mor_objects_queue(instance)
        # Remove old objects that might be gone from the Mor cache
        self.mor_cache.purge(self._instance_key(instance), self.clean_morlist_interval)
        # Second part: do the job
        self.collect_metrics(instance)
        self._query_event(instance)
        # set_external_tags is None when the agent binding is unavailable
        if set_external_tags is not None:
            set_external_tags(self.get_external_host_tags())
        self.stop_pool()
        if self.exception_printed > 0:
            self.log.error("One thread in the pool crashed, check the logs")
    except Exception:
        # Hard-stop the pool so worker threads don't outlive a failed run.
        self.terminate_pool()
        raise
def check(self, instance):
    """Run one vSphere check cycle against a persistent worker pool.

    Only schedules new cache/metric jobs when the previous run's queue has
    drained; always drains events, re-raises if any worker crashed, and
    publishes external host tags.
    """
    if not self.pool_started:
        self.start_pool()
    custom_tags = instance.get('tags', [])
    # ## <TEST-INSTRUMENTATION>
    self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:initial'] + custom_tags)
    # ## </TEST-INSTRUMENTATION>

    # Only schedule more jobs on the queue if the jobs from the previous check runs are finished
    # It's no good to keep piling up jobs
    if self.pool._workq.qsize() == 0:
        # First part: make sure our object repository is neat & clean
        if self._should_cache(instance, CacheConfig.Metadata):
            self._cache_metrics_metadata(instance)
        if self._should_cache(instance, CacheConfig.Morlist):
            self._cache_morlist_raw(instance)
        self._process_mor_objects_queue(instance)
        # Remove old objects that might be gone from the Mor cache
        self.mor_cache.purge(self._instance_key(instance), self.clean_morlist_interval)
        # Second part: do the job
        self.collect_metrics(instance)
    else:
        self.log.debug("Thread pool is still processing jobs from previous run. Not scheduling anything.")

    self._query_event(instance)

    # Drain the worker-thread exception queue; get_nowait raises Empty when done.
    thread_crashed = False
    try:
        while True:
            self.log.error(self.exceptionq.get_nowait())
            thread_crashed = True
    except Empty:
        pass
    if thread_crashed:
        self.stop_pool()
        raise Exception("One thread in the pool crashed, check the logs")

    # set_external_tags is None when the agent binding is unavailable
    if set_external_tags is not None:
        set_external_tags(self.get_external_host_tags())

    # ## <TEST-INSTRUMENTATION>
    self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:final'] + custom_tags)
def check(self, instance):
    """Run one vSphere check cycle (legacy cache-constant variant).

    Refreshes metadata/MOR caches when due, collects metrics and events,
    re-raises if any worker thread crashed, and publishes external tags.
    """
    if not self.pool_started:
        self.start_pool()

    custom_tags = instance.get('tags', [])

    # ## <TEST-INSTRUMENTATION>
    self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:initial'] + custom_tags)
    # ## </TEST-INSTRUMENTATION>

    # First part: make sure our object repository is neat & clean
    if self._should_cache(instance, METRICS_METADATA):
        self.log.debug("Caching metrics metadata for instance %s", self._instance_key(instance))
        self._cache_metrics_metadata(instance)

    if self._should_cache(instance, MORLIST):
        self.log.debug("Caching MOR list for instance %s", self._instance_key(instance))
        self._cache_morlist_raw(instance)
    self._cache_morlist_process(instance)
    self._vacuum_morlist(instance)

    # Second part: do the job
    self.collect_metrics(instance)
    self._query_event(instance)

    # For our own sanity
    self._clean()

    # Drain the worker-thread exception queue; get_nowait raises Empty when done.
    thread_crashed = False
    try:
        while True:
            self.log.critical(self.exceptionq.get_nowait())
            thread_crashed = True
    except Empty:
        pass

    if thread_crashed:
        self.stop_pool()
        raise Exception("One thread in the pool crashed, check the logs")

    # set_external_tags is None when the agent binding is unavailable
    if set_external_tags is not None:
        set_external_tags(self.get_external_host_tags())

    # ## <TEST-INSTRUMENTATION>
    self.gauge('datadog.agent.vsphere.queue_size', self.pool._workq.qsize(), tags=['instant:final'] + custom_tags)
def set_external_tags(self, external_tags):
    """Normalize tag values in-place, then submit external host tags.

    Expected ``external_tags`` format::

        [
            ('hostname', {'src_name': ['test:t1']}),
            ('hostname2', {'src2_name': ['test2:t3']})
        ]

    Re-raises on malformed input after logging the offending payload.
    """
    try:
        for _, source_map in external_tags:
            for src_name, tags in iteritems(source_map):
                source_map[src_name] = self._normalize_tags_type(tags)
        datadog_agent.set_external_tags(external_tags)
    # A malformed entry fails tuple unpacking with ValueError, not
    # IndexError — catch both so the payload is always logged before
    # the exception propagates.
    except (IndexError, ValueError):
        self.log.exception('Unexpected external tags format: %s', external_tags)
        raise
def set_external_tags(self, external_tags):
    """Normalize hostnames and tag values, then submit external host tags.

    Expected ``external_tags`` format::

        [
            ('hostname', {'src_name': ['test:t1']}),
            ('hostname2', {'src2_name': ['test2:t3']})
        ]
    """
    try:
        converted = []
        for hostname, source_map in external_tags:
            # Normalize each source's tag list in place; the tuple below
            # shares the same dict object, so it sees the normalized values.
            for source in list(source_map):
                source_map[source] = self._normalize_tags_type(source_map[source])
            converted.append((to_string(hostname), source_map))
        datadog_agent.set_external_tags(converted)
    except IndexError:
        self.log.exception('Unexpected external tags format: %s', external_tags)
        raise
def check(self, instance):
    """Run one OpenStack check cycle scoped to this host's hypervisor.

    Authenticates, then collects per-server, hypervisor, project and
    network stats, restricted to the servers managed by the local
    hypervisor. Incomplete configuration is reported as warnings rather
    than raised.
    """
    custom_tags = instance.get("tags", [])
    if custom_tags is None:
        custom_tags = []
    try:
        instance_scope = self.ensure_auth_scope(instance)

        if not instance_scope:
            # Fast fail in the absence of an instance_scope
            return

        self._send_api_service_checks(instance_scope, custom_tags)
        # Store the scope on the object so we don't have to keep passing it around
        self._current_scope = instance_scope

        collect_all_projects = instance.get("collect_all_projects", False)

        self.log.debug("Running check with credentials: \n")
        self.log.debug("Nova Url: %s", self.get_nova_endpoint())
        self.log.debug("Neutron Url: %s", self.get_neutron_endpoint())

        # Restrict monitoring to this (host, hypervisor, project)
        # and it's guest servers
        hyp = self.get_local_hypervisor()
        project = self.get_scoped_project(instance)
        projects = []

        if collect_all_projects:
            projects = self.get_all_projects(instance)
        else:
            projects.append(project)

        # Restrict monitoring to non-excluded servers
        server_ids = self.get_servers_managed_by_hypervisor()

        host_tags = self._get_tags_for_host()

        for sid in server_ids:
            server_tags = ["nova_managed_server"]
            if instance_scope.tenant_id:
                server_tags.append("tenant_id:%s" % instance_scope.tenant_id)
            if project and 'name' in project:
                server_tags.append('project_name:{0}'.format(project['name']))

            self.external_host_tags[sid] = host_tags
            self.get_stats_for_single_server(sid, tags=server_tags + custom_tags)

        if hyp:
            self.get_stats_for_single_hypervisor(hyp, host_tags=host_tags, custom_tags=custom_tags)
        else:
            self.warning("Couldn't get hypervisor to monitor for host: %s" % self.get_my_hostname())

        if projects and project:
            # Ensure projects list and scoped project exists
            self.get_stats_for_all_projects(projects, custom_tags)

        # For now, monitor all networks
        self.get_network_stats(custom_tags)

        # set_external_tags is None when the agent binding is unavailable
        if set_external_tags is not None:
            set_external_tags(self.get_external_host_tags())

    except IncompleteConfig as e:
        if isinstance(e, IncompleteAuthScope):
            self.warning("""Please specify the auth scope via the `auth_scope` variable in your init_config.\n
                         The auth_scope should look like: \n
                         {'project': {'name': 'my_project', 'domain': {'id': 'my_domain_id'}}}\n
                         OR\n
                         {'project': {'id': 'my_project_id'}}
                         """)
        elif isinstance(e, IncompleteIdentity):
            self.warning("Please specify the user via the `user` variable in your init_config.\n" +
                         "This is the user you would use to authenticate with Keystone v3 via password auth.\n" +
                         "The user should look like: {'password': '******', 'name': 'my_name', 'domain': {'id': 'my_domain_id'}}")
        else:
            self.warning("Configuration Incomplete! Check your openstack.yaml file")
def check(self, instance):
    """Run one OpenStack check cycle for ``instance``.

    Initializes the per-instance API, then collects project limits,
    hypervisor, server diagnostic/flavor and network metrics according to
    the ``collect_*`` instance flags. Applies exponential backoff on 5xx,
    timeout and connection errors; resets backoff on success.
    """
    # Initialize global variable that are per instances
    self.external_host_tags = {}
    self.instance_name = instance.get('name')
    if not self.instance_name:
        # We need a instance_name to identify this instance
        raise IncompleteConfig()

    # have we been backed off
    if not self._backoff.should_run():
        self.log.info('Skipping run due to exponential backoff in effect')
        return

    # Fetch instance configs
    keystone_server_url = instance.get("keystone_server_url")
    if not keystone_server_url:
        raise IncompleteConfig()

    network_ids = instance.get('network_ids', [])
    exclude_network_id_patterns = set(instance.get('exclude_network_ids', []))
    exclude_network_id_rules = [re.compile(ex) for ex in exclude_network_id_patterns]
    exclude_server_id_patterns = set(instance.get('exclude_server_ids', []))
    exclude_server_id_rules = [re.compile(ex) for ex in exclude_server_id_patterns]
    include_project_name_patterns = set(instance.get('whitelist_project_names', []))
    include_project_name_rules = [re.compile(ex) for ex in include_project_name_patterns]
    exclude_project_name_patterns = set(instance.get('blacklist_project_names', []))
    exclude_project_name_rules = [re.compile(ex) for ex in exclude_project_name_patterns]

    custom_tags = instance.get("tags", [])
    collect_project_metrics = is_affirmative(instance.get('collect_project_metrics', True))
    collect_hypervisor_metrics = is_affirmative(instance.get('collect_hypervisor_metrics', True))
    collect_hypervisor_load = is_affirmative(instance.get('collect_hypervisor_load', True))
    collect_network_metrics = is_affirmative(instance.get('collect_network_metrics', True))
    collect_server_diagnostic_metrics = is_affirmative(instance.get('collect_server_diagnostic_metrics', True))
    collect_server_flavor_metrics = is_affirmative(instance.get('collect_server_flavor_metrics', True))
    use_shortname = is_affirmative(instance.get('use_shortname', False))

    try:
        # Authenticate and add the instance api to apis cache
        self.init_api(instance, custom_tags)
        if self._api is None:
            # Fixed wording: was "Not api found, make sure you admin user..."
            self.log.info("No api found, make sure your admin user has access to your OpenStack projects: \n")
            return

        self.log.debug("Running check with credentials: \n")
        self._send_api_service_checks(keystone_server_url, custom_tags)

        # List projects and filter them
        # TODO: NOTE: During authentication we use /v3/auth/projects and here we use /v3/projects.
        # TODO: These api don't seems to return the same thing however the latter contains the former.
        # TODO: Is this expected or could we just have one call with proper config?
        projects = self.get_projects(include_project_name_rules, exclude_project_name_rules)

        if collect_project_metrics:
            # Only the project dicts are needed here, not the names.
            for project in itervalues(projects):
                self.collect_project_limit(project, custom_tags)

        servers = self.populate_servers_cache(projects, exclude_server_id_rules)

        self.collect_hypervisors_metrics(servers,
                                         custom_tags=custom_tags,
                                         use_shortname=use_shortname,
                                         collect_hypervisor_metrics=collect_hypervisor_metrics,
                                         collect_hypervisor_load=collect_hypervisor_load)

        if collect_server_diagnostic_metrics or collect_server_flavor_metrics:
            if collect_server_diagnostic_metrics:
                # Lazy %-args: avoid formatting when debug logging is disabled.
                self.log.debug("Fetch stats from %s server(s)", len(servers))
                for server in itervalues(servers):
                    self.collect_server_diagnostic_metrics(server, tags=custom_tags, use_shortname=use_shortname)
            if collect_server_flavor_metrics:
                if len(servers) >= 1 and 'flavor_id' in next(itervalues(servers)):
                    self.log.debug("Fetch server flavors")
                    # If flavors are not part of servers detail (new in version 2.47) then we need to fetch them
                    flavors = self.get_flavors()
                else:
                    flavors = None
                for server in itervalues(servers):
                    self.collect_server_flavor_metrics(server, flavors, tags=custom_tags, use_shortname=use_shortname)

        if collect_network_metrics:
            self.collect_networks_metrics(custom_tags, network_ids, exclude_network_id_rules)

        # set_external_tags is None when the agent binding is unavailable
        if set_external_tags is not None:
            set_external_tags(self.get_external_host_tags())

    except IncompleteConfig as e:
        if isinstance(e, IncompleteIdentity):
            self.warning("Please specify the user via the `user` variable in your init_config.\n" +
                         "This is the user you would use to authenticate with Keystone v3 via password auth.\n" +
                         "The user should look like:" +
                         "{'password': '******', 'name': 'my_name', 'domain': {'id': 'my_domain_id'}}")
        else:
            self.warning("Configuration Incomplete! Check your openstack.yaml file")
    except AuthenticationNeeded:
        # Delete the scope, we'll populate a new one on the next run for this instance
        self.delete_api_cache()
    except (requests.exceptions.HTTPError, requests.exceptions.Timeout,
            requests.exceptions.ConnectionError) as e:
        # Client-side (4xx) errors are configuration problems, not transient.
        if isinstance(e, requests.exceptions.HTTPError) and e.response.status_code < 500:
            self.warning("Error reaching nova API: %s" % e)
        else:
            # exponential backoff
            self.do_backoff(custom_tags)
            return

    self._backoff.reset_backoff()
def _collect_phase(self, label, collect_fn, api, service_check_tags):
    """Run one ACI collection phase (``collect_fn``).

    On any failure: log it, emit a CRITICAL service check with the
    phase label, close the API session, and re-raise.
    """
    try:
        collect_fn()
    except Exception as e:
        self.log.error('%s collection failed: %s', label, e)
        self.service_check(
            SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
            message="aci {} operations failed, returning a status of {}".format(label, e),
            tags=service_check_tags)
        api.close()
        raise

def check(self, instance):
    """Run one Cisco ACI check cycle for ``instance``.

    Logs in (caching the Api per instance hash), collects tenant, fabric
    and capacity data, then emits an OK service check and publishes
    external host tags. Any phase failure emits CRITICAL and re-raises.
    """
    self.log.info("Starting Cisco Check")
    start = datetime.datetime.now()
    aci_url = instance.get('aci_url')
    aci_urls = instance.get('aci_urls', [])
    if aci_url:
        aci_urls.append(aci_url)

    if len(aci_urls) == 0:
        raise Exception("The Cisco ACI check requires at least one url")

    username = instance['username']
    pwd = instance['pwd']
    instance_hash = hash_mutable(instance)

    timeout = instance.get('timeout', 15)
    ssl_verify = _is_affirmative(instance.get('ssl_verify', True))

    # Reuse the Api session across runs for the same instance config.
    if instance_hash in self._api_cache:
        api = self._api_cache.get(instance_hash)
    else:
        api = Api(aci_urls, username, pwd, verify=ssl_verify, timeout=timeout, log=self.log)
        self._api_cache[instance_hash] = api

    service_check_tags = []
    for url in aci_urls:
        service_check_tags.append("url:{}".format(url))
    service_check_tags.extend(self.check_tags)
    service_check_tags.extend(instance.get('tags', []))

    try:
        api.login()
    except Exception as e:
        self.log.error("Cannot login to the Cisco ACI: %s", e)
        self.service_check(
            SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
            message="aci login returned a status of {}".format(e),
            tags=service_check_tags)
        raise

    self.tagger.api = api

    # The three phases share identical failure handling; lambdas defer
    # construction so constructor errors get the same treatment as
    # collect() errors (as in the original inline try blocks).
    self._collect_phase(
        'tenant',
        lambda: Tenant(self, api, instance, instance_hash).collect(),
        api, service_check_tags)
    self._collect_phase(
        'fabric',
        lambda: Fabric(self, api, instance).collect(),
        api, service_check_tags)
    self._collect_phase(
        'capacity',
        lambda: Capacity(api, instance, check_tags=self.check_tags,
                         gauge=self.gauge, log=self.log).collect(),
        api, service_check_tags)

    self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)

    # set_external_tags is None/falsy when the agent binding is unavailable
    if set_external_tags:
        set_external_tags(self.get_external_host_tags())

    api.close()
    end = datetime.datetime.now()
    log_line = "finished running Cisco Check"
    if _is_affirmative(instance.get('report_timing', False)):
        log_line += ", took {}".format(end - start)
    self.log.info(log_line)
def test():
    # Smoke test: push the module-level `tags` fixture through the agent
    # binding. Both `datadog_agent` and `tags` are defined outside this block.
    datadog_agent.set_external_tags(tags)
def check(self, instance):
    """Run one OpenStack check cycle over every auth scope of ``instance``.

    Collects per-project limits (with whitelist/blacklist filtering),
    hypervisor stats, per-server stats from a deep-copied server cache,
    and network stats. Applies exponential backoff on 5xx/timeout/
    connection errors; resets backoff on success.
    """
    # have we been backed off
    if not self.backoff.should_run(instance):
        self.log.info('Skipping run due to exponential backoff in effect')
        return

    projects = {}
    custom_tags = instance.get("tags", [])
    collect_limits_from_all_projects = is_affirmative(instance.get('collect_limits_from_all_projects', True))
    collect_hypervisor_load = is_affirmative(instance.get('collect_hypervisor_load', False))
    use_shortname = is_affirmative(instance.get('use_shortname', False))

    try:
        scope_map = self.get_scope_map(instance)
        for _, scope in iteritems(scope_map):
            # Store the scope on the object so we don't have to keep passing it around
            self._current_scope = scope

            self._send_api_service_checks(scope, custom_tags)

            self.log.debug("Running check with credentials: \n")
            self.log.debug("Nova Url: %s", self.get_nova_endpoint())
            self.log.debug("Neutron Url: %s", self.get_neutron_endpoint())

            project = self.get_scoped_project(scope)
            if project and project.get('name'):
                projects[project.get('name')] = project

        i_key = get_instance_key(instance)

        if collect_limits_from_all_projects:
            # NOTE(review): uses the last `scope` from the loop above —
            # presumably any scope can list all projects; verify upstream.
            scope_projects = self.get_all_projects(scope)
            if scope_projects:
                for proj in scope_projects:
                    projects[proj['name']] = proj

        proj_filtered = pattern_filter([p for p in projects],
                                       whitelist=self.include_project_name_rules,
                                       blacklist=self.exclude_project_name_rules)

        projects = \
            {name: v for (name, v) in iteritems(projects) if name in proj_filtered}

        for name, project in iteritems(projects):
            self.get_stats_for_single_project(project, custom_tags)

        self.get_stats_for_all_hypervisors(instance,
                                           custom_tags=custom_tags,
                                           use_shortname=use_shortname,
                                           collect_hypervisor_load=collect_hypervisor_load)

        # This updates the server cache directly
        self.get_all_servers(i_key)
        self.filter_excluded_servers()

        # Deep copy the cache so we can remove things from the Original during the iteration
        # Allows us to remove bad servers from the cache if needbe
        server_cache_copy = copy.deepcopy(self.server_details_by_id)

        for server in server_cache_copy:
            server_tags = copy.deepcopy(custom_tags)
            server_tags.append("nova_managed_server")
            self.get_stats_for_single_server(server_cache_copy[server], tags=server_tags, use_shortname=use_shortname)

        # For now, monitor all networks
        self.get_network_stats(custom_tags)

        # set_external_tags is None when the agent binding is unavailable
        if set_external_tags is not None:
            set_external_tags(self.get_external_host_tags())

    except IncompleteConfig as e:
        if isinstance(e, IncompleteIdentity):
            self.warning("Please specify the user via the `user` variable in your init_config.\n" +
                         "This is the user you would use to authenticate with Keystone v3 via password auth.\n" +
                         "The user should look like:" +
                         "{'password': '******', 'name': 'my_name', 'domain': {'id': 'my_domain_id'}}")
        else:
            self.warning("Configuration Incomplete! Check your openstack.yaml file")
    except requests.exceptions.HTTPError as e:
        if e.response.status_code >= 500:
            # exponential backoff
            self.backoff.do_backoff(instance)
            self.warning("There were some problems reaching the nova API - applying exponential backoff")
        else:
            self.warning("Error reaching nova API")

        return
    except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
        # exponential backoff
        self.backoff.do_backoff(instance)
        self.warning("There were some problems reaching the nova API - applying exponential backoff")
        return

    self.backoff.reset_backoff(instance)
def check(self, instance):
    """Run one OpenStack check cycle using a single project scope.

    Authenticates, builds Neutron/Nova API clients for the first project
    scope, then collects project limits, hypervisor, server diagnostic/
    flavor and network metrics per the ``collect_*`` flags. Applies
    exponential backoff on 5xx/timeout/connection errors.
    """
    # have we been backed off
    if not self._backoff.should_run(instance):
        self.log.info('Skipping run due to exponential backoff in effect')
        return

    custom_tags = instance.get("tags", [])
    collect_project_metrics = is_affirmative(instance.get('collect_project_metrics', True))
    collect_hypervisor_metrics = is_affirmative(instance.get('collect_hypervisor_metrics', True))
    collect_hypervisor_load = is_affirmative(instance.get('collect_hypervisor_load', True))
    collect_network_metrics = is_affirmative(instance.get('collect_network_metrics', True))
    collect_server_diagnostic_metrics = is_affirmative(instance.get('collect_server_diagnostic_metrics', True))
    collect_server_flavor_metrics = is_affirmative(instance.get('collect_server_flavor_metrics', True))
    use_shortname = is_affirmative(instance.get('use_shortname', False))

    try:
        instance_name = get_instance_name(instance)

        # Authenticate and add the instance scope to instance_scopes cache
        self.init_instance_scope_cache(instance)

        # Init instance_scope
        self.instance_scope = self.get_instance_scope(instance)
        project_scopes = self.get_project_scopes(instance)

        # TODO: The way we fetch projects will be changed in another PR.
        # Having this for loop result may result (depending on how permission arr set) on duplicate metrics.
        # This is a temporary hack, instead we will just pop the first element
        # for _, project_scope in iteritems(project_scopes):
        _, project_scope = project_scopes.popitem()

        if not project_scope:
            self.log.info("Not project found, make sure you admin user has access to your OpenStack projects: \n")
            return

        self.log.debug("Running check with credentials: \n")
        self.log.debug("Nova Url: %s", project_scope.nova_endpoint)
        self.log.debug("Neutron Url: %s", project_scope.neutron_endpoint)

        self._neutron_api = NeutronApi(self.log, self.ssl_verify, self.proxy_config,
                                       project_scope.neutron_endpoint, project_scope.auth_token)
        self._compute_api = ComputeApi(self.log, self.ssl_verify, self.proxy_config,
                                       project_scope.nova_endpoint, project_scope.auth_token)

        self._send_api_service_checks(project_scope, custom_tags)

        # List projects and filter them
        # TODO: NOTE: During authentication we use /v3/auth/projects and here we use /v3/projects.
        # TODO: These api don't seems to return the same thing however the latter contains the former.
        # TODO: Is this expected or could we just have one call with proper config?
        projects = self.get_projects(project_scope.auth_token,
                                     self.include_project_name_rules,
                                     self.exclude_project_name_rules)

        if collect_project_metrics:
            for name, project in iteritems(projects):
                self.collect_project_limit(project, custom_tags)

        self.collect_hypervisors_metrics(custom_tags=custom_tags,
                                         use_shortname=use_shortname,
                                         collect_hypervisor_metrics=collect_hypervisor_metrics,
                                         collect_hypervisor_load=collect_hypervisor_load)

        if collect_server_diagnostic_metrics or collect_server_flavor_metrics:
            # This updates the server cache directly
            tenant_id_to_name = {}
            for name, p in iteritems(projects):
                tenant_id_to_name[p.get('id')] = name
            self.get_all_servers(tenant_id_to_name, instance_name)

            servers = self.servers_cache[instance_name]['servers']
            if collect_server_diagnostic_metrics:
                self.log.debug("Fetch stats from %s server(s)" % len(servers))
                for _, server in iteritems(servers):
                    self.collect_server_diagnostic_metrics(server, tags=custom_tags, use_shortname=use_shortname)
            if collect_server_flavor_metrics:
                if len(servers) >= 1 and 'flavor_id' in next(itervalues(servers)):
                    self.log.debug("Fetch server flavors")
                    # If flavors are not part of servers detail (new in version 2.47) then we need to fetch them
                    flavors = self.get_flavors()
                else:
                    flavors = None
                for _, server in iteritems(servers):
                    self.collect_server_flavor_metrics(server, flavors, tags=custom_tags, use_shortname=use_shortname)

        if collect_network_metrics:
            self.collect_networks_metrics(custom_tags)

        # set_external_tags is None when the agent binding is unavailable
        if set_external_tags is not None:
            set_external_tags(self.get_external_host_tags())

    except IncompleteConfig as e:
        if isinstance(e, IncompleteIdentity):
            self.warning("Please specify the user via the `user` variable in your init_config.\n" +
                         "This is the user you would use to authenticate with Keystone v3 via password auth.\n" +
                         "The user should look like:" +
                         "{'password': '******', 'name': 'my_name', 'domain': {'id': 'my_domain_id'}}")
        else:
            self.warning("Configuration Incomplete! Check your openstack.yaml file")
    except AuthenticationNeeded:
        # Delete the scope, we'll populate a new one on the next run for this instance
        self.delete_instance_scope()
    except (requests.exceptions.HTTPError, requests.exceptions.Timeout,
            requests.exceptions.ConnectionError) as e:
        # Client-side (4xx) errors are configuration problems, not transient.
        if isinstance(e, requests.exceptions.HTTPError) and e.response.status_code < 500:
            self.warning("Error reaching nova API: %s" % e)
        else:
            # exponential backoff
            self.do_backoff(instance)
            return

    self._backoff.reset_backoff(instance)