def test_query_api_with_retries_other_error_rate_limited(self):
    """API errors other than 'not found' are rate limited: with the multiply
    strategy, each failed call halves the cluster rate (100 -> 50 -> 25).
    """
    with mock.patch.object(KubernetesApi, "query_api") as mock_query:
        mock_query.side_effect = K8sApiPermanentError("Permanent Error")

        k8s = KubernetesApi()
        rate_limiter = BlockingRateLimiter(
            num_agents=1,
            initial_cluster_rate=100,
            max_cluster_rate=1000,
            min_cluster_rate=1,
            consecutive_success_threshold=1,
            strategy="multiply",
        )
        options = ApiQueryOptions(rate_limiter=rate_limiter, max_retries=0)

        # A permanent error counts as a failed call, so the rate halves.
        self.assertRaises(
            K8sApiPermanentError,
            lambda: k8s.query_api_with_retries("/foo/bar", options),
        )
        self.assertEqual(rate_limiter.current_cluster_rate, 50.0)

        # Any other exception is also treated as a failure and halves it again.
        mock_query.side_effect = Exception("Some other exception")
        self.assertRaises(
            Exception, lambda: k8s.query_api_with_retries("/foo/bar", options)
        )
        self.assertEqual(rate_limiter.current_cluster_rate, 25.0)

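# The rate expectations asserted above and in the tests below follow a
# multiplicative adjustment scheme: with strategy="multiply" and
# consecutive_success_threshold=1, each reported failure halves the cluster
# rate and each success doubles it, clamped to the configured bounds.  The
# toy class below is a hypothetical illustration of that arithmetic only;
# it is not the real BlockingRateLimiter, which additionally blocks callers
# to enforce the current rate.
class _ToyMultiplyRateLimiter(object):
    def __init__(self, initial_rate, min_rate, max_rate, factor=2.0):
        self.current_cluster_rate = float(initial_rate)
        self._min = min_rate
        self._max = max_rate
        self._factor = factor

    def report_outcome(self, success):
        # Success multiplies the allowed cluster rate, failure divides it,
        # clamped to [min_rate, max_rate].
        if success:
            self.current_cluster_rate = min(
                self._max, self.current_cluster_rate * self._factor
            )
        else:
            self.current_cluster_rate = max(
                self._min, self.current_cluster_rate / self._factor
            )

# e.g. starting at 100: one failure -> 50.0, a second failure -> 25.0,
# matching the assertions above; one success from 100 -> 200.0, matching
# the success and not-found tests below.
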
def test_lookup_already_in_cache(self):
    """Lookups for objects already in the cache are served from the cache,
    so a later permanent API error neither surfaces nor evicts the entry.
    """
    query_options = ApiQueryOptions()
    self.k8s.set_response(self.NAMESPACE_1, self.POD_1, success=True)
    obj = self.cache.lookup(
        self.k8s,
        self.clock.time(),
        self.NAMESPACE_1,
        self.POD_1,
        query_options=query_options,
    )
    self.assertTrue(
        self.cache.is_cached(self.NAMESPACE_1, self.POD_1, allow_expired=True)
    )

    # Even with the API now configured to fail, the cached entry is returned.
    self.k8s.set_response(self.NAMESPACE_1, self.POD_1, permanent_error=True)
    obj = self.cache.lookup(
        self.k8s,
        self.clock.time(),
        self.NAMESPACE_1,
        self.POD_1,
        query_options=query_options,
    )
    self.assertTrue(
        self.cache.is_cached(self.NAMESPACE_1, self.POD_1, allow_expired=True)
    )
    self.assertEqual(obj.name, self.POD_1)
    self.assertEqual(obj.namespace, self.NAMESPACE_1)

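# A minimal sketch of the "check the cache before querying" behaviour the
# assertions above rely on.  The names here are hypothetical illustrations,
# not the actual cache implementation under test:
def _cached_lookup_sketch(cache, api, namespace, name, query_options):
    key = (namespace, name)
    if key in cache:
        # Cache hit: the API is never queried, so an API error configured
        # afterwards (as simulated above with permanent_error=True) can
        # neither surface nor evict the entry.
        return cache[key]
    # Cache miss: query the API, which may raise (e.g. K8sApiPermanentError,
    # as test_raise_exception_on_query_error below verifies), and store the
    # result for later lookups.
    obj = api.query_object(namespace, name, query_options)
    cache[key] = obj
    return obj
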
def test_query_api_with_retries_not_found_not_rate_limited(self):
    """'Not found' responses count as successful calls for rate-limiting
    purposes, so the cluster rate still doubles (100 -> 200).
    """
    with mock.patch.object(KubernetesApi, "query_api") as mock_query:
        mock_query.side_effect = K8sApiNotFoundException("/foo/bar")

        k8s = KubernetesApi()
        rate_limiter = BlockingRateLimiter(
            num_agents=1,
            initial_cluster_rate=100,
            max_cluster_rate=1000,
            min_cluster_rate=1,
            consecutive_success_threshold=1,
            strategy="multiply",
        )
        options = ApiQueryOptions(rate_limiter=rate_limiter)
        self.assertRaises(
            K8sApiNotFoundException,
            lambda: k8s.query_api_with_retries("/foo/bar", options),
        )
        self.assertEqual(rate_limiter.current_cluster_rate, 200.0)

def test_raise_exception_on_query_error(self):
    query_options = ApiQueryOptions()
    self.k8s.set_response(self.NAMESPACE_1, self.POD_1, permanent_error=True)
    self.assertRaises(
        K8sApiPermanentError,
        lambda: self.cache.lookup(
            self.k8s,
            self.clock.time(),
            self.NAMESPACE_1,
            self.POD_1,
            query_options=query_options,
        ),
    )

def test_query_api_with_retries_success_not_rate_limited(self):
    with mock.patch.object(KubernetesApi, "query_api") as mock_query:
        mock_query.return_value = {"success": "success"}

        k8s = KubernetesApi()
        rate_limiter = BlockingRateLimiter(
            num_agents=1,
            initial_cluster_rate=100,
            max_cluster_rate=1000,
            min_cluster_rate=1,
            consecutive_success_threshold=1,
            strategy="multiply",
        )
        options = ApiQueryOptions(rate_limiter=rate_limiter)
        result = k8s.query_api_with_retries("/foo/bar", options)
        self.assertEqual(result, {"success": "success"})
        self.assertEqual(rate_limiter.current_cluster_rate, 200.0)

def run(self):
    """Begins executing the monitor, writing metric output to logger."""
    if self.__disable_monitor:
        global_log.info(
            "kubernetes_events_monitor exiting because it has been disabled."
        )
        return

    try:
        # Accessing these config properties up front surfaces configuration
        # errors early, before any API instances are created.
        self._global_config.k8s_api_url
        self._global_config.k8s_verify_api_queries

        # We only create the k8s_cache while we are the leader
        k8s_cache = None

        if self.__log_watcher:
            self.log_config = self.__log_watcher.add_log_config(
                self.module_name, self.log_config
            )

        # First instance of k8s api uses the main rate limiter.  Leader election related API calls to the k8s
        # masters will go through this api/rate limiter.
        k8s_api_main = KubernetesApi.create_instance(
            self._global_config, rate_limiter_key="K8S_CACHE_MAIN_RATELIMITER"
        )

        # Second instance of k8s api uses an ancillary ratelimiter (for exclusive use by events monitor)
        k8s_api_events = KubernetesApi.create_instance(
            self._global_config, rate_limiter_key="K8S_EVENTS_RATELIMITER"
        )

        # k8s_cache is initialized with the main rate limiter.  However, streaming-related API calls should go
        # through the ancillary ratelimiter.  This is achieved by passing ApiQueryOptions with the desired rate_limiter.
        k8s_events_query_options = ApiQueryOptions(
            max_retries=self._global_config.k8s_controlled_warmer_max_query_retries,
            rate_limiter=k8s_api_events.default_query_options.rate_limiter,
        )

        pod_name = k8s_api_main.get_pod_name()
        self._node_name = k8s_api_main.get_node_name(pod_name)
        cluster_name = k8s_api_main.get_cluster_name()

        last_event = None
        last_resource = 0

        last_check = time.time() - self._leader_check_interval

        last_reported_leader = None
        while not self._is_thread_stopped():
            current_time = time.time()

            # if we are the leader, we could be going through this loop before the leader_check_interval
            # has expired, so make sure to only check for a new leader if the interval has expired
            if last_check + self._leader_check_interval <= current_time:
                last_check = current_time
                # check if we are the leader
                if not self._is_leader(k8s_api_main):
                    # if not, then sleep and try again
                    global_log.log(
                        scalyr_logging.DEBUG_LEVEL_1,
                        "Leader is %s" % (six.text_type(self._current_leader)),
                    )
                    if (
                        self._current_leader is not None
                        and last_reported_leader != self._current_leader
                    ):
                        global_log.info(
                            "Kubernetes event leader is %s"
                            % six.text_type(self._current_leader)
                        )
                        last_reported_leader = self._current_leader
                    if k8s_cache is not None:
                        k8s_cache.stop()
                        k8s_cache = None
                    self._sleep_but_awaken_if_stopped(self._leader_check_interval)
                    continue
                global_log.log(
                    scalyr_logging.DEBUG_LEVEL_1,
                    "Leader is %s" % (six.text_type(self._current_leader)),
                )
            try:
                if last_reported_leader != self._current_leader:
                    global_log.info("Acting as Kubernetes event leader")
                    last_reported_leader = self._current_leader

                if k8s_cache is None:
                    # create the k8s cache
                    k8s_cache = k8s_utils.cache(self._global_config)

                # start streaming events
                lines = k8s_api_events.stream_events(last_event=last_event)

                json = {}
                for line in lines:
                    try:
                        json = scalyr_util.json_decode(line)
                    except Exception as e:
                        global_log.warning(
                            "Error parsing event json: %s, %s, %s"
                            % (line, six.text_type(e), traceback.format_exc())
                        )
                        continue

                    try:
                        # check to see if the resource version we are using has expired
                        if self._is_resource_expired(json):
                            last_event = None
                            global_log.log(
                                scalyr_logging.DEBUG_LEVEL_1, "K8S resource expired"
                            )
                            continue

                        obj = json.get("object", dict())
                        event_type = json.get("type", "UNKNOWN")

                        # resource version hasn't expired, so update it to the most recently seen version
                        last_event = last_resource

                        metadata = obj.get("metadata", dict())

                        # skip any events with resourceVersions lower than or equal to ones we've already seen
                        resource_version = metadata.get("resourceVersion", None)
                        if resource_version is not None:
                            resource_version = int(resource_version)
                        if resource_version and resource_version <= last_resource:
                            global_log.log(
                                scalyr_logging.DEBUG_LEVEL_2,
                                "Skipping older resource events",
                            )
                            continue

                        last_resource = resource_version
                        last_event = resource_version

                        # see if this event is about an object we are interested in
                        (kind, namespace, name) = self._get_involved_object(obj)

                        if kind is None:
                            global_log.log(
                                scalyr_logging.DEBUG_LEVEL_1,
                                "Ignoring event due to None kind",
                            )
                            continue

                        # exclude any events that don't involve objects we are interested in
                        if (
                            self.__event_object_filter
                            and kind not in self.__event_object_filter
                        ):
                            global_log.log(
                                scalyr_logging.DEBUG_LEVEL_1,
                                "Ignoring event due to unknown kind %s - %s"
                                % (kind, six.text_type(metadata)),
                            )
                            continue

                        # ignore events that belong to namespaces we are not interested in
                        if namespace not in self.__k8s_namespaces_to_include:
                            global_log.log(
                                scalyr_logging.DEBUG_LEVEL_1,
                                "Ignoring event due to belonging to an excluded namespace '%s'"
                                % (namespace),
                            )
                            continue

                        # get cluster and deployment information
                        extra_fields = {
                            "k8s-cluster": cluster_name,
                            "watchEventType": event_type,
                        }
                        if kind:
                            if kind == "Pod":
                                extra_fields["pod_name"] = name
                                extra_fields["pod_namespace"] = namespace
                                pod = k8s_cache.pod(
                                    namespace,
                                    name,
                                    current_time,
                                    query_options=k8s_events_query_options,
                                )
                                if pod and pod.controller:
                                    extra_fields[
                                        "k8s-controller"
                                    ] = pod.controller.name
                                    extra_fields["k8s-kind"] = pod.controller.kind
                            elif kind != "Node":
                                controller = k8s_cache.controller(
                                    namespace,
                                    name,
                                    kind,
                                    current_time,
                                    query_options=k8s_events_query_options,
                                )
                                if controller:
                                    extra_fields["k8s-controller"] = controller.name
                                    extra_fields["k8s-kind"] = controller.kind

                        # if so, log to disk
                        self.__disk_logger.info(
                            "event=%s extra=%s"
                            % (
                                six.text_type(scalyr_util.json_encode(obj)),
                                six.text_type(scalyr_util.json_encode(extra_fields)),
                            )
                        )

                        # see if we need to check for a new leader
                        if last_check + self._leader_check_interval <= current_time:
                            global_log.log(
                                scalyr_logging.DEBUG_LEVEL_1,
                                "Time to check for a new event leader",
                            )
                            break
                    except Exception as e:
                        global_log.exception(
                            "Failed to process single k8s event line due to following exception: %s, %s, %s"
                            % (repr(e), six.text_type(e), traceback.format_exc()),
                            limit_once_per_x_secs=300,
                            limit_key="k8s-stream-events-general-exception",
                        )
            except K8sApiAuthorizationException:
                global_log.warning(
                    "Could not stream K8s events due to an authorization error.  The "
                    "Scalyr Service Account does not have permission to watch available events.  "
                    "Please recreate the role with the latest definition which can be found "
                    "at https://raw.githubusercontent.com/scalyr/scalyr-agent-2/release/k8s/scalyr-service-account.yaml "
                    "K8s event collection will be disabled until this is resolved.  See the K8s install "
                    "directions for instructions on how to create the role "
                    "https://www.scalyr.com/help/install-agent-kubernetes",
                    limit_once_per_x_secs=300,
                    limit_key="k8s-stream-events-no-permission",
                )
            except ConnectionError:
                # ignore these, and just carry on querying in the next loop
                pass
            except Exception as e:
                global_log.exception(
                    "Failed to stream k8s events due to the following exception: %s, %s, %s"
                    % (repr(e), six.text_type(e), traceback.format_exc())
                )

        if k8s_cache is not None:
            k8s_cache.stop()
            k8s_cache = None
    except Exception:
        # TODO: Maybe remove this catch here and let the higher layer catch it.  However, we do not
        # right now join on the monitor threads, so no one would catch it.  We should change that.
        global_log.exception(
            "Monitor died due to exception:", error_code="failedMonitor"
        )
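
# The dual-limiter wiring above relies on API helpers honouring whatever rate
# limiter the per-call ApiQueryOptions carries.  Below is a hedged sketch of
# that routing, assuming an acquire/release style limiter interface; the
# helper name and do_request callable are hypothetical illustrations, not the
# actual KubernetesApi internals:
def _rate_limited_query_sketch(do_request, path, options):
    # Block until the limiter supplied by the caller grants a slot.  This is
    # how cache fills triggered by the events monitor are charged to
    # K8S_EVENTS_RATELIMITER rather than to K8S_CACHE_MAIN_RATELIMITER.
    token = options.rate_limiter.acquire_token()
    try:
        result = do_request(path)
        # Report success so the limiter can ramp the rate back up.
        options.rate_limiter.release_token(token, True)
        return result
    except Exception:
        # Report the failure so the limiter can back off, then re-raise.
        options.rate_limiter.release_token(token, False)
        raise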