示例#1
0
    def test_query_api_with_retries_other_error_rate_limited(self):
        """Errors other than "not found" must lower the cluster rate.

        `KubernetesApi.query_api` is patched to fail, first with a
        `K8sApiPermanentError` and then with a plain `Exception`.  Starting
        from an initial cluster rate of 100, the limiter is expected to report
        50.0 after the first failure and 25.0 after the second.
        """
        with mock.patch.object(KubernetesApi, "query_api") as patched_query:
            patched_query.side_effect = K8sApiPermanentError("Permanent Error")

            api = KubernetesApi()
            limiter = BlockingRateLimiter(
                num_agents=1,
                initial_cluster_rate=100,
                max_cluster_rate=1000,
                min_cluster_rate=1,
                consecutive_success_threshold=1,
                strategy="multiply",
            )
            query_options = ApiQueryOptions(rate_limiter=limiter, max_retries=0)

            # First failure: a permanent API error.
            self.assertRaises(
                K8sApiPermanentError,
                api.query_api_with_retries,
                "/foo/bar",
                query_options,
            )
            self.assertEqual(limiter.current_cluster_rate, 50.0)

            # Second failure: an arbitrary exception type.
            patched_query.side_effect = Exception("Some other exception")
            self.assertRaises(
                Exception,
                api.query_api_with_retries,
                "/foo/bar",
                query_options,
            )
            self.assertEqual(limiter.current_cluster_rate, 25.0)
示例#2
0
    def test_lookup_already_in_cache(self):
        """A cached object is still returned when a later API query would fail.

        The first lookup is served by a successful fake API response and
        populates the cache.  The fake API is then switched to a permanent
        error; the second lookup must not raise and must return the object
        with the expected name and namespace.
        """
        namespace = self.NAMESPACE_1
        pod_name = self.POD_1
        options = ApiQueryOptions()

        def do_lookup():
            # Same call both times; only the fake API's response differs.
            return self.cache.lookup(
                self.k8s,
                self.clock.time(),
                namespace,
                pod_name,
                query_options=options,
            )

        def entry_is_cached():
            return self.cache.is_cached(namespace, pod_name, allow_expired=True)

        # Prime the cache with a successful response.
        self.k8s.set_response(namespace, pod_name, success=True)
        do_lookup()
        self.assertTrue(entry_is_cached())

        # Make the API fail and look the same object up again.
        self.k8s.set_response(namespace, pod_name, permanent_error=True)
        cached_obj = do_lookup()

        self.assertTrue(entry_is_cached())
        self.assertEqual(cached_obj.name, pod_name)
        self.assertEqual(cached_obj.namespace, namespace)
示例#3
0
    def test_query_api_with_retries_not_found_not_rate_limited(self):
        """A "not found" error must not lower the cluster rate.

        `KubernetesApi.query_api` is patched to raise
        `K8sApiNotFoundException`.  The exception must still reach the caller,
        and the limiter (initial cluster rate 100) is expected to report 200.0
        afterwards.
        """
        with mock.patch.object(KubernetesApi, "query_api") as patched_query:
            patched_query.side_effect = K8sApiNotFoundException("/foo/bar")

            api = KubernetesApi()
            limiter = BlockingRateLimiter(
                num_agents=1,
                initial_cluster_rate=100,
                max_cluster_rate=1000,
                min_cluster_rate=1,
                consecutive_success_threshold=1,
                strategy="multiply",
            )
            query_options = ApiQueryOptions(rate_limiter=limiter)

            self.assertRaises(
                K8sApiNotFoundException,
                api.query_api_with_retries,
                "/foo/bar",
                query_options,
            )
            self.assertEqual(limiter.current_cluster_rate, 200.0)
示例#4
0
    def test_raise_exception_on_query_error(self):
        """A lookup with nothing cached propagates the API's permanent error.

        The fake API is configured to fail with a permanent error before any
        lookup has been made, so `cache.lookup` is expected to raise
        `K8sApiPermanentError`.
        """
        namespace = self.NAMESPACE_1
        pod_name = self.POD_1
        options = ApiQueryOptions()

        self.k8s.set_response(namespace, pod_name, permanent_error=True)

        self.assertRaises(
            K8sApiPermanentError,
            self.cache.lookup,
            self.k8s,
            self.clock.time(),
            namespace,
            pod_name,
            query_options=options,
        )
示例#5
0
    def test_query_api_with_retries_success_not_rate_limited(self):
        """A successful query returns its payload and does not lower the rate.

        `KubernetesApi.query_api` is patched to return a fixed dict.  The
        wrapper must hand that dict back unchanged, and the limiter (initial
        cluster rate 100) is expected to report 200.0 afterwards.
        """
        expected_payload = {"success": "success"}

        with mock.patch.object(KubernetesApi, "query_api") as patched_query:
            patched_query.return_value = {"success": "success"}

            api = KubernetesApi()
            limiter = BlockingRateLimiter(
                num_agents=1,
                initial_cluster_rate=100,
                max_cluster_rate=1000,
                min_cluster_rate=1,
                consecutive_success_threshold=1,
                strategy="multiply",
            )
            query_options = ApiQueryOptions(rate_limiter=limiter)

            response = api.query_api_with_retries("/foo/bar", query_options)

            self.assertEqual(response, expected_payload)
            self.assertEqual(limiter.current_cluster_rate, 200.0)
示例#6
0
    def run(self):
        """Begins executing the monitor, writing metric output to logger.

        Loops until the monitor thread is stopped.  At most once per
        `_leader_check_interval` it checks whether this agent is the event leader.  While it is not the
        leader it stops any k8s cache it holds, sleeps and retries.  While it is the leader it streams
        events from the k8s API, skips events whose resource version has already been seen, whose kind is
        not in the configured object filter or whose namespace is not included, and writes the remaining
        events to the disk logger together with cluster / controller information.

        Returns nothing.  Exceptions are logged instead of being propagated to the caller.
        """
        if self.__disable_monitor:
            global_log.info(
                "kubernetes_events_monitor exiting because it has been disabled."
            )
            return

        # We only create the k8s_cache while we are the leader.  It is bound before the try blocks so that
        # the `finally` clause below can always stop it, even when the loop dies with an exception.
        k8s_cache = None

        try:
            try:
                # NOTE(review): the values read here are discarded.  Presumably the accesses are kept so a
                # missing option fails fast -- confirm before removing.
                self._global_config.k8s_api_url
                self._global_config.k8s_verify_api_queries

                if self.__log_watcher:
                    self.log_config = self.__log_watcher.add_log_config(
                        self.module_name, self.log_config
                    )

                # First instance of k8s api uses the main rate limiter.  Leader election related API calls to
                # the k8s masters will go through this api/rate limiter.
                k8s_api_main = KubernetesApi.create_instance(
                    self._global_config, rate_limiter_key="K8S_CACHE_MAIN_RATELIMITER"
                )

                # Second instance of k8s api uses an ancillary ratelimiter (for exclusive use by events monitor)
                k8s_api_events = KubernetesApi.create_instance(
                    self._global_config, rate_limiter_key="K8S_EVENTS_RATELIMITER"
                )

                # k8s_cache is initialized with the main rate limiter. However, streaming-related API calls
                # should go through the ancillary ratelimiter. This is achieved by passing ApiQueryOptions
                # with desired rate_limiter.
                k8s_events_query_options = ApiQueryOptions(
                    max_retries=self._global_config.k8s_controlled_warmer_max_query_retries,
                    rate_limiter=k8s_api_events.default_query_options.rate_limiter,
                )

                pod_name = k8s_api_main.get_pod_name()
                self._node_name = k8s_api_main.get_node_name(pod_name)
                cluster_name = k8s_api_main.get_cluster_name()

                # resource version to resume the stream from (None means no resume point)
                last_event = None
                # highest resource version processed so far; always an int so comparisons stay valid
                last_resource = 0

                # back-date the last check so the very first loop pass performs a leader check
                last_check = time.time() - self._leader_check_interval

                last_reported_leader = None
                while not self._is_thread_stopped():
                    current_time = time.time()

                    # if we are the leader, we could be going through this loop before the
                    # leader_check_interval has expired, so make sure to only check for a new leader if the
                    # interval has expired
                    if last_check + self._leader_check_interval <= current_time:
                        last_check = current_time
                        # check if we are the leader
                        if not self._is_leader(k8s_api_main):
                            # if not, then sleep and try again
                            global_log.log(
                                scalyr_logging.DEBUG_LEVEL_1,
                                "Leader is %s" % (six.text_type(self._current_leader)),
                            )
                            if (
                                self._current_leader is not None
                                and last_reported_leader != self._current_leader
                            ):
                                global_log.info(
                                    "Kubernetes event leader is %s"
                                    % six.text_type(self._current_leader)
                                )
                                last_reported_leader = self._current_leader
                            if k8s_cache is not None:
                                k8s_cache.stop()
                                k8s_cache = None
                            self._sleep_but_awaken_if_stopped(
                                self._leader_check_interval
                            )
                            continue

                        global_log.log(
                            scalyr_logging.DEBUG_LEVEL_1,
                            "Leader is %s" % (six.text_type(self._current_leader)),
                        )
                    try:
                        if last_reported_leader != self._current_leader:
                            global_log.info("Acting as Kubernetes event leader")
                            last_reported_leader = self._current_leader

                        if k8s_cache is None:
                            # create the k8s cache
                            k8s_cache = k8s_utils.cache(self._global_config)

                        # start streaming events
                        lines = k8s_api_events.stream_events(last_event=last_event)

                        for line in lines:
                            try:
                                event_json = scalyr_util.json_decode(line)
                            except Exception as e:
                                global_log.warning(
                                    "Error parsing event json: %s, %s, %s"
                                    % (line, six.text_type(e), traceback.format_exc())
                                )
                                continue

                            # Refresh the clock for every event.  The stream can stay open for a long time,
                            # and with a stale timestamp the leader re-check at the bottom of this loop could
                            # never trigger; the cache lookups below also receive this time.
                            current_time = time.time()

                            try:
                                # check to see if the resource version we are using has expired
                                if self._is_resource_expired(event_json):
                                    last_event = None
                                    global_log.log(
                                        scalyr_logging.DEBUG_LEVEL_1,
                                        "K8S resource expired",
                                    )
                                    continue

                                obj = event_json.get("object", dict())
                                event_type = event_json.get("type", "UNKNOWN")

                                # resource version hasn't expired, so update it to the most recently seen
                                # version
                                last_event = last_resource

                                metadata = obj.get("metadata", dict())

                                # skip any events whose resourceVersion is not newer than ones we've
                                # already seen
                                resource_version = metadata.get("resourceVersion", None)
                                if resource_version is not None:
                                    resource_version = int(resource_version)
                                if resource_version and resource_version <= last_resource:
                                    global_log.log(
                                        scalyr_logging.DEBUG_LEVEL_2,
                                        "Skipping older resource events",
                                    )
                                    continue

                                # Only advance the high-water mark when the event actually carries a
                                # version.  Storing None here would make every later `<=` comparison raise
                                # TypeError on Python 3 and would discard the stream resume point.
                                if resource_version is not None:
                                    last_resource = resource_version
                                    last_event = resource_version

                                # see if this event is about an object we are interested in
                                (kind, namespace, name) = self._get_involved_object(obj)

                                if kind is None:
                                    global_log.log(
                                        scalyr_logging.DEBUG_LEVEL_1,
                                        "Ignoring event due to None kind",
                                    )
                                    continue

                                # exclude any events that don't involve objects we are interested in
                                if (
                                    self.__event_object_filter
                                    and kind not in self.__event_object_filter
                                ):
                                    global_log.log(
                                        scalyr_logging.DEBUG_LEVEL_1,
                                        "Ignoring event due to unknown kind %s - %s"
                                        % (kind, six.text_type(metadata)),
                                    )
                                    continue

                                # ignore events that belong to namespaces we are not interested in
                                if namespace not in self.__k8s_namespaces_to_include:
                                    global_log.log(
                                        scalyr_logging.DEBUG_LEVEL_1,
                                        "Ignoring event due to belonging to an excluded namespace '%s'"
                                        % (namespace),
                                    )
                                    continue

                                # get cluster and deployment information
                                extra_fields = {
                                    "k8s-cluster": cluster_name,
                                    "watchEventType": event_type,
                                }
                                if kind:
                                    if kind == "Pod":
                                        extra_fields["pod_name"] = name
                                        extra_fields["pod_namespace"] = namespace
                                        pod = k8s_cache.pod(
                                            namespace,
                                            name,
                                            current_time,
                                            query_options=k8s_events_query_options,
                                        )
                                        if pod and pod.controller:
                                            extra_fields[
                                                "k8s-controller"
                                            ] = pod.controller.name
                                            extra_fields[
                                                "k8s-kind"
                                            ] = pod.controller.kind
                                    elif kind != "Node":
                                        controller = k8s_cache.controller(
                                            namespace,
                                            name,
                                            kind,
                                            current_time,
                                            query_options=k8s_events_query_options,
                                        )
                                        if controller:
                                            extra_fields[
                                                "k8s-controller"
                                            ] = controller.name
                                            extra_fields["k8s-kind"] = controller.kind

                                # if so, log to disk
                                self.__disk_logger.info(
                                    "event=%s extra=%s"
                                    % (
                                        six.text_type(scalyr_util.json_encode(obj)),
                                        six.text_type(
                                            scalyr_util.json_encode(extra_fields)
                                        ),
                                    )
                                )

                                # see if we need to check for a new leader; leaving the stream lets the
                                # outer loop run the leader check again
                                if (
                                    last_check + self._leader_check_interval
                                    <= current_time
                                ):
                                    global_log.log(
                                        scalyr_logging.DEBUG_LEVEL_1,
                                        "Time to check for a new event leader",
                                    )
                                    break

                            except Exception as e:
                                global_log.exception(
                                    "Failed to process single k8s event line due to following exception: %s, %s, %s"
                                    % (repr(e), six.text_type(e), traceback.format_exc()),
                                    limit_once_per_x_secs=300,
                                    limit_key="k8s-stream-events-general-exception",
                                )
                    except K8sApiAuthorizationException:
                        global_log.warning(
                            "Could not stream K8s events due to an authorization error.  The "
                            "Scalyr Service Account does not have permission to watch available events.  "
                            "Please recreate the role with the latest definition which can be found "
                            "at https://raw.githubusercontent.com/scalyr/scalyr-agent-2/release/k8s/scalyr-service-account.yaml "
                            "K8s event collection will be disabled until this is resolved.  See the K8s install "
                            "directions for instructions on how to create the role "
                            "https://www.scalyr.com/help/install-agent-kubernetes",
                            limit_once_per_x_secs=300,
                            limit_key="k8s-stream-events-no-permission",
                        )
                    except ConnectionError:
                        # ignore these, and just carry on querying in the next loop
                        pass
                    except Exception as e:
                        global_log.exception(
                            "Failed to stream k8s events due to the following exception: %s, %s, %s"
                            % (repr(e), six.text_type(e), traceback.format_exc())
                        )
            finally:
                # Stop the cache on every exit path (normal shutdown or an exception escaping the loop).
                # This stays inside the outer try so a failure in stop() is still logged below.
                if k8s_cache is not None:
                    k8s_cache.stop()
                    k8s_cache = None

        except Exception:
            # TODO:  Maybe remove this catch here and let the higher layer catch it.  However, we do not
            # right now join on the monitor threads, so no one would catch it.  We should change that.
            global_log.exception(
                "Monitor died due to exception:", error_code="failedMonitor"
            )