Exemplo n.º 1
0
def monitor(kubectl):
    logger.info("Starting AXKubeMonitor")

    monitor = AXKubeMonitor(kubectl=kubectl)
    monitor.reload_monitors(namespace="default")
    monitor.start()
    yield monitor

    logger.info("Tearing down AXKubeMonitor")
    monitor.stop(force=True)
Exemplo n.º 2
0
    def run(self):
        """AXMon main thread"""
        import signal
        import sys

        def signal_handler(signal, frame):
            logger.info("AXMon killed with signal %s", signal)
            sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

        # now that axdb is running post events
        cbs = ContainerEventCallbacks()
        from ax.platform.stats import container_oom_cb
        cbs.add_cb(container_oom_cb)

        kube_monitor = AXKubeMonitor()
        kube_monitor.reload_monitors()
        kube_monitor.start()

        logger.info("kube monitor started")

        try:
            while True:
                time.sleep(1)
        except (KeyboardInterrupt, SystemExit):
            self.shutdown()
Exemplo n.º 3
0
    def stop_one(self, name, namespace=AXNameSpaces.AXSYS):
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Deleting %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing
        assert isinstance(kube_obj, KubeObject)

        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }

        # Don't delete if object does not exist
        if not kube_obj.exists():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        monitor_info = kube_obj.get_delete_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjDeleteWaitTimeout,
                    waiter)

            # Call kubectl delete
            kube_obj.delete()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.DELETED))
                    logger.info("Successfully deleted %s in %s with code %s.",
                                wait_info["name"], name, result["code"])
                else:
                    result["failed"] = True
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.UNKNOWN))
                    logger.error(
                        "Failed to delete %s in %s with code %s. Events: %s",
                        wait_info["name"], name, result["code"],
                        str(waiter.details))

            # Poll once to confirm all components from this Kubenetes config file exist
            # In case there are objects in this config file cannot be monitored, i.e. svc without elb
            if kube_obj.exists():
                logger.error("Object %s deleted but still exists", name)
                result["failed"] = True
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNKNOWN))
                result["events"].append(
                    "Object {} deleted but still exists.".format(name))
            result["duration"] = str(round(time.time() - start, 2))
            logger.info("Successfully deleted %s.", name)
            return result
        else:
            # use polling
            kube_obj.delete()
            return self._poll_till_not_exists(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjDeletePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjDeletePollMaxRetry,
                rst=result)
Exemplo n.º 4
0
    def __init__(
            self,
            cluster_name_id=None,
            aws_profile=None,
            debug=True,
            manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
            config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
            software_info=None):
        """
        AX Platform bootstrap

        :param cluster_name_id: cluster name id
        :param aws_profile: aws profile to authenticate all aws clients
        :param debug: debug mode
        :param manifest_root: root directory to all ax service objects
        """
        self._software_info = software_info if software_info else SoftwareInfo(
        )
        assert isinstance(
            self._software_info, SoftwareInfo
        ), "Wrong type ({}) of software info passed in.".format(
            self._software_info)
        self._aws_profile = aws_profile
        self._manifest_root = manifest_root
        self._config = AXPlatformConfig(config_file)

        logger.info("Using Kubernetes manifest from %s", self._manifest_root)
        logger.info("Using platform configuration \"%s\" from %s",
                    self._config.name, config_file)

        self._cluster_name_id = AXClusterId(
            cluster_name_id).get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id,
            aws_profile=self._aws_profile)
        self._cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self._cluster_info = AXClusterInfo(self._cluster_name_id,
                                           aws_profile=self._aws_profile)

        self._region = self._cluster_config.get_region()
        if Cloud().target_cloud_aws():
            self._account = AWSAccountInfo(
                aws_profile=self._aws_profile).get_account_id()
        else:
            self._account = ""
        self._bucket_name = self._cluster_config_path.bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name,
                                          aws_profile=self._aws_profile,
                                          region=self._region)

        # In debug mode, when we failed to create an object, we don't delete it but just
        # leave it for debug.
        self._debug = debug

        # DNS
        self.cluster_dns_name = None

        # Get kube cluster config. Automatic if in pod already.
        self._kube_config = self._cluster_info.get_kube_config_file_path(
        ) if self._cluster_name_id else None
        if self._cluster_name_id:
            if not os.path.isfile(self._kube_config):
                logger.info(
                    "Can't find config file at %s; downloading from s3",
                    self._kube_config)
                self._kube_config = self._cluster_info.download_kube_config()
            assert os.path.isfile(
                self._kube_config), "No kube_config file available"

        # Kubernetes related objects and macros
        self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER]
        self.kube_axsys_namespace = AXNameSpaces.AXSYS
        self.kube_user_namespace = AXNameSpaces.AXUSER
        self.kubectl = KubernetesApiClient(config_file=self._kube_config)
        self.kube_poll = KubeObjPoll(kubectl=self.kubectl)

        self._monitor = AXKubeMonitor(kubectl=self.kubectl)
        self._monitor.reload_monitors(namespace=self.kube_axsys_namespace)
        self._monitor.start()

        # Kube Objects
        self._kube_objects = {}
        self._replacing = {}
Exemplo n.º 5
0
class AXPlatform(object):
    def __new__(cls, *args, **kwargs):
        if Cloud().target_cloud_gcp():
            from .gke_platform import AXGKEPlatform
            return super(AXPlatform, cls).__new__(AXGKEPlatform)
        else:
            return super(AXPlatform, cls).__new__(cls)

    def __init__(
            self,
            cluster_name_id=None,
            aws_profile=None,
            debug=True,
            manifest_root=AXPlatformConfigDefaults.DefaultManifestRoot,
            config_file=AXPlatformConfigDefaults.DefaultPlatformConfigFile,
            software_info=None):
        """
        AX Platform bootstrap

        :param cluster_name_id: cluster name id
        :param aws_profile: aws profile to authenticate all aws clients
        :param debug: debug mode
        :param manifest_root: root directory to all ax service objects
        """
        self._software_info = software_info if software_info else SoftwareInfo(
        )
        assert isinstance(
            self._software_info, SoftwareInfo
        ), "Wrong type ({}) of software info passed in.".format(
            self._software_info)
        self._aws_profile = aws_profile
        self._manifest_root = manifest_root
        self._config = AXPlatformConfig(config_file)

        logger.info("Using Kubernetes manifest from %s", self._manifest_root)
        logger.info("Using platform configuration \"%s\" from %s",
                    self._config.name, config_file)

        self._cluster_name_id = AXClusterId(
            cluster_name_id).get_cluster_name_id()
        self._cluster_config = AXClusterConfig(
            cluster_name_id=self._cluster_name_id,
            aws_profile=self._aws_profile)
        self._cluster_config_path = AXClusterConfigPath(cluster_name_id)
        self._cluster_info = AXClusterInfo(self._cluster_name_id,
                                           aws_profile=self._aws_profile)

        self._region = self._cluster_config.get_region()
        if Cloud().target_cloud_aws():
            self._account = AWSAccountInfo(
                aws_profile=self._aws_profile).get_account_id()
        else:
            self._account = ""
        self._bucket_name = self._cluster_config_path.bucket()
        self._bucket = Cloud().get_bucket(self._bucket_name,
                                          aws_profile=self._aws_profile,
                                          region=self._region)

        # In debug mode, when we failed to create an object, we don't delete it but just
        # leave it for debug.
        self._debug = debug

        # DNS
        self.cluster_dns_name = None

        # Get kube cluster config. Automatic if in pod already.
        self._kube_config = self._cluster_info.get_kube_config_file_path(
        ) if self._cluster_name_id else None
        if self._cluster_name_id:
            if not os.path.isfile(self._kube_config):
                logger.info(
                    "Can't find config file at %s; downloading from s3",
                    self._kube_config)
                self._kube_config = self._cluster_info.download_kube_config()
            assert os.path.isfile(
                self._kube_config), "No kube_config file available"

        # Kubernetes related objects and macros
        self.kube_namespaces = [AXNameSpaces.AXSYS, AXNameSpaces.AXUSER]
        self.kube_axsys_namespace = AXNameSpaces.AXSYS
        self.kube_user_namespace = AXNameSpaces.AXUSER
        self.kubectl = KubernetesApiClient(config_file=self._kube_config)
        self.kube_poll = KubeObjPoll(kubectl=self.kubectl)

        self._monitor = AXKubeMonitor(kubectl=self.kubectl)
        self._monitor.reload_monitors(namespace=self.kube_axsys_namespace)
        self._monitor.start()

        # Kube Objects
        self._kube_objects = {}
        self._replacing = {}

    def _load_kube_objects_from_steps(self, steps):
        """
        Extract kube objects from steps in config, and load them into memory
        :param steps: list
        :return:
        """
        for object_group in steps:
            assert isinstance(object_group, AXPlatformObjectGroup)
            for obj in object_group.object_set:
                assert isinstance(obj, AXPlatformObject)
                name = obj.name
                filename = obj.manifest
                namespace = obj.namespace
                if name in self._kube_objects:
                    raise ValueError("Duplicated object name {}".format(name))
                kubeobj_conf_path = os.path.join(self._manifest_root, filename)
                self._kube_objects[name] = KubeObject(
                    config_file=kubeobj_conf_path,
                    kubepoll=self.kube_poll,
                    replacing=None,
                    kube_config=self._kube_config,
                    kube_namespace=namespace)

    def _generate_replacing(self):
        # Platform code are running in python 2.7, and therefore for trusted cidr list, the str() method
        # will return something like [u'54.149.149.230/32', u'73.70.250.25/32', u'104.10.248.90/32'], and
        # this 'u' prefix cannot be surpressed. With this prefix, our macro replacing would create invalid
        # yaml files, and therefore we construct string manually here
        trusted_cidr = self._cluster_config.get_trusted_cidr()
        if isinstance(trusted_cidr, list):
            trusted_cidr_str = "["
            for cidr in trusted_cidr:
                trusted_cidr_str += "\"{}\",".format(str(cidr))
            trusted_cidr_str = trusted_cidr_str[:-1]
            trusted_cidr_str += "]"
        else:
            trusted_cidr_str = "[{}]".format(trusted_cidr)

        axsys_cpu = 0
        axsys_mem = 0
        daemon_cpu = 0
        daemon_mem = 0
        for name in self._kube_objects.keys():
            cpu, mem, dcpu, dmem = self._kube_objects[name].resource_usage
            axsys_cpu += cpu
            axsys_mem += mem
            daemon_cpu += dcpu
            daemon_mem += dmem

        # kube-proxy (100m CPU and 100Mi memory. Note kube-proxy does not
        # have a memory request, but this is an approximation)
        daemon_cpu += 100
        daemon_mem += 100

        logger.info(
            "Resource Usages: axsys_cpu: %s milicores, axsys_mem: %s Mi, node_daemon_cpu: %s milicores, node_daemon_mem: %s Mi",
            axsys_cpu, axsys_mem, daemon_cpu, daemon_mem)

        axsys_node_count = int(self._cluster_config.get_asxys_node_count())
        axuser_min_count = str(
            int(self._cluster_config.get_min_node_count()) - axsys_node_count)
        axuser_max_count = str(
            int(self._cluster_config.get_max_node_count()) - axsys_node_count)
        autoscaler_scan_interval = str(
            self._cluster_config.get_autoscaler_scan_interval())

        usr_node_cpu_rsvp = float(daemon_cpu) / EC2_PARAMS[
            self._cluster_config.get_axuser_node_type()]["cpu"]
        usr_node_mem_rsvp = float(daemon_mem) / EC2_PARAMS[
            self._cluster_config.get_axuser_node_type()]["memory"]
        scale_down_util_thresh = round(
            max(usr_node_cpu_rsvp, usr_node_mem_rsvp), 3) + 0.001
        logger.info("Setting node scale down utilization threshold to %s",
                    scale_down_util_thresh)

        self._persist_node_resource_rsvp(daemon_cpu, daemon_mem)

        with open("/kubernetes/cluster/version.txt", "r") as f:
            cluster_install_version = f.read().strip()

        # Prepare autoscaler
        asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                       self._aws_profile)
        asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg(
        ) or asg_manager.get_on_demand_asg()
        if not asg:
            raise AXPlatformException(
                "Failed to get autoscaling group for cluster {}".format(
                    self._cluster_name_id))
        asg_name = asg["AutoScalingGroupName"]

        if not asg_name:
            logger.error("Autoscaling group name not found for %s",
                         self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

        # Prepare minion-manager.
        spot_instances_option = self._cluster_config.get_spot_instances_option(
        )
        minion_manager_asgs = ""
        if spot_instances_option == SpotInstanceOption.ALL_SPOT:
            for asg in asg_manager.get_all_asgs():
                minion_manager_asgs = minion_manager_asgs + asg[
                    "AutoScalingGroupName"] + " "
            minion_manager_asgs = minion_manager_asgs[:-1]
        elif spot_instances_option == SpotInstanceOption.PARTIAL_SPOT:
            minion_manager_asgs = asg_manager.get_variable_asg(
            )["AutoScalingGroupName"]

        return {
            "REGISTRY":
            self._software_info.registry,
            "REGISTRY_SECRETS":
            self._software_info.registry_secrets,
            "NAMESPACE":
            self._software_info.image_namespace,
            "VERSION":
            self._software_info.image_version,
            "AX_CLUSTER_NAME_ID":
            self._cluster_name_id,
            "AX_AWS_REGION":
            self._region,
            "AX_AWS_ACCOUNT":
            self._account,
            "AX_CUSTOMER_ID":
            AXCustomerId().get_customer_id(),
            "TRUSTED_CIDR":
            trusted_cidr_str,
            "NEW_KUBE_SALT_SHA1":
            os.getenv("NEW_KUBE_SALT_SHA1") or " ",
            "NEW_KUBE_SERVER_SHA1":
            os.getenv("NEW_KUBE_SERVER_SHA1") or " ",
            "AX_KUBE_VERSION":
            os.getenv("AX_KUBE_VERSION"),
            "AX_CLUSTER_INSTALL_VERSION":
            cluster_install_version,
            "SANDBOX_ENABLED":
            str(self._cluster_config.get_sandbox_flag()),
            "ARGO_LOG_BUCKET_NAME":
            self._cluster_config.get_support_object_store_name(),
            "ASG_MIN":
            axuser_min_count,
            "ASG_MAX":
            axuser_max_count,
            "AUTOSCALER_SCAN_INTERVAL":
            autoscaler_scan_interval,
            "SCALE_DOWN_UTIL_THRESH":
            str(scale_down_util_thresh),
            "AX_CLUSTER_META_URL_V1":
            self._bucket.get_object_url_from_key(
                key=self._cluster_config_path.cluster_metadata()),
            "ASG_NAME":
            asg_name,
            "DNS_SERVER_IP":
            os.getenv("DNS_SERVER_IP", default_kube_up_env["DNS_SERVER_IP"]),
            "AX_ENABLE_SPOT_INSTANCES":
            str(spot_instances_option != SpotInstanceOption.NO_SPOT),
            "AX_SPOT_INSTANCE_ASGS":
            minion_manager_asgs,
        }

    def _persist_node_resource_rsvp(self, user_node_daemon_cpu,
                                    user_node_daemon_mem):
        self._cluster_config.set_user_node_resource_rsvp(
            cpu=user_node_daemon_cpu, mem=user_node_daemon_mem)
        self._cluster_config.save_config()

    def start(self):
        """
        Bring up platform using "platform-start.cfg" configuration from manifest directory
        :return:
        """
        # Generate kube-objects
        steps = self._config.steps
        self._load_kube_objects_from_steps(steps)
        self._replacing = self._generate_replacing()

        # TODO: remove component's dependencies to AXOPS_EXT_DNS env (#32)
        # At this moment, we MUST separate first step due to the above dependency
        assert len(steps) >= 2, "Should have at least 1 step to create axops"
        self.create_objects(steps[0])
        self.create_objects(steps[1])
        self.create_objects(steps[2])

        # Prepare axops_eip
        self._set_ext_dns()

        logger.debug("Replacing ENVs: %s", self._replacing)

        info_bound = "=======================================================\n"
        img_namespace = "Image Namespace: {}\n".format(
            self._software_info.image_namespace)
        img_version = "Image Version: {}\n".format(
            self._software_info.image_version)
        start_info = "\n\n{}{}{}{}{}".format(
            info_bound, "Platform Up: Bringing up Argo services...\n",
            img_namespace, img_version, info_bound)
        logger.info(start_info)

        # Start rest of the objects
        for i in range(3, len(steps)):
            self.create_objects(steps[i])

        # update application namespace
        logger.info("Updating application managers")
        for app in Applications(client=self.kubectl).list():
            logger.info("--- updating {}".format(app))
            a = Application(app, client=self.kubectl)
            a.create(force_recreate=True)
        logger.info("Done updating application managers")

        # Upload version information to target cluster
        self._update_version()
        logger.info("\n\n%sCluster %s is up. Cluster is available at %s%s\n",
                    COLOR_GREEN, self._cluster_name_id, self.cluster_dns_name,
                    COLOR_NORM)

    def stop(self):
        """
        Bring down platform using "platform-stop.cfg" configuration from manifest directory
        :return:
        """
        # Generate kube-objects (Does not need to generate replacing during platform down)
        # Stop order should be the reverse of start
        steps = self._config.steps
        steps.reverse()
        self._load_kube_objects_from_steps(steps)

        info_bound = "=======================================================\n"
        stop_info = "\n\n{}{}{}".format(
            info_bound, "Platform Down: Shutting down Argo services...\n",
            info_bound)
        logger.info(stop_info)

        # Bring down objects according to steps
        for i in range(len(steps)):
            object_group = steps[i]
            self.delete_objects(object_group)

    def stop_monitor(self):
        self._monitor.stop()

    def create_objects(self, objects):
        """
        Start kubernetes objects based on records.
        Wait for all of them.

        :param objects: AXPlatformObjectGroup
        """
        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_create_group(
                policy=objects.policy,
                policy_predicate=objects.policy_predicate,
                consistency=objects.consistency):
            logger.debug(
                "Skipping object group (%s) creation based on policy (%s), policy predicate (%s), consistency (%s)",
                objects.name, objects.policy, objects.policy_predicate,
                objects.consistency)
            return
        logger.info("Create step: %s", objects.name)
        logger.info("Creating platform objects\n\n%s",
                    self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            namespace = obj.namespace
            async_results[name] = pool.apply_async(
                self.start_one, args=(name, ), kwds={"namespace": namespace})
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Create")
        logger.info(report)

        if failed:
            raise AXPlatformException("Failed to create platform objects.")

    def _should_create_group(self, policy, policy_predicate, consistency):
        """
        Take AXPlatformObjectGroup policy, predicate and consistency and determine
        if this group should be created or not
        :param policy:
        :param policy_predicate:
        :param consistency:
        :return:
        """
        # Since we are not using consistency, we should always create if not
        # explicitly told not to, i.e. if there is a PrivateRegistryOnly
        # We are just leaving the interface here that should create or not
        # need to be decided by policy, policy_predicate and consistency

        if policy_predicate == ObjectGroupPolicyPredicate.PrivateRegistryOnly and \
                not self._software_info.registry_is_private():
            return False
        return True

    def delete_objects(self, objects):
        """
        Stop kubernetes objects based on records.
        Wait for all of them.

        :param objects: AXPlatformObjectGroup
        """
        assert isinstance(objects, AXPlatformObjectGroup)
        if not self._should_delete_group(
                policy=objects.policy,
                policy_predicate=objects.policy_predicate):
            logger.debug(
                "Skipping object group (%s) deletion based on policy (%s), policy predicate (%s)",
                objects.name, objects.policy, objects.policy_predicate)
            return
        logger.info("Delete step: %s", objects.name)
        logger.info("Deleting platform objects\n\n%s.",
                    self._generate_object_summary(objects.object_set))
        pool = ThreadPool(len(objects.object_set))
        async_results = {}
        for obj in objects.object_set:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            namespace = obj.namespace
            async_results[name] = pool.apply_async(
                self.stop_one, args=(name, ), kwds={"namespace": namespace})
        pool.close()
        pool.join()

        report, failed = self._generate_report(async_results, "Delete")
        logger.info(report)
        if failed:
            raise AXPlatformException("Failed to create platform objects.")

    def _should_delete_group(self, policy, policy_predicate):
        """
        Take AXPlatformObjectGroup policy and determine if this group should be deleted or not.
        Consistency is not needed
        for deletion

        :param policy:
        :param policy_predicate:
        :return:
        """
        if policy == ObjectGroupPolicy.CreateMany:
            return True
        return False

    def start_one(self, name, namespace=AXNameSpaces.AXSYS):
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Creating %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]

        # Update them as there are new updates in replacing in platform start
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing

        assert isinstance(kube_obj, KubeObject)
        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }
        if kube_obj.healthy():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.OBJ_EXISTS)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        # Previous platform start might fail, and might result in some componenets created
        # but not healthy (i.e. in CrashLoopBackoff). In this case, we delete the existing
        # object and try to create a new one
        if kube_obj.exists():
            logger.warning(
                "Object %s exists but not healthy. Deleting object for idempotency ...",
                name)
            self.stop_one(name, namespace)

        assert not kube_obj.exists(
        ), "Kubeobject {} already created but is not healthy. Not Expected".format(
            name)

        monitor_info = kube_obj.get_create_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjCreateWaitTimeout,
                    waiter)

            # Call kubectl create
            kube_obj.create()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                result["code"].append("{:.25s}:{}".format(
                    wait_info["name"], waiter.result))
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    logger.info("Successfully created %s with code %s.",
                                wait_info["name"], waiter.result)
                else:
                    result["failed"] = True
                    logger.error(
                        "Failed to create %s in %s with code %s. Events: %s",
                        wait_info["name"], namespace, waiter.result,
                        str(waiter.details))
                    if not self._debug:
                        logger.info("Deleting %s due to creation failure",
                                    name)
                        del_rst = self.stop_one(name, namespace)
                        result["code"] += del_rst["code"]
                        result["events"] += del_rst["events"]
                        result["duration"] = str(round(time.time() - start, 2))
                        return result

            # Poll extra if required (for Petset and Deployments with multiple replicas)
            if kube_obj.extra_poll:
                logger.info(
                    "Polling till healthy to make sure rest of components of %s are up and running ...",
                    name)
                create_rst = self._poll_till_healthy(
                    name=name,
                    kube_obj=kube_obj,
                    start_time=start,
                    poll_interval=AXPlatformConfigDefaults.
                    ObjCreateExtraPollInterval,
                    poll_max_retry=AXPlatformConfigDefaults.
                    ObjCreateExtraPollMaxRetry,
                    rst=result)
                if create_rst["failed"] and not self._debug:
                    logger.info("Deleting %s due to creation failure", name)
                    del_rst = self.stop_one(name, namespace)
                    create_rst["code"] += del_rst["code"]
                    create_rst["events"] += del_rst["events"]
                    create_rst["duration"] = str(round(time.time() - start, 2))
                return create_rst

            # Poll once to confirm all components from this Kubernetes config file exist,
            # In case there are objects in this config file cannot be monitored, i.e. svc
            # without elb. This is really not expected so we don't delete it
            if not kube_obj.healthy():
                logger.error(
                    "Object %s created but is not healthy. This is NOT EXPECTED, please check manually.",
                    name)
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNHEALTHY))
                result["failed"] = True
                result["events"].append(
                    "Object {} created byt is not healthy".format(name))
            result["duration"] = str(round(time.time() - start, 2))

            if not result["failed"]:
                logger.info("Successfully created object %s.", name)
            return result
        else:
            # use polling
            kube_obj.create()
            create_rst = self._poll_till_healthy(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjCreatePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjCreatePollMaxRetry,
                rst=result)
            if create_rst["failed"] and not self._debug:
                logger.info("Deleting %s due to creation failure", name)
                del_rst = self.stop_one(name, namespace)
                create_rst["code"] += del_rst["code"]
                create_rst["events"] += del_rst["events"]
                create_rst["duration"] = str(round(time.time() - start, 2))
            return create_rst

    @staticmethod
    def _poll_till_healthy(name, kube_obj, start_time, poll_interval,
                           poll_max_retry, rst):
        trail = 0
        assert isinstance(kube_obj, KubeObject)
        while True:
            if not kube_obj.healthy():
                trail += 1
                if trail > poll_max_retry:

                    logger.error("Failed to create KubeObject %s", name)
                    rst["code"] += [
                        "{:.25s}:{}".format(name, KubeObjStatusCode.UNHEALTHY)
                    ]
                    rst["events"] += [
                        "Object {} creation timeout. Not healthy".format(name)
                    ]
                    rst["failed"] = True
                    rst["duration"] = str(round(time.time() - start_time, 2))
                    return rst
            else:
                logger.info("Successfully created %s.", name)
                rst["code"] += [
                    "{:.25s}:{}".format(name, KubeObjStatusCode.OK)
                ]
                rst["failed"] = False
                rst["duration"] = str(round(time.time() - start_time, 2))
                return rst
            time.sleep(poll_interval)

    def stop_one(self, name, namespace=AXNameSpaces.AXSYS):
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Deleting %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing
        assert isinstance(kube_obj, KubeObject)

        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }

        # Don't delete if object does not exist
        if not kube_obj.exists():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        monitor_info = kube_obj.get_delete_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjDeleteWaitTimeout,
                    waiter)

            # Call kubectl delete
            kube_obj.delete()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.DELETED))
                    logger.info("Successfully deleted %s in %s with code %s.",
                                wait_info["name"], name, result["code"])
                else:
                    result["failed"] = True
                    result["code"].append("{:.25s}:{}".format(
                        wait_info["name"], KubeObjStatusCode.UNKNOWN))
                    logger.error(
                        "Failed to delete %s in %s with code %s. Events: %s",
                        wait_info["name"], name, result["code"],
                        str(waiter.details))

            # Poll once to confirm all components from this Kubenetes config file exist
            # In case there are objects in this config file cannot be monitored, i.e. svc without elb
            if kube_obj.exists():
                logger.error("Object %s deleted but still exists", name)
                result["failed"] = True
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNKNOWN))
                result["events"].append(
                    "Object {} deleted but still exists.".format(name))
            result["duration"] = str(round(time.time() - start, 2))
            logger.info("Successfully deleted %s.", name)
            return result
        else:
            # use polling
            kube_obj.delete()
            return self._poll_till_not_exists(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjDeletePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjDeletePollMaxRetry,
                rst=result)

    @staticmethod
    def _poll_till_not_exists(name, kube_obj, start_time, poll_interval,
                              poll_max_retry, rst):
        trail = 0
        assert isinstance(kube_obj, KubeObject)
        while True:
            if kube_obj.exists():
                trail += 1
                if trail > poll_max_retry:
                    logger.error("Failed to delete KubeObject %s", name)
                    rst["code"] += [
                        "{:.25s}:{}".format(name, KubeObjStatusCode.UNKNOWN)
                    ]
                    rst["events"] += [
                        "Object {} deletion timeout. Please manually check remaining pods"
                        .format(name)
                    ]
                    rst["failed"] = True
                    rst["duration"] = str(round(time.time() - start_time, 2))
                    return rst
            else:
                logger.info("Successfully deleted %s.", name)
                rst["code"] += [
                    "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED)
                ]
                rst["failed"] = False
                rst["duration"] = str(round(time.time() - start_time, 2))
                return rst
            time.sleep(poll_interval)

    def _generate_object_summary(self, objects):
        """
        :param objects: list of AXPlatformObject
        :return:
        """
        report_title = "\n{:25s} |  {:110s} |  {:20s}\n".format(
            "NAME", "MANIFEST", "NAMESPACE")
        report_bar = "{}\n".format("-" * 174)
        content = ""
        for obj in objects:
            assert isinstance(obj, AXPlatformObject)
            name = obj.name
            filename = os.path.join(self._manifest_root, obj.manifest)
            namespace = obj.namespace
            content += "{:25s} |  {:110s} |  {:20s}\n".format(
                name, filename, namespace)

        return report_title + report_bar + content

    @staticmethod
    def _generate_report(results, operation):
        failed = False
        report_body = ""
        warnings = "\n======= WARNING EVENTS =======\n"
        for name in results.keys():
            individual_report = "{:25s} |  {:110s} |  {:20s}\n"
            individual_warning = "{name}: {events}\n\n"
            try:
                result = results[name].get()
                if result["failed"]:
                    failed = True
                code = result["code"][0]
                for c in result["code"][1:]:
                    code += " / {}".format(c)
                individual_report = individual_report.format(
                    name, code, result["duration"], 2)
                if len(result["events"]) > 0:
                    warnings += individual_warning.format(
                        name=name, events=str(result["events"]))
            except Exception as e:
                failed = True
                logger.exception(str(e))
                individual_report = individual_report.format(
                    name, "EXCEPTION", "UNKNOWN")
                warnings += individual_warning.format(name=name, events=str(e))
            report_body += individual_report

        report_head = "\n\nPlatform {} {}. Report:\n".format(
            operation, "FAILED" if failed else "SUCCESSFULLY")
        report_title = "\n{:25s} |  {:110s} |  {:20s}\n".format(
            "NAME", "STATUS", "TIME (sec)")
        report_bar = "{}\n".format("-" * 174)
        return "{}{}{}{}{}{}".format(
            report_head, report_title, report_bar, report_body, warnings,
            "==============================\n"), failed

    def _get_eip_from_config_map(self):
        try:
            cmd = [
                "kubectl", "get", "configmap", "cluster-dns-name", "-o",
                "yaml", "--namespace", self.kube_axsys_namespace,
                "--kubeconfig", self._kube_config
            ]
            out = subprocess.check_output(cmd)
            return [yaml.load(out)["data"]["cluster-external-dns-name"]]
        except Exception:
            logger.error("Failed to get cluster dns name from config map.")
            return None

    def _get_svc_eip(self, svclabel, namespace):
        svc = self.kube_poll.poll_kubernetes_sync(KubeKind.SERVICE, namespace,
                                                  svclabel)
        assert len(
            svc.items) == 1, "Currently services should only have one ingress"
        rst = []
        for ig in svc.items[0].status.load_balancer.ingress:
            if ig.hostname:
                rst.append(ig.hostname)
            if ig.ip:
                rst.append(ig.ip)
        return rst

    def _set_ext_dns(self):
        axops_eip = self._get_eip_from_config_map() or self._get_svc_eip(
            svclabel="app=axops", namespace=AXNameSpaces.AXSYS)

        if not axops_eip:
            logger.error(
                "Platform Start Failed: cannot find External IP for AXOPS")
            raise AXPlatformException("AXOPS elastic IP does not exist")

        self.cluster_dns_name = axops_eip[0]
        # Don't change format of this message. Portal parses this line to get cluster IP/DNS.
        logger.info(
            "\n\n%s>>>>> Starting Argo platform... cluster DNS: %s%s\n",
            COLOR_GREEN, self.cluster_dns_name, COLOR_NORM)
        self._replacing["AXOPS_EXT_DNS"] = self.cluster_dns_name

    def get_cluster_external_dns(self):
        if not self.cluster_dns_name:
            self._set_ext_dns()
        return self.cluster_dns_name

    def _set_autoscaling(self):
        # Prepare autoscaler
        asg_manager = AXUserASGManager(self._cluster_name_id, self._region,
                                       self._aws_profile)
        asg = asg_manager.get_variable_asg() or asg_manager.get_spot_asg(
        ) or asg_manager.get_on_demand_asg()
        if not asg:
            raise AXPlatformException(
                "Failed to get autoscaling group for cluster {}".format(
                    self._cluster_name_id))
        asg_name = asg["AutoScalingGroupName"]

        if asg_name is not None:
            self._replacing["ASG_NAME"] = asg_name
        else:
            logger.error("Autoscaling group name not found for %s",
                         self._cluster_name_id)
            raise AXPlatformException("Cannot find cluster autoscaling group")

    # TODO (#157) Version should only be uploaded during install and upgrade time
    def _update_version(self):
        # Software info we get during install / upgrade does not contain ami id
        # need to persist it as well
        self._software_info.ami_id = self._cluster_config.get_ami_id()

        AXVersion(AXCustomerId().get_customer_id(), self._cluster_name_id,
                  self._aws_profile).update(self._software_info.to_dict())
Exemplo n.º 6
0
    def start_one(self, name, namespace=AXNameSpaces.AXSYS):
        time.sleep(
            random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
        logger.info("Creating %s in namespace %s ...", name, namespace)
        start = time.time()
        kube_obj = self._kube_objects[name]

        # Update them as there are new updates in replacing in platform start
        kube_obj.namespace = namespace
        kube_obj.replacing = self._replacing

        assert isinstance(kube_obj, KubeObject)
        result = {
            "name": name,
            "code": [],
            "events": [],
            "failed": False,
            "duration": ""
        }
        if kube_obj.healthy():
            result["code"] += [
                "{:.25s}:{}".format(name, KubeObjStatusCode.OBJ_EXISTS)
            ]
            result["duration"] = str(round(time.time() - start, 2))
            return result

        # Previous platform start might fail, and might result in some componenets created
        # but not healthy (i.e. in CrashLoopBackoff). In this case, we delete the existing
        # object and try to create a new one
        if kube_obj.exists():
            logger.warning(
                "Object %s exists but not healthy. Deleting object for idempotency ...",
                name)
            self.stop_one(name, namespace)

        assert not kube_obj.exists(
        ), "Kubeobject {} already created but is not healthy. Not Expected".format(
            name)

        monitor_info = kube_obj.get_create_monitor_info()
        if monitor_info:
            # use monitor
            waiters = []

            # Create and register waiters for all objects that can be monitored
            for m in monitor_info:
                wait_info = {
                    "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                    "name": m.name,
                    "validator": m.validator
                }
                waiter = KubeObjWaiter()
                waiters.append((waiter, wait_info))
                AXKubeMonitor().wait_for_kube_object(
                    wait_info, AXPlatformConfigDefaults.ObjCreateWaitTimeout,
                    waiter)

            # Call kubectl create
            kube_obj.create()

            # Wait on all waiters to retrieve status and events
            for waiter, wait_info in waiters:
                waiter.wait()
                result["events"] += waiter.details
                result["code"].append("{:.25s}:{}".format(
                    wait_info["name"], waiter.result))
                if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                    logger.info("Successfully created %s with code %s.",
                                wait_info["name"], waiter.result)
                else:
                    result["failed"] = True
                    logger.error(
                        "Failed to create %s in %s with code %s. Events: %s",
                        wait_info["name"], namespace, waiter.result,
                        str(waiter.details))
                    if not self._debug:
                        logger.info("Deleting %s due to creation failure",
                                    name)
                        del_rst = self.stop_one(name, namespace)
                        result["code"] += del_rst["code"]
                        result["events"] += del_rst["events"]
                        result["duration"] = str(round(time.time() - start, 2))
                        return result

            # Poll extra if required (for Petset and Deployments with multiple replicas)
            if kube_obj.extra_poll:
                logger.info(
                    "Polling till healthy to make sure rest of components of %s are up and running ...",
                    name)
                create_rst = self._poll_till_healthy(
                    name=name,
                    kube_obj=kube_obj,
                    start_time=start,
                    poll_interval=AXPlatformConfigDefaults.
                    ObjCreateExtraPollInterval,
                    poll_max_retry=AXPlatformConfigDefaults.
                    ObjCreateExtraPollMaxRetry,
                    rst=result)
                if create_rst["failed"] and not self._debug:
                    logger.info("Deleting %s due to creation failure", name)
                    del_rst = self.stop_one(name, namespace)
                    create_rst["code"] += del_rst["code"]
                    create_rst["events"] += del_rst["events"]
                    create_rst["duration"] = str(round(time.time() - start, 2))
                return create_rst

            # Poll once to confirm all components from this Kubernetes config file exist,
            # In case there are objects in this config file cannot be monitored, i.e. svc
            # without elb. This is really not expected so we don't delete it
            if not kube_obj.healthy():
                logger.error(
                    "Object %s created but is not healthy. This is NOT EXPECTED, please check manually.",
                    name)
                result["code"].append("{:.25s}:{}".format(
                    name, KubeObjStatusCode.UNHEALTHY))
                result["failed"] = True
                result["events"].append(
                    "Object {} created byt is not healthy".format(name))
            result["duration"] = str(round(time.time() - start, 2))

            if not result["failed"]:
                logger.info("Successfully created object %s.", name)
            return result
        else:
            # use polling
            kube_obj.create()
            create_rst = self._poll_till_healthy(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjCreatePollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjCreatePollMaxRetry,
                rst=result)
            if create_rst["failed"] and not self._debug:
                logger.info("Deleting %s due to creation failure", name)
                del_rst = self.stop_one(name, namespace)
                create_rst["code"] += del_rst["code"]
                create_rst["events"] += del_rst["events"]
                create_rst["duration"] = str(round(time.time() - start, 2))
            return create_rst
Exemplo n.º 7
0
    def stop(self, jobname=None):
        """
        NOTE: This function assumes that a pod is already running.
        This process kills the user command so that artifacts collection can occur
        Once this is done, the pod will be completed. This call will return when
        pod is completed. Note: pod is not deleted (just completed)
        """
        def get_container_status(s, container_name, name):
            if isinstance(s, dict):
                try:
                    c_status = s.get("containerStatuses", None)
                    for c in c_status or []:
                        n = c.get("name", None)
                        if n == container_name:
                            return c
                except Exception:
                    logger.exception(
                        "cannot get_container_status for [%s] [%s]", name,
                        container_name)

            return None

        def get_container_state(s, container_name, name):
            container_status = get_container_status(s,
                                                    container_name,
                                                    name=name)
            container_states = ["waiting", "running", "terminated"]
            if isinstance(container_status, dict):
                if "state" in container_status:
                    for state_string in container_states:
                        if state_string in container_status["state"]:
                            # wait if state in state_strings
                            logger.debug("state=%s for [%s] [%s]",
                                         state_string, name, container_name)
                            return state_string
                    logger.error("unknown state for [%s] [%s]: %s", name,
                                 container_name, s)
                    return None
                else:
                    # No state
                    logger.error("no state for [%s] [%s]: %s", name,
                                 container_name, s)
                    return None
            else:
                # no status
                logger.error("no status for [%s] [%s]: %s", name,
                             container_name, s)
                return None

        def get_pod_phase(s):
            if isinstance(s, dict):
                return s.get("phase", None)
            else:
                return None

        def validator_func(pod_status):
            # always return true for any event
            return True

        def send_kill_signal_to_main_container():
            ax_command_path = "/ax-execu-host/art"
            busybox_command_path = os.path.join(ax_command_path,
                                                "busybox-i686")
            bash_path = os.path.join(ax_command_path, "ax_bash_ax")
            touch_command = "{} {}".format(busybox_command_path, "touch")
            pgrep_command = "{} {}".format(busybox_command_path, "pgrep")
            xargs_command = "{} {}".format(busybox_command_path, "xargs")
            kill_command = os.path.join(ax_command_path, "ax_kill_ax")
            cat_command = os.path.join(ax_command_path, "ax_cat_ax")

            # execute command to initiate user command kill
            # This command may or may not execute properly if the container is already dying or dead
            # but it does not matter to us here since we will have a waiter. This command will ensure
            # that if a container is running, it will start the process of terminating
            # TODO: we may have pods that are started programmatically that do not have artifacts later
            # HACK HACK
            cmd = [
                bash_path, "-c",
                "{touch} {scratch_path}/.ax_delete ;  {kill} -9 `{cat} {scratch_path}/.ax_pid` "
                .format(touch=touch_command,
                        scratch_path=ARTIFACTS_CONTAINER_SCRATCH_PATH,
                        pgrep=pgrep_command,
                        xargs=xargs_command,
                        kill=kill_command,
                        cat=cat_command)
            ]
            logger.debug(
                "Try gracefully stop main container in [%s][%s]. cmd=%s",
                jobname, self.name, cmd)
            output = self.exec_commands(cmd)
            logger.debug("Kill output:\n%s", output)

        main_name = self.get_main_container_name()
        wait_name = SIDEKICK_WAIT_CONTAINER_NAME

        logger.debug("About to stop pod [%s][%s]", jobname, self.name)

        count = 0
        while True:
            count += 1
            if count > 180:
                logger.warning("Pod [%s][%s] too many lopps, abort. count=%s",
                               jobname, self.name, count)
                return False
            obj = {
                "kind": "pods",
                "name": jobname if jobname else self.name,
                "validator": validator_func
            }
            waiter = KubeObjWaiter()
            monitor = AXKubeMonitor()
            monitor.wait_for_kube_object(obj,
                                         timeout=DELETE_WAITER_WAIT_TIMEOUT,
                                         waiter=waiter)

            # read status here
            read_count = 0
            while True:
                read_count += 1
                if read_count > 180:
                    logger.warning(
                        "Pod [%s][%s] too many retry, abort. count=%s",
                        jobname, self.name, count)
                    return False
                try:
                    status = self.client.api.read_namespaced_pod_status(
                        self.namespace, self.name).status
                    assert isinstance(status, swagger_client.V1PodStatus)
                    status_dict = swagger_client.ApiClient(
                    ).sanitize_for_serialization(status)
                    break
                except Exception:
                    # xxx todo: what if self.name is not there?
                    logger.exception(
                        "exception in get status for Pod [%s][%s] retry=%s count=%s",
                        jobname, self.name, read_count, count)
                    time.sleep(10)
                    continue

            main_container_state = get_container_state(status_dict, main_name,
                                                       self.name)
            wait_container_state = get_container_state(status_dict, wait_name,
                                                       self.name)
            pod_phase = get_pod_phase(status_dict)
            logger.debug("Pod [%s][%s] phase=%s. main=%s, wait=%s count=%s",
                         jobname, self.name, pod_phase, main_container_state,
                         wait_container_state, count)

            if main_container_state == "waiting":
                logger.debug("Pod [%s][%s] main in %s count=%s", jobname,
                             self.name, main_container_state, count)
            elif main_container_state == "running":
                logger.debug("Pod [%s][%s] main in %s count=%s", jobname,
                             self.name, main_container_state, count)
                send_kill_signal_to_main_container()
            elif main_container_state is None:
                if pod_phase == "Pending":
                    logger.debug("Pod [%s][%s] in %s phase count=%s", jobname,
                                 self.name, pod_phase, count)
                else:
                    logger.warning(
                        "Pod [%s][%s] unknown main container state, abort. %s count=%s",
                        jobname, self.name, status_dict, count)
                    return False
            else:
                assert main_container_state == "terminated", "bad state {}".format(
                    main_container_state)
                if wait_container_state in ["waiting", "running"]:
                    logger.debug("Pod [%s][%s] wait in %s count=%s", jobname,
                                 self.name, wait_container_state, count)
                    pass
                elif wait_container_state == "terminated":
                    logger.debug(
                        "Pod [%s][%s] all containers are terminated. stop() done. count=%s",
                        jobname, self.name, count)
                    return True
                else:
                    logger.warning(
                        "Pod [%s][%s] unknown wait container state, abort. %s. count=%s",
                        jobname, self.name, status_dict, count)
                    return False

            logger.debug("Pod [%s][%s] wait for new event. count=%s", jobname,
                         self.name, count)
            waiter.wait()
            if waiter.result != KubeObjStatusCode.OK:
                logger.info("Pod [%s][%s] waiter return %s, events: %s",
                            jobname, self.name, waiter.result, waiter.details)
            else:
                logger.debug("Pod [%s][%s] waiter return ok count=%s", jobname,
                             self.name, count)