Exemplo n.º 1
0
Arquivo: pod.py Projeto: zhan849/argo
    def _init_containers_spec(self):
        init_c = []

        if "init" not in self.cmap:
            return json.dumps(init_c)

        for c in self.cmap["init"] or []:
            c_spec = c.generate_spec()
            c_formatted = swagger_client.ApiClient().sanitize_for_serialization(c_spec)
            init_c.append(c_formatted)

        return json.dumps(init_c)
Exemplo n.º 2
0
    def stop(self, jobname=None):
        """
        Stop the user command in a running pod and wait for the pod to complete.

        NOTE: This function assumes that a pod is already running.
        This process kills the user command so that artifacts collection can occur.
        Once this is done, the pod will be completed. This call returns when the
        pod is completed. Note: the pod is not deleted (just completed).

        Args:
            jobname: Optional name used in log messages and, when truthy, as the
                name of the pod object to watch for events (``self.name`` is
                watched otherwise). The pod status itself is always read for
                ``self.name``.

        Returns:
            True once both the main container and the sidekick wait container
            report the "terminated" state. False when the loop gives up: after
            180 event-loop iterations, after 180 failed status reads within one
            iteration, or when a container state cannot be determined (and, for
            the main container, the pod is not in the "Pending" phase).
        """
        # Upper bounds that keep this call from blocking forever.
        max_loops = 180
        max_read_retries = 180
        read_retry_delay = 10  # seconds between failed status reads

        def get_container_status(s, container_name, name):
            # Return the containerStatuses entry named container_name, or None.
            if isinstance(s, dict):
                try:
                    c_status = s.get("containerStatuses", None)
                    for c in c_status or []:
                        if c.get("name", None) == container_name:
                            return c
                except Exception:
                    logger.exception(
                        "cannot get_container_status for [%s] [%s]", name,
                        container_name)

            return None

        def get_container_state(s, container_name, name):
            # Return "waiting", "running" or "terminated" for the container,
            # or None (after logging an error) when it cannot be determined.
            container_status = get_container_status(s,
                                                    container_name,
                                                    name=name)
            if not isinstance(container_status, dict):
                # no status
                logger.error("no status for [%s] [%s]: %s", name,
                             container_name, s)
                return None

            if "state" not in container_status:
                # No state
                logger.error("no state for [%s] [%s]: %s", name,
                             container_name, s)
                return None

            for state_string in ("waiting", "running", "terminated"):
                if state_string in container_status["state"]:
                    logger.debug("state=%s for [%s] [%s]",
                                 state_string, name, container_name)
                    return state_string

            logger.error("unknown state for [%s] [%s]: %s", name,
                         container_name, s)
            return None

        def get_pod_phase(s):
            # Return the pod phase string, or None when s is not a dict.
            if isinstance(s, dict):
                return s.get("phase", None)
            return None

        def validator_func(pod_status):
            # always return true for any event
            return True

        def send_kill_signal_to_main_container():
            ax_command_path = "/ax-execu-host/art"
            busybox_command_path = os.path.join(ax_command_path,
                                                "busybox-i686")
            bash_path = os.path.join(ax_command_path, "ax_bash_ax")
            touch_command = "{} {}".format(busybox_command_path, "touch")
            kill_command = os.path.join(ax_command_path, "ax_kill_ax")
            cat_command = os.path.join(ax_command_path, "ax_cat_ax")

            # execute command to initiate user command kill
            # This command may or may not execute properly if the container is already dying or dead
            # but it does not matter to us here since we will have a waiter. This command will ensure
            # that if a container is running, it will start the process of terminating
            # TODO: we may have pods that are started programmatically that do not have artifacts later
            # HACK HACK
            cmd = [
                bash_path, "-c",
                "{touch} {scratch_path}/.ax_delete ;  {kill} -9 `{cat} {scratch_path}/.ax_pid` "
                .format(touch=touch_command,
                        scratch_path=ARTIFACTS_CONTAINER_SCRATCH_PATH,
                        kill=kill_command,
                        cat=cat_command)
            ]
            logger.debug(
                "Try gracefully stop main container in [%s][%s]. cmd=%s",
                jobname, self.name, cmd)
            output = self.exec_commands(cmd)
            logger.debug("Kill output:\n%s", output)

        main_name = self.get_main_container_name()
        wait_name = SIDEKICK_WAIT_CONTAINER_NAME

        logger.debug("About to stop pod [%s][%s]", jobname, self.name)

        count = 0
        while True:
            count += 1
            if count > max_loops:
                logger.warning("Pod [%s][%s] too many loops, abort. count=%s",
                               jobname, self.name, count)
                return False

            # Register the waiter before reading the status so that an event
            # arriving between the read and waiter.wait() is not missed.
            obj = {
                "kind": "pods",
                "name": jobname if jobname else self.name,
                "validator": validator_func
            }
            waiter = KubeObjWaiter()
            monitor = AXKubeMonitor()
            monitor.wait_for_kube_object(obj,
                                         timeout=DELETE_WAITER_WAIT_TIMEOUT,
                                         waiter=waiter)

            # read status here
            read_count = 0
            while True:
                read_count += 1
                if read_count > max_read_retries:
                    logger.warning(
                        "Pod [%s][%s] too many retry, abort. retry=%s count=%s",
                        jobname, self.name, read_count, count)
                    return False
                try:
                    status = self.client.api.read_namespaced_pod_status(
                        self.namespace, self.name).status
                    # Explicit check instead of assert so it still runs under
                    # `python -O`; a failure is logged and retried below.
                    if not isinstance(status, swagger_client.V1PodStatus):
                        raise TypeError(
                            "unexpected pod status type {}".format(
                                type(status)))
                    status_dict = swagger_client.ApiClient(
                    ).sanitize_for_serialization(status)
                    break
                except Exception:
                    # xxx todo: what if self.name is not there?
                    logger.exception(
                        "exception in get status for Pod [%s][%s] retry=%s count=%s",
                        jobname, self.name, read_count, count)
                    time.sleep(read_retry_delay)
                    continue

            main_container_state = get_container_state(status_dict, main_name,
                                                       self.name)
            wait_container_state = get_container_state(status_dict, wait_name,
                                                       self.name)
            pod_phase = get_pod_phase(status_dict)
            logger.debug("Pod [%s][%s] phase=%s. main=%s, wait=%s count=%s",
                         jobname, self.name, pod_phase, main_container_state,
                         wait_container_state, count)

            if main_container_state == "waiting":
                logger.debug("Pod [%s][%s] main in %s count=%s", jobname,
                             self.name, main_container_state, count)
            elif main_container_state == "running":
                logger.debug("Pod [%s][%s] main in %s count=%s", jobname,
                             self.name, main_container_state, count)
                send_kill_signal_to_main_container()
            elif main_container_state is None:
                if pod_phase == "Pending":
                    logger.debug("Pod [%s][%s] in %s phase count=%s", jobname,
                                 self.name, pod_phase, count)
                else:
                    logger.warning(
                        "Pod [%s][%s] unknown main container state, abort. %s count=%s",
                        jobname, self.name, status_dict, count)
                    return False
            else:
                # Internal invariant: get_container_state only returns one of
                # the three known states or None.
                assert main_container_state == "terminated", "bad state {}".format(
                    main_container_state)
                if wait_container_state in ["waiting", "running"]:
                    logger.debug("Pod [%s][%s] wait in %s count=%s", jobname,
                                 self.name, wait_container_state, count)
                elif wait_container_state == "terminated":
                    logger.debug(
                        "Pod [%s][%s] all containers are terminated. stop() done. count=%s",
                        jobname, self.name, count)
                    return True
                else:
                    logger.warning(
                        "Pod [%s][%s] unknown wait container state, abort. %s. count=%s",
                        jobname, self.name, status_dict, count)
                    return False

            logger.debug("Pod [%s][%s] wait for new event. count=%s", jobname,
                         self.name, count)
            waiter.wait()
            if waiter.result != KubeObjStatusCode.OK:
                logger.info("Pod [%s][%s] waiter return %s, events: %s",
                            jobname, self.name, waiter.result, waiter.details)
            else:
                logger.debug("Pod [%s][%s] waiter return ok count=%s", jobname,
                             self.name, count)