def test_svc_lb_create(kubectl, monitor):
    svc = "test_svc_lb"
    svc_file = PWD + "/testdata/" + svc + ".yml"
    svc_name = "{}-{:08d}".format(svc.replace("_", "-"),
                                  random.randint(1, 99999999))
    kube_obj["name"] = svc_name
    kube_obj["kind"] = KubeApiObjKind.SERVICE
    kube_obj["validator"] = wait_for_svc_lb_validator
    waiter = KubeObjWaiter()
    monitor.wait_for_kube_object(kube_obj=kube_obj,
                                 timeout=DEFAULT_SVC_CREATION_TIMEOUT,
                                 waiter=waiter)

    with open(svc_file, "r") as f:
        data = f.read()
    yaml_obj = [obj for obj in yaml.load_all(data)]
    assert len(yaml_obj) == 1, "Loaded more than 1 yaml obj {}".format(yaml_obj)
    swagger_obj = yaml_to_swagger(yaml_obj[0])
    swagger_obj.metadata.name = svc_name
    create_svc_with_retry(kubectl, swagger_obj)

    waiter.wait()
    delete_svc_with_retry(kubectl, swagger_obj)
    if waiter.result != KubeObjStatusCode.OK:
        logger.info("Service created with status %s, events:\n%s",
                    waiter.result, pformat(waiter.details))
    test_result = waiter.result == KubeObjStatusCode.OK
    assert test_result
def test_pod_create_insufficient_resource(kubectl, monitor):
    pod = "test_pod_insufficient_resource"
    pod_file = PWD + "/testdata/" + pod + ".yml"
    pod_name = "{}-{:08d}".format(pod.replace("_", "-"),
                                  random.randint(1, 99999999))
    kube_obj["name"] = pod_name
    waiter = KubeObjWaiter()
    monitor.wait_for_kube_object(kube_obj=kube_obj,
                                 timeout=DEFAULT_POD_CREATION_TIMEOUT,
                                 waiter=waiter)

    with open(pod_file, "r") as f:
        data = f.read()
    yaml_obj = [obj for obj in yaml.load_all(data)]
    assert len(yaml_obj) == 1, "Loaded more than 1 yaml obj {}".format(yaml_obj)
    swagger_obj = yaml_to_swagger(yaml_obj[0])
    swagger_obj.metadata.name = pod_name
    create_pod_with_retry(kubectl, swagger_obj)

    waiter.wait()
    delete_pod_with_retry(kubectl, swagger_obj)
    if waiter.result != KubeObjStatusCode.ERR_INSUFFICIENT_RESOURCE:
        logger.info("Pod created with status %s, events:\n%s",
                    waiter.result, pformat(waiter.details))
    test_result = waiter.result == KubeObjStatusCode.ERR_INSUFFICIENT_RESOURCE
    assert test_result
def test_pod_create_timeout(monitor):
    # just don't create a pod
    pod = "test_pod_timeout"
    pod_name = "{}-{:08d}".format(pod.replace("_", "-"),
                                  random.randint(1, 99999999))
    kube_obj["name"] = pod_name
    waiter = KubeObjWaiter()
    monitor.wait_for_kube_object(kube_obj=kube_obj, timeout=10, waiter=waiter)

    waiter.wait()
    if waiter.result != KubeObjStatusCode.ERR_PLAT_TASK_CREATE_TIMEOUT:
        logger.info("Pod created with status %s, events:\n%s",
                    waiter.result, pformat(waiter.details))
    test_result = waiter.result == KubeObjStatusCode.ERR_PLAT_TASK_CREATE_TIMEOUT
    assert test_result
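
# The tests above register a validator callback (wait_for_svc_lb_validator,
# wait_for_pvc_validator) with the monitor before creating the object. The real
# validators are defined elsewhere in this test suite; the function below is
# only a hedged sketch of what a Service/ELB validator could look like. It
# assumes, as validator_func in Pod.stop() further down suggests, that the
# validator receives the watched object's status as a dict and returns True
# once the desired state is reached (here: the Service has load balancer ingress).
def example_svc_lb_validator(svc_status):
    if not isinstance(svc_status, dict):
        return False
    ingress = svc_status.get("loadBalancer", {}).get("ingress") or []
    return len(ingress) > 0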
def stop_one(self, name, namespace=AXNameSpaces.AXSYS):
    time.sleep(random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
    logger.info("Deleting %s in namespace %s ...", name, namespace)
    start = time.time()
    kube_obj = self._kube_objects[name]
    kube_obj.namespace = namespace
    kube_obj.replacing = self._replacing
    assert isinstance(kube_obj, KubeObject)

    result = {
        "name": name,
        "code": [],
        "events": [],
        "failed": False,
        "duration": ""
    }

    # Don't delete if object does not exist
    if not kube_obj.exists():
        result["code"] += [
            "{:.25s}:{}".format(name, KubeObjStatusCode.DELETED)
        ]
        result["duration"] = str(round(time.time() - start, 2))
        return result

    monitor_info = kube_obj.get_delete_monitor_info()
    if monitor_info:
        # use monitor
        waiters = []

        # Create and register waiters for all objects that can be monitored
        for m in monitor_info:
            wait_info = {
                "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                "name": m.name,
                "validator": m.validator
            }
            waiter = KubeObjWaiter()
            waiters.append((waiter, wait_info))
            AXKubeMonitor().wait_for_kube_object(
                wait_info, AXPlatformConfigDefaults.ObjDeleteWaitTimeout, waiter)

        # Call kubectl delete
        kube_obj.delete()

        # Wait on all waiters to retrieve status and events
        for waiter, wait_info in waiters:
            waiter.wait()
            result["events"] += waiter.details

            if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                result["code"].append("{:.25s}:{}".format(
                    wait_info["name"], KubeObjStatusCode.DELETED))
                logger.info("Successfully deleted %s in %s with code %s.",
                            wait_info["name"], name, result["code"])
            else:
                result["failed"] = True
                result["code"].append("{:.25s}:{}".format(
                    wait_info["name"], KubeObjStatusCode.UNKNOWN))
                logger.error("Failed to delete %s in %s with code %s. Events: %s",
                             wait_info["name"], name, result["code"],
                             str(waiter.details))

        # Poll once to confirm all components from this Kubernetes config file are gone,
        # in case there are objects in this config file that cannot be monitored,
        # i.e. svc without elb
        if kube_obj.exists():
            logger.error("Object %s deleted but still exists", name)
            result["failed"] = True
            result["code"].append("{:.25s}:{}".format(
                name, KubeObjStatusCode.UNKNOWN))
            result["events"].append(
                "Object {} deleted but still exists.".format(name))
        result["duration"] = str(round(time.time() - start, 2))
        logger.info("Successfully deleted %s.", name)
        return result
    else:
        # use polling
        kube_obj.delete()
        return self._poll_till_not_exists(
            name=name,
            kube_obj=kube_obj,
            start_time=start,
            poll_interval=AXPlatformConfigDefaults.ObjDeletePollInterval,
            poll_max_retry=AXPlatformConfigDefaults.ObjDeletePollMaxRetry,
            rst=result)
def start_one(self, name, namespace=AXNameSpaces.AXSYS):
    time.sleep(random.randint(0, AXPlatformConfigDefaults.ObjectOperationJitter))
    logger.info("Creating %s in namespace %s ...", name, namespace)
    start = time.time()
    kube_obj = self._kube_objects[name]

    # Update them as there are new updates in replacing in platform start
    kube_obj.namespace = namespace
    kube_obj.replacing = self._replacing
    assert isinstance(kube_obj, KubeObject)

    result = {
        "name": name,
        "code": [],
        "events": [],
        "failed": False,
        "duration": ""
    }
    if kube_obj.healthy():
        result["code"] += [
            "{:.25s}:{}".format(name, KubeObjStatusCode.OBJ_EXISTS)
        ]
        result["duration"] = str(round(time.time() - start, 2))
        return result

    # A previous platform start might have failed and left some components created
    # but not healthy (i.e. in CrashLoopBackoff). In this case, we delete the existing
    # object and try to create a new one
    if kube_obj.exists():
        logger.warning(
            "Object %s exists but is not healthy. Deleting object for idempotency ...",
            name)
        self.stop_one(name, namespace)

    assert not kube_obj.exists(), \
        "Kubeobject {} already created but is not healthy. Not expected".format(name)

    monitor_info = kube_obj.get_create_monitor_info()
    if monitor_info:
        # use monitor
        waiters = []

        # Create and register waiters for all objects that can be monitored
        for m in monitor_info:
            wait_info = {
                "kind": KubeKindToKubeApiObjKind[m.kube_kind],
                "name": m.name,
                "validator": m.validator
            }
            waiter = KubeObjWaiter()
            waiters.append((waiter, wait_info))
            AXKubeMonitor().wait_for_kube_object(
                wait_info, AXPlatformConfigDefaults.ObjCreateWaitTimeout, waiter)

        # Call kubectl create
        kube_obj.create()

        # Wait on all waiters to retrieve status and events
        for waiter, wait_info in waiters:
            waiter.wait()
            result["events"] += waiter.details
            result["code"].append("{:.25s}:{}".format(
                wait_info["name"], waiter.result))

            if waiter.result == KubeObjStatusCode.OK or waiter.result == KubeObjStatusCode.WARN:
                logger.info("Successfully created %s with code %s.",
                            wait_info["name"], waiter.result)
            else:
                result["failed"] = True
                logger.error("Failed to create %s in %s with code %s. Events: %s",
                             wait_info["name"], namespace, waiter.result,
                             str(waiter.details))
                if not self._debug:
                    logger.info("Deleting %s due to creation failure", name)
                    del_rst = self.stop_one(name, namespace)
                    result["code"] += del_rst["code"]
                    result["events"] += del_rst["events"]
                    result["duration"] = str(round(time.time() - start, 2))
                    return result

        # Poll extra if required (for PetSets and Deployments with multiple replicas)
        if kube_obj.extra_poll:
            logger.info(
                "Polling till healthy to make sure rest of components of %s are up and running ...",
                name)
            create_rst = self._poll_till_healthy(
                name=name,
                kube_obj=kube_obj,
                start_time=start,
                poll_interval=AXPlatformConfigDefaults.ObjCreateExtraPollInterval,
                poll_max_retry=AXPlatformConfigDefaults.ObjCreateExtraPollMaxRetry,
                rst=result)
            if create_rst["failed"] and not self._debug:
                logger.info("Deleting %s due to creation failure", name)
                del_rst = self.stop_one(name, namespace)
                create_rst["code"] += del_rst["code"]
                create_rst["events"] += del_rst["events"]
            create_rst["duration"] = str(round(time.time() - start, 2))
            return create_rst

        # Poll once to confirm all components from this Kubernetes config file exist,
        # in case there are objects in this config file that cannot be monitored,
        # i.e. svc without elb. This is really not expected so we don't delete it
        if not kube_obj.healthy():
            logger.error(
                "Object %s created but is not healthy. This is NOT EXPECTED, please check manually.",
                name)
            result["code"].append("{:.25s}:{}".format(
                name, KubeObjStatusCode.UNHEALTHY))
            result["failed"] = True
            result["events"].append(
                "Object {} created but is not healthy".format(name))
        result["duration"] = str(round(time.time() - start, 2))

        if not result["failed"]:
            logger.info("Successfully created object %s.", name)
        return result
    else:
        # use polling
        kube_obj.create()
        create_rst = self._poll_till_healthy(
            name=name,
            kube_obj=kube_obj,
            start_time=start,
            poll_interval=AXPlatformConfigDefaults.ObjCreatePollInterval,
            poll_max_retry=AXPlatformConfigDefaults.ObjCreatePollMaxRetry,
            rst=result)
        if create_rst["failed"] and not self._debug:
            logger.info("Deleting %s due to creation failure", name)
            del_rst = self.stop_one(name, namespace)
            create_rst["code"] += del_rst["code"]
            create_rst["events"] += del_rst["events"]
        create_rst["duration"] = str(round(time.time() - start, 2))
        return create_rst
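
# start_one() and stop_one() fall back to polling (_poll_till_healthy /
# _poll_till_not_exists) when an object has no monitorable components. Those
# helpers are not shown in this section; the function below is a hedged sketch
# of their assumed shape only. The parameter list is taken from the call sites
# above; the body, and the choice of OK/UNHEALTHY codes, are assumptions rather
# than the actual implementation: poll kube_obj.healthy() until it succeeds or
# retries run out, recording the outcome in the result dict passed in as rst.
def example_poll_till_healthy(name, kube_obj, start_time, poll_interval,
                              poll_max_retry, rst):
    for _ in range(poll_max_retry):
        if kube_obj.healthy():
            rst["code"].append("{:.25s}:{}".format(name, KubeObjStatusCode.OK))
            rst["duration"] = str(round(time.time() - start_time, 2))
            return rst
        time.sleep(poll_interval)
    rst["failed"] = True
    rst["code"].append("{:.25s}:{}".format(name, KubeObjStatusCode.UNHEALTHY))
    rst["events"].append("Object {} did not become healthy in time".format(name))
    rst["duration"] = str(round(time.time() - start_time, 2))
    return rst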
def stop(self, jobname=None):
    """
    NOTE: This function assumes that a pod is already running.

    This process kills the user command so that artifacts collection can occur.
    Once this is done, the pod will be completed. This call will return when the
    pod is completed. Note: the pod is not deleted (just completed).
    """
    def get_container_status(s, container_name, name):
        if isinstance(s, dict):
            try:
                c_status = s.get("containerStatuses", None)
                for c in c_status or []:
                    n = c.get("name", None)
                    if n == container_name:
                        return c
            except Exception:
                logger.exception("cannot get_container_status for [%s] [%s]",
                                 name, container_name)
        return None

    def get_container_state(s, container_name, name):
        container_status = get_container_status(s, container_name, name=name)
        container_states = ["waiting", "running", "terminated"]
        if isinstance(container_status, dict):
            if "state" in container_status:
                for state_string in container_states:
                    if state_string in container_status["state"]:
                        # wait if state in state_strings
                        logger.debug("state=%s for [%s] [%s]",
                                     state_string, name, container_name)
                        return state_string
                logger.error("unknown state for [%s] [%s]: %s",
                             name, container_name, s)
                return None
            else:
                # No state
                logger.error("no state for [%s] [%s]: %s",
                             name, container_name, s)
                return None
        else:
            # no status
            logger.error("no status for [%s] [%s]: %s",
                         name, container_name, s)
            return None

    def get_pod_phase(s):
        if isinstance(s, dict):
            return s.get("phase", None)
        else:
            return None

    def validator_func(pod_status):
        # always return true for any event
        return True

    def send_kill_signal_to_main_container():
        ax_command_path = "/ax-execu-host/art"
        busybox_command_path = os.path.join(ax_command_path, "busybox-i686")
        bash_path = os.path.join(ax_command_path, "ax_bash_ax")
        touch_command = "{} {}".format(busybox_command_path, "touch")
        pgrep_command = "{} {}".format(busybox_command_path, "pgrep")
        xargs_command = "{} {}".format(busybox_command_path, "xargs")
        kill_command = os.path.join(ax_command_path, "ax_kill_ax")
        cat_command = os.path.join(ax_command_path, "ax_cat_ax")

        # Execute command to initiate user command kill.
        # This command may or may not execute properly if the container is already
        # dying or dead, but it does not matter to us here since we will have a
        # waiter. This command will ensure that if a container is running, it will
        # start the process of terminating.
        # TODO: we may have pods that are started programmatically that do not have artifacts later
        # HACK HACK
        cmd = [
            bash_path, "-c",
            "{touch} {scratch_path}/.ax_delete ; {kill} -9 `{cat} {scratch_path}/.ax_pid` "
            .format(touch=touch_command,
                    scratch_path=ARTIFACTS_CONTAINER_SCRATCH_PATH,
                    pgrep=pgrep_command,
                    xargs=xargs_command,
                    kill=kill_command,
                    cat=cat_command)
        ]
        logger.debug("Trying to gracefully stop main container in [%s][%s]. cmd=%s",
                     jobname, self.name, cmd)
        output = self.exec_commands(cmd)
        logger.debug("Kill output:\n%s", output)

    main_name = self.get_main_container_name()
    wait_name = SIDEKICK_WAIT_CONTAINER_NAME

    logger.debug("About to stop pod [%s][%s]", jobname, self.name)

    count = 0
    while True:
        count += 1
        if count > 180:
            logger.warning("Pod [%s][%s] too many loops, abort. count=%s",
                           jobname, self.name, count)
            return False

        obj = {
            "kind": "pods",
            "name": jobname if jobname else self.name,
            "validator": validator_func
        }
        waiter = KubeObjWaiter()
        monitor = AXKubeMonitor()
        monitor.wait_for_kube_object(obj,
                                     timeout=DELETE_WAITER_WAIT_TIMEOUT,
                                     waiter=waiter)

        # read status here
        read_count = 0
        while True:
            read_count += 1
            if read_count > 180:
                logger.warning("Pod [%s][%s] too many retries, abort. count=%s",
                               jobname, self.name, count)
                return False
            try:
                status = self.client.api.read_namespaced_pod_status(
                    self.namespace, self.name).status
                assert isinstance(status, swagger_client.V1PodStatus)
                status_dict = swagger_client.ApiClient().sanitize_for_serialization(status)
                break
            except Exception:
                # xxx todo: what if self.name is not there?
                logger.exception(
                    "exception in get status for Pod [%s][%s] retry=%s count=%s",
                    jobname, self.name, read_count, count)
                time.sleep(10)
                continue

        main_container_state = get_container_state(status_dict, main_name, self.name)
        wait_container_state = get_container_state(status_dict, wait_name, self.name)
        pod_phase = get_pod_phase(status_dict)
        logger.debug("Pod [%s][%s] phase=%s. main=%s, wait=%s count=%s",
                     jobname, self.name, pod_phase,
                     main_container_state, wait_container_state, count)

        if main_container_state == "waiting":
            logger.debug("Pod [%s][%s] main in %s count=%s",
                         jobname, self.name, main_container_state, count)
        elif main_container_state == "running":
            logger.debug("Pod [%s][%s] main in %s count=%s",
                         jobname, self.name, main_container_state, count)
            send_kill_signal_to_main_container()
        elif main_container_state is None:
            if pod_phase == "Pending":
                logger.debug("Pod [%s][%s] in %s phase count=%s",
                             jobname, self.name, pod_phase, count)
            else:
                logger.warning(
                    "Pod [%s][%s] unknown main container state, abort. %s count=%s",
                    jobname, self.name, status_dict, count)
                return False
        else:
            assert main_container_state == "terminated", \
                "bad state {}".format(main_container_state)
            if wait_container_state in ["waiting", "running"]:
                logger.debug("Pod [%s][%s] wait in %s count=%s",
                             jobname, self.name, wait_container_state, count)
                pass
            elif wait_container_state == "terminated":
                logger.debug(
                    "Pod [%s][%s] all containers are terminated. stop() done. count=%s",
                    jobname, self.name, count)
                return True
            else:
                logger.warning(
                    "Pod [%s][%s] unknown wait container state, abort. %s. count=%s",
                    jobname, self.name, status_dict, count)
                return False

        logger.debug("Pod [%s][%s] wait for new event. count=%s",
                     jobname, self.name, count)
        waiter.wait()
        if waiter.result != KubeObjStatusCode.OK:
            logger.info("Pod [%s][%s] waiter returned %s, events: %s",
                        jobname, self.name, waiter.result, waiter.details)
        else:
            logger.debug("Pod [%s][%s] waiter returned ok count=%s",
                         jobname, self.name, count)
def test_volume_create_delete(kubectl, monitor, kubepoll):
    pvc = "test_pvc"
    pvc_label = "app=testpvc"
    pvc_file = PWD + "/testdata/" + pvc + ".yml"
    pvc_name = "{}-{:08d}".format(pvc.replace("_", "-"),
                                  random.randint(1, 99999999))
    kube_obj["name"] = pvc_name
    kube_obj["kind"] = KubeApiObjKind.PVC
    kube_obj["validator"] = wait_for_pvc_validator
    waiter = KubeObjWaiter()
    monitor.wait_for_kube_object(kube_obj=kube_obj,
                                 timeout=DEFAULT_PVC_CREATION_TIMEOUT,
                                 waiter=waiter)

    with open(pvc_file, "r") as f:
        data = f.read()
    yaml_obj = [obj for obj in yaml.load_all(data)]
    assert len(yaml_obj) == 1, "Loaded more than 1 yaml obj {}".format(yaml_obj)
    swagger_obj = yaml_to_swagger(yaml_obj[0])
    swagger_obj.metadata.name = pvc_name
    # Manually patch access mode as the swagger client mistakenly interprets this as a map
    swagger_obj.spec.access_modes = ["ReadWriteOnce"]
    create_pvc_with_retry(kubectl, swagger_obj)

    waiter.wait()
    try:
        if waiter.result != KubeObjStatusCode.OK:
            logger.info("PVC created with status %s, events:\n%s",
                        waiter.result, pformat(waiter.details))
        test_result = waiter.result == KubeObjStatusCode.OK
        assert test_result

        pvcs = kubepoll.poll_kubernetes_sync(KubeKind.PVC, TEST_NAMESPACE, pvc_label)
        pvc = None
        for p in pvcs.items:
            if p.metadata.name == pvc_name:
                pvc = p
                break
        assert pvc

        kube_obj["name"] = pvc.spec.volume_name
        kube_obj["kind"] = KubeApiObjKind.PV
        kube_obj["validator"] = pv_release_validator
        waiter = KubeObjWaiter()
        monitor.wait_for_kube_object(kube_obj=kube_obj,
                                     timeout=DEFAULT_PV_DELETE_TIMEOUT,
                                     waiter=waiter)
        delete_pvc_with_retry(kubectl, swagger_obj)
        waiter.wait()
        if waiter.result != KubeObjStatusCode.OK:
            logger.info("PV released with status %s, events:\n%s",
                        waiter.result, pformat(waiter.details))
        test_result = (waiter.result == KubeObjStatusCode.OK or
                       waiter.result == KubeObjStatusCode.WARN)
        assert test_result
    except Exception as e:
        delete_pvc_with_retry(kubectl, swagger_obj)
        raise e
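
# The tests above call helpers such as create_pvc_with_retry and
# delete_pvc_with_retry, whose implementations are not shown in this section.
# The function below is a hedged, generic sketch of the assumed shape of such a
# wrapper: retry an API call a few times with a fixed sleep, logging transient
# failures and re-raising on the last attempt. The attempt count and interval
# are made-up values, and it assumes the module-level logger and time module
# used elsewhere in this file are available.
def _example_call_with_retry(func, *args, **kwargs):
    max_attempts = 3       # assumed value, not from the source
    retry_interval = 5     # assumed value, not from the source
    for attempt in range(1, max_attempts + 1):
        try:
            return func(*args, **kwargs)
        except Exception:
            logger.exception("Attempt %s/%s of %s failed",
                             attempt, max_attempts,
                             getattr(func, "__name__", func))
            if attempt == max_attempts:
                raise
            time.sleep(retry_interval)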