def evict_pod(api_instance, name, namespace):
    """
    Evict a given pod so that it will be rescheduled on a schedulable node.

    We catch errors and log them here, but we don't halt the whole process
    because the node needs to go down regardless. This makes our eviction a
    "best effort".

    Parameters:
        api_instance (object): The K8S API object to use
        name (string): The name of the pod to evict
        namespace (string): The namespace the pod to evict is in
    """
    print("Evicting " + name + " in namespace " + namespace + "!")
    delete_options = client.V1DeleteOptions()
    # After checking pod status for 12 minutes, we'll assume any remaining
    # pods won't evict, and tell the lifecycle hook to move on. Let's do
    # 12 minutes + 30 seconds to ungracefully terminate any remaining pods,
    # yet still let us list any pods that will be ungracefully terminated.
    delete_options.grace_period_seconds = 750
    metadata = client.V1ObjectMeta(name=name, namespace=namespace)
    body = client.V1beta1Eviction(metadata=metadata,
                                  api_version="policy/v1beta1",
                                  kind="Eviction",
                                  delete_options=delete_options)
    try:
        api_instance.create_namespaced_pod_eviction(name=name,
                                                    namespace=namespace,
                                                    body=body)
    except ApiException as e:
        print("Exception when evicting %s: %s\n" % (name, e))
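# A minimal caller sketch for the evict_pod variant above, assuming the
# official kubernetes Python client is already configured; the helper name
# evict_all_pods_on_node and the node_name parameter are illustrative
# assumptions, not part of the original snippet.
def evict_all_pods_on_node(node_name):
    core_api = client.CoreV1Api()
    pods = core_api.list_pod_for_all_namespaces(
        field_selector="spec.nodeName={}".format(node_name)).items
    for pod in pods:
        # Best effort: evict_pod logs and swallows ApiException itself.
        evict_pod(core_api, pod.metadata.name, pod.metadata.namespace)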
def evict_pod(pod):
    delete_options = client.V1DeleteOptions(grace_period_seconds=30)
    eviction = client.V1beta1Eviction(delete_options=delete_options,
                                      metadata=pod.metadata)
    if DRYRUN:
        print("DRYRUN: skipping evict_pod step")
    else:
        v1.create_namespaced_pod_eviction(
            name=pod.metadata.name,
            namespace=pod.metadata.namespace,
            body=eviction
        )
        annotate_pod(pod)
def evict_pod(pod):
    delete_options = client.V1DeleteOptions(grace_period_seconds=30)
    eviction = client.V1beta1Eviction(delete_options=delete_options,
                                      metadata=pod.metadata)
    if pod.metadata.namespace in ["kube-system"]:
        print("PROTECTED: not evicting from {}: {}".format(
            pod.metadata.namespace, pod.metadata.name))
    elif DRYRUN:
        print("DRYRUN: skipping evict_pod step for {}/{}".format(
            pod.metadata.namespace, pod.metadata.name))
    else:
        v1.create_namespaced_pod_eviction(name=pod.metadata.name,
                                          namespace=pod.metadata.namespace,
                                          body=eviction)
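# The two DRYRUN variants above rely on module-level globals (v1, DRYRUN)
# that are not shown. A minimal setup sketch, assuming in-cluster execution
# and an environment flag; the env var name is an assumption.
import os
from kubernetes import client, config

config.load_incluster_config()
v1 = client.CoreV1Api()
DRYRUN = os.environ.get("DRYRUN", "false").lower() == "true"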
def drain_node(node: str):
    core_v1.patch_node(node, body={"spec": {"unschedulable": True}})
    pods = core_v1.list_pod_for_all_namespaces(
        field_selector=f'spec.nodeName={node}').items
    for pod in pods:
        if is_daemonset_pod(pod):
            continue
        pod_meta = client.V1ObjectMeta(name=pod.metadata.name,
                                       namespace=pod.metadata.namespace)
        eviction = client.V1beta1Eviction(
            metadata=pod_meta,
            delete_options=client.V1DeleteOptions(grace_period_seconds=60))
        core_v1.create_namespaced_pod_eviction(
            name=pod.metadata.name,
            namespace=pod.metadata.namespace,
            body=eviction)
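# drain_node above calls an is_daemonset_pod helper that is not shown. A
# minimal sketch of what such a helper could look like, assuming pods come
# from the official kubernetes Python client; the exact check is an
# assumption.
def is_daemonset_pod(pod) -> bool:
    # Treat a pod as DaemonSet-managed if any owner reference is a
    # controlling DaemonSet.
    owners = pod.metadata.owner_references or []
    return any(o.kind == "DaemonSet" and o.controller for o in owners)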
def evict_pod(name, namespace):
    """
    Evict a single pod from a node.
    """
    logger = get_logger("evict_pod")
    api_instance = client.CoreV1Api()
    ev = client.V1beta1Eviction()
    ev.metadata = client.V1ObjectMeta()
    ev.metadata.name = name
    ev.metadata.namespace = namespace
    ev.delete_options = client.V1DeleteOptions()
    try:
        api_instance.create_namespaced_pod_eviction(name=name,
                                                    namespace=namespace,
                                                    body=ev)
    except Exception as e:
        logger.debug(e)
        raise Exception("Failed to evict pod " + name + ": " + str(e))
def drain_nodes(name: str = None, label_selector: str = None,
                delete_pods_with_local_storage: bool = False,
                timeout: int = 120, secrets: Secrets = None) -> bool:
    """
    Drain nodes matching the given label or name, so that no pods are
    scheduled on them any longer and running pods are evicted.

    It does a similar job to `kubectl drain --ignore-daemonsets` or
    `kubectl drain --delete-local-data --ignore-daemonsets` if
    `delete_pods_with_local_storage` is set to `True`. There is no
    equivalent to the `kubectl drain --force` flag.

    You probably want to call `uncordon` from your experiment's rollbacks.
    """
    # first let's make the node unschedulable
    cordon_node(name=name, label_selector=label_selector, secrets=secrets)

    api = create_k8s_api_client(secrets)
    v1 = client.CoreV1Api(api)

    if name:
        ret = v1.list_node(field_selector="metadata.name={}".format(name))
        logger.debug("Found {d} node named '{s}'".format(
            d=len(ret.items), s=name))
    else:
        ret = v1.list_node(label_selector=label_selector)
        logger.debug("Found {d} node(s) labelled '{s}'".format(
            d=len(ret.items), s=label_selector))

    nodes = ret.items
    if not nodes:
        raise FailedActivity(
            "failed to find a node that matches selector {}".format(
                label_selector))

    for node in nodes:
        node_name = node.metadata.name
        ret = v1.list_pod_for_all_namespaces(
            include_uninitialized=True,
            field_selector="spec.nodeName={}".format(node_name))
        logger.debug("Found {d} pods on node '{n}'".format(
            d=len(ret.items), n=node_name))

        if not ret.items:
            continue

        # following the drain command from kubectl as best as we can
        eviction_candidates = []
        for pod in ret.items:
            name = pod.metadata.name
            phase = pod.status.phase
            volumes = pod.spec.volumes
            annotations = pod.metadata.annotations

            # do not handle mirror pods
            if annotations and "kubernetes.io/config.mirror" in annotations:
                logger.debug("Not deleting mirror pod '{}' on "
                             "node '{}'".format(name, node_name))
                continue

            if any(filter(lambda v: v.empty_dir is not None, volumes)):
                logger.debug(
                    "Pod '{}' on node '{}' has a volume made "
                    "of a local storage".format(name, node_name))
                if not delete_pods_with_local_storage:
                    logger.debug("Not evicting a pod with local storage")
                    continue
                logger.debug("Deleting anyway due to flag")
                eviction_candidates.append(pod)
                continue

            if phase in ["Succeeded", "Failed"]:
                eviction_candidates.append(pod)
                continue

            for owner in pod.metadata.owner_references:
                if owner.controller and owner.kind != "DaemonSet":
                    eviction_candidates.append(pod)
                    break
                elif owner.kind == "DaemonSet":
                    logger.debug(
                        "Pod '{}' on node '{}' is owned by a DaemonSet. Will "
                        "not evict it".format(name, node_name))
                    break
            else:
                raise FailedActivity(
                    "Pod '{}' on node '{}' is unmanaged, cannot drain this "
                    "node. Delete it manually first?".format(name, node_name))

        if not eviction_candidates:
            logger.debug("No pods to evict. Let's return.")
            return True

        logger.debug("Found {} pods to evict".format(len(eviction_candidates)))
        for pod in eviction_candidates:
            eviction = client.V1beta1Eviction()
            eviction.metadata = client.V1ObjectMeta()
            eviction.metadata.name = pod.metadata.name
            eviction.metadata.namespace = pod.metadata.namespace
            eviction.delete_options = client.V1DeleteOptions()
            try:
                v1.create_namespaced_pod_eviction(
                    pod.metadata.name, pod.metadata.namespace, body=eviction)
            except ApiException as x:
                raise FailedActivity(
                    "Failed to evict pod {}: {}".format(
                        pod.metadata.name, x.body))

        pods = eviction_candidates[:]
        started = time.time()
        while True:
            logger.debug("Waiting for {} pods to go".format(len(pods)))
            if time.time() - started > timeout:
                remaining_pods = "\n".join([p.metadata.name for p in pods])
                raise FailedActivity(
                    "Draining nodes did not complete within {}s. "
                    "Remaining pods are:\n{}".format(timeout, remaining_pods))

            pending_pods = pods[:]
            for pod in pods:
                try:
                    p = v1.read_namespaced_pod(
                        pod.metadata.name, pod.metadata.namespace)
                    # rescheduled elsewhere?
                    if p.metadata.uid != pod.metadata.uid:
                        pending_pods.remove(pod)
                        continue
                    logger.debug("Pod '{}' still around in phase: {}".format(
                        p.metadata.name, p.status.phase))
                except ApiException as x:
                    if x.status == 404:
                        # gone...
                        pending_pods.remove(pod)

            pods = pending_pods[:]
            if not pods:
                logger.debug("Evicted all pods we could")
                break

            time.sleep(10)

    return True
def drain_node(node_name):
    ret = v1.list_pod_for_all_namespaces(
        field_selector="spec.nodeName={}".format(node_name))
    if not ret.items:
        # nothing scheduled on this node, we are done
        return True

    # following the drain command from kubectl as best as we can
    eviction_candidates = []
    for pod in ret.items:
        name = pod.metadata.name
        phase = pod.status.phase
        volumes = pod.spec.volumes
        annotations = pod.metadata.annotations

        # do not handle mirror pods
        if annotations and "kubernetes.io/config.mirror" in annotations:
            logger.debug("Not deleting mirror pod '{}' on "
                         "node '{}'".format(name, node_name))
            continue

        if any(filter(lambda v: v.empty_dir is not None, volumes)):
            logger.debug("Pod '{}' on node '{}' has a volume made "
                         "of a local storage".format(name, node_name))
            if not delete_pods_with_local_storage:
                logger.debug("Not evicting a pod with local storage")
                continue
            logger.debug("Deleting anyway due to flag")
            eviction_candidates.append(pod)
            continue

        if phase in ["Succeeded", "Failed"]:
            eviction_candidates.append(pod)
            continue

        for owner in pod.metadata.owner_references:
            if owner.controller and owner.kind != "DaemonSet":
                eviction_candidates.append(pod)
                break
            elif owner.kind == "DaemonSet":
                logger.debug(
                    "Pod '{}' on node '{}' is owned by a DaemonSet. Will "
                    "not evict it".format(name, node_name))
                break
        else:
            raise ActivityFailed(
                "Pod '{}' on node '{}' is unmanaged, cannot drain this "
                "node. Delete it manually first?".format(name, node_name))

    if not eviction_candidates:
        logger.debug("No pods to evict. Let's return.")
        return True

    logger.debug("Found {} pods to evict".format(len(eviction_candidates)))
    for pod in eviction_candidates:
        eviction = client.V1beta1Eviction()
        eviction.metadata = client.V1ObjectMeta()
        eviction.metadata.name = pod.metadata.name
        eviction.metadata.namespace = pod.metadata.namespace
        eviction.delete_options = client.V1DeleteOptions()
        try:
            v1.create_namespaced_pod_eviction(pod.metadata.name,
                                              pod.metadata.namespace,
                                              body=eviction)
        except ApiException as x:
            raise ActivityFailed("Failed to evict pod {}: {}".format(
                pod.metadata.name, x.body))

    pods = eviction_candidates[:]
    started = time.time()
    while True:
        logger.debug("Waiting for {} pods to go".format(len(pods)))
        if time.time() - started > timeout:
            remaining_pods = "\n".join([p.metadata.name for p in pods])
            raise ActivityFailed(
                "Draining nodes did not complete within {}s. "
                "Remaining pods are:\n{}".format(timeout, remaining_pods))

        pending_pods = pods[:]
        for pod in pods:
            try:
                p = v1.read_namespaced_pod(pod.metadata.name,
                                           pod.metadata.namespace)
                # rescheduled elsewhere?
                if p.metadata.uid != pod.metadata.uid:
                    pending_pods.remove(pod)
                    continue
                logger.debug("Pod '{}' still around in phase: {}".format(
                    p.metadata.name, p.status.phase))
            except ApiException as x:
                if x.status == 404:
                    # gone...
                    pending_pods.remove(pod)

        pods = pending_pods[:]
        if not pods:
            logger.debug("Evicted all pods we could")
            break

        time.sleep(10)

    return True
def toolkit_clean_evicted_pod(cluster, task_id=None):
    cluster = KubernetesCluster.objects.get(name=cluster)
    if task_id is None:
        user = User.objects.get_or_create(username="******")[0]
        cleaner = toolKitKubernetesCleaner(
            user=user,
            status="RUNNING",
            cluster=cluster,
            task_id=toolkit_clean_evicted_pod.request.id)
    else:
        cleaner = toolKitKubernetesCleaner.objects.get(id=task_id)
        cleaner.status = "RUNNING"
        cleaner.task_id = toolkit_clean_evicted_pod.request.id
    dingding.send(
        title=cleaner.status,
        content="### **{}** @{} \n > * TASK: {}\n > * ID: {} \n > * TASK_ID: {}".format(
            cleaner.status.upper(), cleaner.user.profile.phone,
            'toolkit_clean_evicted_pod', cleaner.id, cleaner.task_id),
        users=[cleaner.user.profile.phone])
    try:
        cleaner.save()
        client = KubernetesNamespace.get_client(cluster_config=cluster.config)
        v1 = client.AppsV1Api()
        coreV1 = client.CoreV1Api()
        pods = coreV1.list_pod_for_all_namespaces().items
        __pod_list = []
        for pod in pods:
            if pod.status.phase == "Failed" and pod.status.reason == "Evicted":
                namespace = pod.metadata.namespace
                pod_name = pod.metadata.name
                body = client.V1beta1Eviction(metadata=client.V1ObjectMeta(
                    name=pod_name, namespace=namespace))
                response = coreV1.create_namespaced_pod_eviction(
                    name=pod_name, namespace=namespace, body=body, pretty=True)
                __pod_list.append("{}/{}".format(namespace, pod_name))
    except Exception as err:
        cleaner.status = "FAILURE"
        cleaner.save()
        dingding.send(
            title=cleaner.status,
            content="### **{}** @{} \n > * TASK: {}\n > * ID: {} \n > * TASK_ID: {}".format(
                cleaner.status.upper(), cleaner.user.profile.phone,
                'toolkit_clean_evicted_pod', cleaner.id, cleaner.task_id),
            users=[cleaner.user.profile.phone])
        raise Exception(err)
    else:
        cleaner.status = "SUCCESS"
        cleaner.pods = __pod_list
        cleaner.save()
        dingding.send(
            title=cleaner.status,
            content="### **{}** @{} \n > * TASK: {}\n > * ID: {} \n > * TASK_ID: {}".format(
                cleaner.status.upper(), cleaner.user.profile.phone,
                'toolkit_clean_evicted_pod', cleaner.id, cleaner.task_id),
            users=[cleaner.user.profile.phone])
        return "Pod: {} is evicted!".format(__pod_list)