def terminate_replicas(api_client, namespace, name, replica, num_targets,
                       exit_code=0):
    """Terminates the specified replica(s).

    Waits (up to 4 minutes) for the pods matching the job's replica labels to
    be in the "Running" phase, then issues one terminate request per replica
    index in ``range(num_targets)``; targets are named
    ``{name}-{replica}-{index}``.

    Args:
      api_client: K8s client
      namespace: K8s namespace
      name: TFJob name
      replica: Replica type (chief, worker, ps)
      num_targets: Number of replicas to terminate.
      exit_code: What exit code to terminate the pods with.
    """
    target = "{name}-{replica}".format(name=name, replica=replica)

    # The labels are built from the job name and the replica type, using the
    # same argument order as wait_for_replica_type_in_phases. Passing the
    # namespace here would build a selector from the wrong values.
    pod_labels = get_labels(name, replica)
    pod_selector = to_selector(pod_labels)
    master_host = api_client.configuration.host

    # Wait for the pods to be ready before we shut them down.
    # TODO(jlewi): We get pods using a label selector, so there is a risk
    # that the pod we actually care about isn't present.
    logging.info("Waiting for pods to be running before shutting down.")
    k8s_util.wait_for_pods_to_be_in_phases(
        api_client,
        namespace,
        pod_selector, ["Running"],
        timeout=datetime.timedelta(minutes=4))
    logging.info("Pods are ready")

    logging.info("Issuing the terminate request")
    for num in range(num_targets):
        full_target = "{0}-{1}".format(target, num)
        terminate_replica(master_host, namespace, full_target, exit_code)
def wait_for_replica_type_in_phases(api_client, namespace, tfjob_name,
                                    replica_type, phases):
    """Waits for the pods of one replica type to be in the given phases.

    Builds a label selector from the job name and replica type and delegates
    the wait to k8s_util.wait_for_pods_to_be_in_phases with a 4 minute
    timeout.

    Args:
      api_client: K8s client
      namespace: K8s namespace
      tfjob_name: TFJob name
      replica_type: Replica type used to build the pod labels.
      phases: Phases forwarded unchanged to
        k8s_util.wait_for_pods_to_be_in_phases.
    """
    selector = to_selector(get_labels(tfjob_name, replica_type))
    wait_limit = datetime.timedelta(minutes=4)
    k8s_util.wait_for_pods_to_be_in_phases(
        api_client, namespace, selector, phases, timeout=wait_limit)