예제 #1
0
def terminate_replicas(api_client,
                       namespace,
                       name,
                       replica,
                       num_targets,
                       exit_code=0):
    """Request termination of the first ``num_targets`` replicas of a TFJob.

    The pods matched by the job's label selector are first awaited (through
    ``k8s_util.wait_for_pods_to_be_in_phases`` with a 4 minute timeout and
    the single phase "Running"). Afterwards ``terminate_replica`` is called
    once per target named ``<name>-<replica>-<index>`` for every index in
    ``range(num_targets)``.

    Args:
      api_client: K8s client; its ``configuration.host`` is forwarded to
        ``terminate_replica`` as the host to contact.
      namespace: K8s namespace.
      name: TFJob name.
      replica: Replica type (chief, worker, ps).
      num_targets: Number of replicas to terminate.
      exit_code: Exit code forwarded to ``terminate_replica`` for each pod.
    """
    replica_prefix = "{name}-{replica}".format(name=name, replica=replica)
    # NOTE(review): get_labels is invoked with (namespace, name) -- confirm
    # this matches the helper's expected argument order.
    selector = to_selector(get_labels(namespace, name))
    api_host = api_client.configuration.host

    # Do not shut anything down until the pods are up.
    # TODO(jlewi): Pods are looked up with a label selector, so there is a
    # risk that the pod we actually care about isn't among them.
    logging.info("Waiting for pods to be running before shutting down.")
    required_phases = ["Running"]
    max_wait = datetime.timedelta(minutes=4)
    k8s_util.wait_for_pods_to_be_in_phases(
        api_client, namespace, selector, required_phases, timeout=max_wait)
    logging.info("Pods are ready")

    logging.info("Issuing the terminate request")
    index = 0
    while index < num_targets:
        # Replica names carry their ordinal as the final dash-separated part.
        replica_name = replica_prefix + "-{0}".format(index)
        terminate_replica(api_host, namespace, replica_name, exit_code)
        index += 1
예제 #2
0
def wait_for_replica_type_in_phases(api_client, namespace, tfjob_name,
                                    replica_type, phases):
    """Wait for the pods of one replica type of a TFJob to be in ``phases``.

    A label selector is derived from ``get_labels(tfjob_name, replica_type)``
    and handed to ``k8s_util.wait_for_pods_to_be_in_phases`` together with a
    fixed 4 minute timeout. Nothing is returned.

    Args:
      api_client: K8s client passed through to the wait helper.
      namespace: K8s namespace passed through to the wait helper.
      tfjob_name: TFJob name used to build the pod labels.
      replica_type: Replica type used to build the pod labels.
      phases: Pod phases forwarded unchanged to the wait helper.
    """
    selector = to_selector(get_labels(tfjob_name, replica_type))
    max_wait = datetime.timedelta(minutes=4)
    k8s_util.wait_for_pods_to_be_in_phases(
        api_client, namespace, selector, phases, timeout=max_wait)