def run_simple_tfjob(self, component):
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client, self.namespace, self.name,
      ["Running", "Succeeded", "Failed"],
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # Check for creation failures.
  creation_failures = tf_job_client.get_creation_failures_from_tfjob(
      api_client, self.namespace, results)
  if creation_failures:
    # TODO(jlewi): Starting with
    # https://github.com/kubeflow/tf-operator/pull/646 the number of events
    # no longer seems to match the expected; it looks like maybe events
    # are being combined? For now we just log a warning rather than an
    # error.
    logging.warning(creation_failures)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
def run_test(args, test_case):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  namespace, name, env = test_runner.setup_ks_app(args)
  t.name = os.path.basename(name)

  try:  # pylint: disable=too-many-nested-blocks
    util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)
    logging.info("Created job %s in namespace %s", name, namespace)

    logging.info("Wait for conditions Succeeded or Failed")
    results = tf_job_client.wait_for_condition(
        api_client, namespace, name, ["Succeeded", "Failed"],
        status_callback=tf_job_client.log_status)

    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    # For v1alpha2, check that the last condition is Failed.
    last_condition = results.get("status", {}).get("conditions", [])[-1]
    if last_condition.get("type", "").lower() != "failed":
      message = "Job {0} in namespace {1} did not fail; status {2}".format(
          name, namespace, results.get("status", {}))
      logging.error(message)
      test_case.add_failure_info(message)
      return

    # re.match anchors at the start of the string, so the leading ".*"
    # lets the phrase match anywhere in the condition message.
    pattern = ".*the spec is invalid.*"
    condition_message = last_condition.get("message", "")
    if not re.match(pattern, condition_message):
      message = "Condition message {0} did not match pattern {1}".format(
          condition_message, pattern)
      logging.error(message)
      test_case.add_failure_info(message)
  except tf_operator_util.JobTimeoutError as e:
    if e.job:
      spec = "Job:\n" + json.dumps(e.job, indent=2)
    else:
      spec = "JobTimeoutError did not contain job"
    message = ("Timeout waiting for {0} in namespace {1} to "
               "finish; ").format(name, namespace) + spec
    logging.exception(message)
    test_case.add_failure_info(message)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message
    # "status"; in an effort to nail down this exception we print out more
    # information about it.
    message = "There was a problem running the job; Exception {0}".format(e)
    logging.exception(message)
    test_case.add_failure_info(message)
def test_tf_job_simple(test_case):  # pylint: disable=redefined-outer-name
  args = parse_args()
  namespace = "default"
  name = "tf-job-simple"
  util.load_kube_config()
  api_client = k8s_client.ApiClient()
  create_app_and_job(args, namespace, name)
  try:
    tf_job_client.wait_for_condition(
        api_client, namespace, name, ["Running"],
        status_callback=tf_job_client.log_status)
    logging.info("TFJob launched successfully")
  except Exception as e:
    logging.error("Test failed waiting for job; %s", e)
    # Use str(e) rather than the Python-2-only e.message.
    test_case.add_failure_info(str(e))
def run_tfjob_with_shutdown_policy(self, component, shutdown_policy):
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client, self.namespace, self.name,
      ["Running", "Succeeded", "Failed"], version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Terminate the replica named by the shutdown policy.
  if shutdown_policy == "worker":
    tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                     "worker", 1)
  else:
    tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                     "chief", 1)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
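# A hedged usage sketch for run_tfjob_with_shutdown_policy. The component
# names below are placeholders, not names from this repo's ksonnet app:
#
#   self.run_tfjob_with_shutdown_policy("shutdown_worker", "worker")
#   self.run_tfjob_with_shutdown_policy("shutdown_chief", "chief")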
def run_distributed_training_job(self, component):
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client, self.namespace, self.name,
      ["Running", "Succeeded", "Failed"], version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # Check for creation failures.
  creation_failures = tf_job_client.get_creation_failures_from_tfjob(
      api_client, self.namespace, results)
  if creation_failures:
    logging.warning(creation_failures)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
def test_invalid_tfjob_spec(self):
  api_client = k8s_client.ApiClient()
  component = INVALID_TFJOB_COMPONENT_NAME + "_" + self.tfjob_version

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  logging.info("Wait for condition Failed")
  results = tf_job_client.wait_for_condition(
      api_client, self.namespace, self.name, ["Failed"],
      version=self.tfjob_version, status_callback=tf_job_client.log_status)

  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  # For v1alpha2, check that the last condition is Failed.
  last_condition = results.get("status", {}).get("conditions", [])[-1]
  if last_condition.get("type", "").lower() != "failed":
    self.failure = "Job {0} in namespace {1} did not fail; status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # re.match anchors at the start of the string, so the leading ".*"
  # lets the phrase match anywhere in the condition message.
  pattern = ".*the spec is invalid.*"
  condition_message = last_condition.get("message", "")
  if not re.match(pattern, condition_message):
    self.failure = "Condition message {0} did not match pattern {1}".format(
        condition_message, pattern)
    logging.error(self.failure)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
def test_tfjob_and_verify_runconfig(self):
  api_client = k8s_client.ApiClient()
  masterHost = api_client.configuration.host

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                       COMPONENT_NAME, self.params)

  # Create the TF job
  util.run(["ks", "apply", self.env, "-c", COMPONENT_NAME],
           cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client, self.namespace, self.name,
      ["Running", "Succeeded", "Failed"],
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  num_ps = results.get("spec", {}).get("tfReplicaSpecs", {}).get(
      "PS", {}).get("replicas", 0)
  num_workers = results.get("spec", {}).get("tfReplicaSpecs", {}).get(
      "Worker", {}).get("replicas", 0)
  verify_runconfig(masterHost, self.namespace, self.name, "chief", num_ps,
                   num_workers)
  verify_runconfig(masterHost, self.namespace, self.name, "worker", num_ps,
                   num_workers)
  verify_runconfig(masterHost, self.namespace, self.name, "ps", num_ps,
                   num_workers)

  tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                   "chief", 1)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
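# For context, verify_runconfig is defined elsewhere in this repo; roughly,
# it checks that each replica sees a run config consistent with num_ps and
# num_workers. The sketch below shows the general shape of a standard
# TensorFlow TF_CONFIG environment variable; the host names are illustrative
# assumptions, not values taken from this repo:
#
#   TF_CONFIG = json.dumps({
#       "cluster": {
#           "chief": ["<name>-chief-0:2222"],
#           "ps": ["<name>-ps-0:2222", ...],          # num_ps entries
#           "worker": ["<name>-worker-0:2222", ...],  # num_workers entries
#       },
#       "task": {"type": "chief", "index": 0},
#       "environment": "cloud",
#   })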
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  # TODO(jlewi): When using GKE we should copy the .kube config and any
  # other files to the test directory. We should then set the environment
  # variable KUBECONFIG to point at that file. This should prevent us from
  # having to rerun util.configure_kubectl on each step. Instead we could
  # run it once as part of GKE cluster creation and store the config in the
  # NFS directory. This would make the handling of credentials and
  # KUBECONFIG more consistent between GKE and minikube, and eventually
  # this could be extended to other K8s deployments.
  if cluster_name:
    util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()
  masterHost = api_client.configuration.host

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  namespace, name, env = _setup_ks_app(args)
  t.name = os.path.basename(name)

  start = time.time()

  try:  # pylint: disable=too-many-nested-blocks
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with
    # the same name.
    # TODO(jlewi): We should make this an argument.
    num_trials = 2
    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

      logging.info("Created job %s in namespace %s", name, namespace)
      logging.info("tfjob_version=%s", args.tfjob_version)

      # Wait for the job to either be in Running state or a terminal state
      if args.tfjob_version == "v1alpha1":
        logging.info("Wait for Phase Running, Done, or Failed")
        results = tf_job_client.wait_for_phase(
            api_client, namespace, name, ["Running", "Done", "Failed"],
            status_callback=tf_job_client.log_status)
      else:
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client, namespace, name,
            ["Running", "Succeeded", "Failed"],
            status_callback=tf_job_client.log_status)

      logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

      # The job is now either running or done.
      if args.shutdown_policy:
        logging.info("Enforcing shutdownPolicy %s", args.shutdown_policy)
        if args.shutdown_policy in ["master", "chief"]:
          if args.tfjob_version == "v1alpha1":
            replica = "master"
          else:
            replica = "chief"
        elif args.shutdown_policy in ["worker", "all_workers"]:
          replica = "worker"
        else:
          raise ValueError("Unrecognized shutdown_policy "
                           "%s" % args.shutdown_policy)

        # Number of targets.
        num_targets = 1
        if args.shutdown_policy in ["all_workers"]:
          # Assume v1alpha2
          num_targets = results.get("spec", {}).get(
              "tfReplicaSpecs", {}).get("Worker", {}).get("replicas", 0)
          logging.info("There are %s worker replicas", num_targets)

        if args.tfjob_version == "v1alpha1":
          runtime_id = results.get("spec", {}).get("RuntimeId")
          target = "{name}-{replica}-{runtime}".format(
              name=name, replica=replica, runtime=runtime_id)
          pod_labels = get_labels(name, runtime_id)
          pod_selector = to_selector(pod_labels)
        else:
          target = "{name}-{replica}".format(name=name, replica=replica)
          pod_labels = get_labels_v1alpha2(namespace, name)
          pod_selector = to_selector(pod_labels)

        # Wait for the pods to be ready before we shut down.
        # TODO(jlewi): We get pods using a label selector, so there is a
        # risk that the pod we actually care about isn't present.
        logging.info("Waiting for pods to be running before shutting down.")
        wait_for_pods_to_be_in_phases(
            api_client, namespace, pod_selector, ["Running"],
            timeout=datetime.timedelta(minutes=4))
        logging.info("Pods are ready")
        logging.info("Issuing the terminate request")
        for num in range(num_targets):
          full_target = target + "-{0}".format(num)
          terminateReplica(masterHost, namespace, full_target)

      logging.info("Waiting for job to finish.")
      results = tf_job_client.wait_for_job(
          api_client, namespace, name, args.tfjob_version,
          status_callback=tf_job_client.log_status)

      if args.tfjob_version == "v1alpha1":
        if results.get("status", {}).get("state", {}).lower() != "succeeded":
          t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
              trial, name, namespace,
              results.get("status", {}).get("state", None))
          logging.error(t.failure)
          break
      else:
        # For v1alpha2, check that the last condition is Succeeded.
        last_condition = results.get("status", {}).get("conditions", [])[-1]
        if last_condition.get("type", "").lower() != "succeeded":
          t.failure = "Trial {0} Job {1} in namespace {2} in status {3}".format(
              trial, name, namespace, results.get("status", {}))
          logging.error(t.failure)
          break

      runtime_id = results.get("spec", {}).get("RuntimeId")
      logging.info("Trial %s Job %s in namespace %s runtime ID %s", trial,
                   name, namespace, runtime_id)

      uid = results.get("metadata", {}).get("uid")
      events = get_events(api_client, namespace, uid)
      for e in events:
        logging.info("K8s event: %s", e.message)

      # Print out the full K8s events because they can be useful for
      # debugging.
      for e in events:
        logging.info("Received K8s Event:\n%s", e)
      created_pods, created_services = parse_events(events)

      num_expected = 0
      if args.tfjob_version == "v1alpha1":
        for replica in results.get("spec", {}).get("replicaSpecs", []):
          num_expected += replica.get("replicas", 0)
      else:
        for replicakey in results.get("spec", {}).get("tfReplicaSpecs", {}):
          replica_spec = results.get("spec", {}).get(
              "tfReplicaSpecs", {}).get(replicakey, {})
          if replica_spec:
            num_expected += replica_spec.get("replicas", 1)

      creation_failures = []
      if len(created_pods) != num_expected:
        message = ("Expected {0} pods to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_pods))
        creation_failures.append(message)

      if len(created_services) != num_expected:
        message = ("Expected {0} services to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_services))
        creation_failures.append(message)

      if creation_failures:
        # TODO(jlewi): Starting with
        # https://github.com/kubeflow/tf-operator/pull/646 the number of
        # events no longer seems to match the expected; it looks like maybe
        # events are being combined? For now we just log a warning rather
        # than an error.
        logging.warning(creation_failures)

      if args.tfjob_version == "v1alpha1":
        pod_labels = get_labels(name, runtime_id)
        pod_selector = to_selector(pod_labels)
      else:
        pod_labels = get_labels_v1alpha2(name)
        pod_selector = to_selector(pod_labels)

      # We don't wait for pods to be deleted in v1alpha2 because
      # CleanPodPolicy means completed pods won't be deleted.
      # TODO(jlewi): We should add a test to deal with deleted pods.
      if args.tfjob_version == "v1alpha1":
        wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)

      tf_job_client.delete_tf_job(api_client, namespace, name,
                                  version=args.tfjob_version)

      logging.info("Waiting for job %s in namespace %s to be deleted.",
                   name, namespace)
      wait_for_delete(api_client, namespace, name, args.tfjob_version,
                      status_callback=tf_job_client.log_status)

    # TODO(jlewi):
    # Here are some validation checks to run:
    # 1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various
    # resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
        name, namespace)
    logging.exception(t.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message
    # "status"; in an effort to nail down this exception we print out more
    # information about it.
    logging.exception("There was a problem running the job; Exception %s", e)
    # We want to catch all exceptions because we want to mark the test as
    # failed. Use str(e) rather than the Python-2-only e.message.
    t.failure = "Exception occurred; type {0} message {1}".format(
        e.__class__, str(e))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
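# A hedged sketch of the TODO at the top of run_test above: persist the
# kubeconfig once (e.g. right after GKE cluster creation) and point later
# steps at it via the KUBECONFIG environment variable, instead of rerunning
# util.configure_kubectl on every step. The helper name and paths below are
# illustrative assumptions, not part of this repo.
import os
import shutil

def persist_kubeconfig(test_dir):  # hypothetical helper
  """Copy the active kubeconfig into test_dir and export KUBECONFIG."""
  src = os.path.expanduser("~/.kube/config")
  dst = os.path.join(test_dir, "kubeconfig")
  shutil.copyfile(src, dst)
  # Child processes (kubectl, ks, the k8s client) pick this up automatically.
  os.environ["KUBECONFIG"] = dst
  return dst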
def test_pod_names(self):
  api_client = k8s_client.ApiClient()
  component = COMPONENT_NAME + "_" + self.tfjob_version
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)
  util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client, self.namespace, self.name,
      ["Running", "Succeeded", "Failed"], version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  job_specs = extract_job_specs(
      results.get("spec", {}).get("tfReplicaSpecs", {}))
  expected_pod_names = []
  for replica_type, replica_num in job_specs.items():
    logging.info("job_type = %s, replica = %s", replica_type, replica_num)
    for i in range(replica_num):
      expected_pod_names.append(
          POD_NAME_FORMAT.format(name=self.name, replica=replica_type,
                                 index=i))
  expected_pod_names = set(expected_pod_names)
  actual_pod_names = tf_job_client.get_pod_names(api_client, self.namespace,
                                                 self.name)

  # We cannot guarantee that the pods selected by the default namespace and
  # job name belong to this test run only, so we only do a partial check:
  # make sure the expected pod names are a subset of the selected pod names.
  if not expected_pod_names.issubset(actual_pod_names):
    msg = "Actual pod names don't match. Expected: {0} Actual: {1}".format(
        str(expected_pod_names), str(actual_pod_names))
    logging.error(msg)
    raise RuntimeError(msg)

  tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                   "chief", 1)
  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
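# Note: POD_NAME_FORMAT is defined elsewhere in this module. The operator
# names pods "{job}-{replica-type}-{index}" (e.g. "mnist-worker-0"), so a
# plausible definition — an assumption here, not copied from the repo — is:
#
#   POD_NAME_FORMAT = "{name}-{replica}-{index}"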
def run_tfjob_with_replica_restart_policy(self, component,
                                          replica_restart_policy, exit_code):
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client, self.namespace, self.name,
      ["Running", "Succeeded", "Failed"], version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Determine whether the PS replica should be restarted after termination,
  # given the restart policy and exit code.
  if replica_restart_policy == "Always":
    expect_restart = True
  elif replica_restart_policy == "OnFailure":
    expect_restart = exit_code != 0
  elif replica_restart_policy == "Never":
    expect_restart = False
  elif replica_restart_policy == "ExitCode":
    # With ExitCode, exit code 1 is treated as a permanent failure
    # (no restart); other codes lead to a restart.
    expect_restart = exit_code != 1
  else:
    expect_restart = True

  res = tf_job_client.terminate_and_verify_start_time(
      api_client, self.namespace, self.name, "ps", 0, exit_code,
      expect_restart)

  if res is False:
    self.failure = ("Job {0} in namespace {1} with restart policy {2} "
                    "failed test with exit_code {3}").format(
                        self.name, self.namespace, replica_restart_policy,
                        exit_code)
    logging.error(self.failure)
    return

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
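# For reference, the policy/exit-code expectations exercised above as a
# table (True = the PS replica is expected to restart):
#
#   policy      exit_code=0   exit_code=1
#   Always         True          True
#   OnFailure      False         True
#   Never          False         False
#   ExitCode       True          False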
def run_tfjob_with_cleanpod_policy(self, component, clean_pod_policy):
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client, self.namespace, self.name,
      ["Running", "Succeeded", "Failed"], version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  if clean_pod_policy == "All":
    # All pods, including completed ones, are deleted.
    pod_labels = tf_job_client.get_labels(self.name)
    pod_selector = tf_job_client.to_selector(pod_labels)
    k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                         pod_selector)
  elif clean_pod_policy == "Running":
    # Only running pods (PS) are deleted; completed pods are not.
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "Chief", ["Succeeded"])
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "Worker", ["Succeeded"])
    pod_labels = tf_job_client.get_labels(self.name, "PS")
    pod_selector = tf_job_client.to_selector(pod_labels)
    k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                         pod_selector)
  elif clean_pod_policy == "None":
    # No pods are deleted.
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "Chief", ["Succeeded"])
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "Worker", ["Succeeded"])
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "PS", ["Running"])

  # Delete the TFJob.
  tf_job_client.delete_tf_job(
      api_client, self.namespace, self.name, version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
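# For context, cleanPodPolicy is a field on the TFJob spec. A minimal
# sketch of the fragment the branches above exercise (v1alpha2 layout; the
# rest of the spec is elided):
#
#   apiVersion: kubeflow.org/v1alpha2
#   kind: TFJob
#   spec:
#     cleanPodPolicy: Running   # or: All / None
#     tfReplicaSpecs: ...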
# Hack to try using the tf_job_client to get the status of a job.
# Use this to diagnose why v1alpha2 TFJob tests are failing.
from kubernetes import client as k8s_client

from kubeflow.testing import util
from py import tf_job_client

name = "tfjob-issue-summarization"
namespace = "kubeflow"

util.load_kube_config()
client = k8s_client.ApiClient()
masterHost = client.configuration.host

results = tf_job_client.wait_for_condition(
    client, namespace, name, ["Running", "Succeeded", "Failed"],
    status_callback=tf_job_client.log_status)

print("Done")