def run_simple_tfjob(self, component):
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
    api_client,
    self.namespace,
    self.name, ["Running", "Succeeded", "Failed"],
    version=self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
      self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # Check for creation failures.
  creation_failures = tf_job_client.get_creation_failures_from_tfjob(
    api_client, self.namespace, results)
  if creation_failures:
    # TODO(jlewi): Starting with
    # https://github.com/kubeflow/tf-operator/pull/646 the number of events
    # no longer seems to match what we expect; it looks like events may be
    # getting combined. For now we just log a warning rather than an error.
    logging.warning(creation_failures)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(
    api_client, self.namespace, self.name, version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
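# For orientation, a minimal sketch of the polling loop a helper like
# tf_job_client.wait_for_condition presumably implements: read the TFJob
# custom resource and scan status.conditions for a matching condition type.
# The group/plural values and the unbounded loop are simplifying
# assumptions; the real helper also handles timeouts and API errors.
def _wait_for_condition_sketch(api_client, namespace, name, expected,
                               version="v1", polling_interval=10):
  import time  # local import to keep the sketch self-contained
  crd_api = k8s_client.CustomObjectsApi(api_client)
  while True:
    results = crd_api.get_namespaced_custom_object(
      "kubeflow.org", version, namespace, "tfjobs", name)
    for condition in results.get("status", {}).get("conditions", []):
      if condition.get("type", "") in expected and \
         condition.get("status", "") == "True":
        return results
    time.sleep(polling_interval)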
def run_tfjob_with_shutdown_policy(self, component, shutdown_policy):
  tf_operator_util.load_kube_config()
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                component, self.params)

  # Create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
    api_client,
    self.namespace,
    self.name, ["Running", "Succeeded", "Failed"],
    version=self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  if shutdown_policy == "worker":
    tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                     "worker", 1)
  else:
    tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                     "chief", 1)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
      self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # Delete the TFJob.
  tf_job_client.delete_tf_job(
    api_client, self.namespace, self.name, version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
def run_distributed_training_job(self, component):
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
    api_client,
    self.namespace,
    self.name, ["Running", "Succeeded", "Failed"],
    version=self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
      self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # Check for creation failures.
  creation_failures = tf_job_client.get_creation_failures_from_tfjob(
    api_client, self.namespace, results)
  if creation_failures:
    logging.warning(creation_failures)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(
    api_client, self.namespace, self.name, version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
def test_train(self):
  # We repeat the test multiple times.
  # This ensures that if we delete the job we can create a new job with the
  # same name.
  api_client = k8s_client.ApiClient()

  component = "tfjob"
  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  util.run([self.ks_cmd, "apply", self.env, "-c", component],
           cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client,
    self.namespace,
    self.name,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  # Check for errors creating pods and services. Can potentially
  # help debug failed test runs.
  creation_failures = tf_job_client.get_creation_failures_from_tfjob(
    api_client, self.namespace, results)
  if creation_failures:
    logging.warning(creation_failures)

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
      self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return
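# tf_job_client.job_succeeded is used throughout these tests; a plausible
# minimal implementation, assuming success is signaled by a terminal
# "Succeeded" condition on the TFJob status (a sketch, not the actual
# library code):
def _job_succeeded_sketch(results):
  for condition in results.get("status", {}).get("conditions", []):
    if condition.get("type") == "Succeeded" and \
       condition.get("status") == "True":
      return True
  return False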
def test_train(self):
  # We repeat the test multiple times.
  # This ensures that if we delete the job we can create a new job with the
  # same name.
  api_client = k8s_client.ApiClient()

  # Setup parameters for kustomize
  # TODO(jinchihe): Should enhance here after the kustomize util is created.
  configmap = 'mnist-map-gcs'
  for pair in self.params.split(","):
    k, v = pair.split("=", 1)
    if k == "namespace" or k == "image":
      util.run(["kustomize", "edit", "set", k, v], cwd=self.app_dir)
    elif k == "numPs":
      util.run(["./definition.sh", "--numPs", v], cwd=self.app_dir)
    elif k == "numWorkers":
      util.run(["./definition.sh", "--numWorkers", v], cwd=self.app_dir)
    elif k == "secret":
      secretName, secretMountPath = v.split("=", 1)
      util.run([
        "kustomize", "edit", "add", "configmap", configmap,
        "--from-literal=secretName=" + secretName
      ], cwd=self.app_dir)
      util.run([
        "kustomize", "edit", "add", "configmap", configmap,
        "--from-literal=secretMountPath=" + secretMountPath
      ], cwd=self.app_dir)
    elif k == "envVariables":
      var_k, var_v = v.split("=", 1)
      util.run([
        "kustomize", "edit", "add", "configmap", configmap,
        "--from-literal=" + var_k + "=" + var_v
      ], cwd=self.app_dir)
    else:
      util.run([
        "kustomize", "edit", "add", "configmap", configmap,
        "--from-literal=" + k + "=" + v
      ], cwd=self.app_dir)

  # Create the TF job.
  # util.run cannot handle pipes, so invoke the shell via subprocess.
  subprocess.check_call(
    "kustomize build . | kubectl apply -f -", shell=True, cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client,
    self.namespace,
    self.name,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  # Check for errors creating pods and services. Can potentially
  # help debug failed test runs.
  creation_failures = tf_job_client.get_creation_failures_from_tfjob(
    api_client, self.namespace, results)
  if creation_failures:
    logging.warning(creation_failures)

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
      self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return
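# A hypothetical walk-through of the parameter loop above: with self.params
# set to "namespace=kubeflow,numWorkers=2,batchSize=100" (values invented
# for illustration), the commands run in self.app_dir would be:
#
#   kustomize edit set namespace kubeflow
#   ./definition.sh --numWorkers 2
#   kustomize edit add configmap mnist-map-gcs --from-literal=batchSize=100
#   kustomize build . | kubectl apply -f -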
def test_tfjob_and_verify_runconfig(self):
  api_client = k8s_client.ApiClient()
  masterHost = api_client.configuration.host
  component = COMPONENT_NAME + "_" + self.tfjob_version

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
    api_client,
    self.namespace,
    self.name, ["Running", "Succeeded", "Failed"],
    version=self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  num_ps = results.get("spec", {}).get("tfReplicaSpecs",
                                       {}).get("PS", {}).get("replicas", 0)
  num_workers = results.get("spec", {}).get("tfReplicaSpecs", {}).get(
    "Worker", {}).get("replicas", 0)
  verify_runconfig(masterHost, self.namespace, self.name, "chief", num_ps,
                   num_workers)
  verify_runconfig(masterHost, self.namespace, self.name, "worker", num_ps,
                   num_workers)
  verify_runconfig(masterHost, self.namespace, self.name, "ps", num_ps,
                   num_workers)
  verify_runconfig(masterHost, self.namespace, self.name, "evaluator",
                   num_ps, num_workers)

  tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                   "chief", 1)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
      self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(
    api_client, self.namespace, self.name, version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
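# verify_runconfig is defined elsewhere in this file. For orientation, the
# cluster spec it validates is a function of the replica counts read from
# the TFJob spec above; a sketch of the expected shape, where the host
# naming scheme and port are illustrative assumptions:
def _expected_cluster_spec_sketch(name, num_ps, num_workers, port=2222):
  return {
    "chief": ["{0}-chief-0:{1}".format(name, port)],
    "ps": ["{0}-ps-{1}:{2}".format(name, i, port) for i in range(num_ps)],
    "worker": [
      "{0}-worker-{1}:{2}".format(name, i, port)
      for i in range(num_workers)
    ],
  }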
def test_training(record_xml_attribute, tfjob_name, namespace, trainer_image,
                  num_ps, num_workers, train_steps, batch_size,
                  learning_rate, model_dir, export_dir):  # pylint: disable=too-many-arguments
  util.set_pytest_junit(record_xml_attribute, "test_mnist")

  util.maybe_activate_service_account()

  app_dir = os.path.join(os.path.dirname(__file__), "../training/GCS")
  app_dir = os.path.abspath(app_dir)
  logging.info("--app_dir not set; defaulting to: %s", app_dir)

  # TODO(jinchihe): Using kustomize 2.0.3 to work around the issue below:
  # https://github.com/kubernetes-sigs/kustomize/issues/1295
  kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
           'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
  util.run(['wget', '-q', '-O', '/usr/local/bin/kustomize', kusUrl],
           cwd=app_dir)
  util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=app_dir)

  # TODO(jinchihe): kubectl needs to be upgraded to 1.14.0 due to the issue:
  # Invalid object doesn't have additional properties ...
  kusUrl = 'https://storage.googleapis.com/kubernetes-release/' \
           'release/v1.14.0/bin/linux/amd64/kubectl'
  util.run(['wget', '-q', '-O', '/usr/local/bin/kubectl', kusUrl],
           cwd=app_dir)
  util.run(['chmod', 'a+x', '/usr/local/bin/kubectl'], cwd=app_dir)

  # Configure custom parameters using kustomize
  util.run(['kustomize', 'edit', 'set', 'namespace', namespace], cwd=app_dir)
  util.run(['kustomize', 'edit', 'set', 'image',
            'training-image=' + trainer_image], cwd=app_dir)

  util.run(['../base/definition.sh', '--numPs', num_ps], cwd=app_dir)
  util.run(['../base/definition.sh', '--numWorkers', num_workers],
           cwd=app_dir)

  training_config = {
    "name": tfjob_name,
    "trainSteps": train_steps,
    "batchSize": batch_size,
    "learningRate": learning_rate,
    "modelDir": model_dir,
    "exportDir": export_dir,
  }
  configmap = 'mnist-map-training'
  for key, value in training_config.items():
    util.run([
      'kustomize', 'edit', 'add', 'configmap', configmap,
      '--from-literal=' + key + '=' + value
    ], cwd=app_dir)

  # Create the TFJob.
  util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'],
           cwd=app_dir)
  util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)
  logging.info("Created job %s in namespace %s", tfjob_name, namespace)

  kube_config.load_kube_config()
  api_client = k8s_client.ApiClient()

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client, namespace, tfjob_name,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  # Check for errors creating pods and services. Can potentially
  # help debug failed test runs.
  creation_failures = tf_job_client.get_creation_failures_from_tfjob(
    api_client, namespace, results)
  if creation_failures:
    logging.warning(creation_failures)

  if not tf_job_client.job_succeeded(results):
    failure = "Job {0} in namespace {1} in status {2}".format(
      tfjob_name, namespace, results.get("status", {}))
    logging.error(failure)

    # If the TFJob failed, print out the pod logs for debugging.
    pod_names = tf_job_client.get_pod_names(api_client, namespace,
                                            tfjob_name)
    logging.info("The pod names:\n %s", pod_names)
    core_api = k8s_client.CoreV1Api(api_client)
    for pod in pod_names:
      logging.info("Getting logs of Pod %s.", pod)
      try:
        pod_logs = core_api.read_namespaced_pod_log(pod, namespace)
        logging.info("The logs of Pod %s:\n %s", pod, pod_logs)
      except k8s_client.rest.ApiException as e:
        logging.info(
          "Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n",
          e)
    return
def test_train(self):
  # We repeat the test multiple times.
  # This ensures that if we delete the job we can create a new job with the
  # same name.
  api_client = k8s_client.ApiClient()

  # TODO(jinchihe): The code below will be removed once the new test-worker
  # image is published in https://github.com/kubeflow/testing/issues/373.
  kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
           'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
  util.run(['wget', '-O', '/usr/local/bin/kustomize', kusUrl],
           cwd=self.app_dir)
  util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=self.app_dir)

  # Setup parameters for kustomize
  configmap = 'mnist-map-training'
  for pair in self.params.split(","):
    k, v = pair.split("=", 1)
    if k == "namespace":
      util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
    elif k == "image":
      util.run(['kustomize', 'edit', 'set', k, 'training-image=' + v],
               cwd=self.app_dir)
    elif k == "numPs":
      util.run(['../base/definition.sh', '--numPs', v], cwd=self.app_dir)
    elif k == "numWorkers":
      util.run(['../base/definition.sh', '--numWorkers', v],
               cwd=self.app_dir)
    elif k == "secret":
      secretName, secretMountPath = v.split("=", 1)
      util.run([
        'kustomize', 'edit', 'add', 'configmap', configmap,
        '--from-literal=secretName=' + secretName
      ], cwd=self.app_dir)
      util.run([
        'kustomize', 'edit', 'add', 'configmap', configmap,
        '--from-literal=secretMountPath=' + secretMountPath
      ], cwd=self.app_dir)
    elif k == "envVariables":
      var_k, var_v = v.split("=", 1)
      util.run([
        'kustomize', 'edit', 'add', 'configmap', configmap,
        '--from-literal=' + var_k + '=' + var_v
      ], cwd=self.app_dir)
    else:
      util.run([
        'kustomize', 'edit', 'add', 'configmap', configmap,
        '--from-literal=' + k + '=' + v
      ], cwd=self.app_dir)

  # Create the TF job.
  # util.run cannot handle pipes, so use subprocess.check_call instead.
  subCmd = 'kustomize build ' + self.app_dir + ' | kubectl apply -f -'
  subprocess.check_call(subCmd, shell=True)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client,
    self.namespace,
    self.name,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  # Check for errors creating pods and services. Can potentially
  # help debug failed test runs.
  creation_failures = tf_job_client.get_creation_failures_from_tfjob(
    api_client, self.namespace, results)
  if creation_failures:
    logging.warning(creation_failures)

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
      self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)

    # If the TFJob failed, print out the pod logs for debugging.
    pod_names = tf_job_client.get_pod_names(api_client, self.namespace,
                                            self.name)
    logging.info("The pod names:\n %s", pod_names)
    core_api = k8s_client.CoreV1Api(api_client)
    for pod in pod_names:
      logging.info("Getting logs of Pod %s.", pod)
      try:
        pod_logs = core_api.read_namespaced_pod_log(pod, self.namespace)
        logging.info("The logs of Pod %s:\n %s", pod, pod_logs)
      except k8s_client.rest.ApiException as e:
        logging.info(
          "Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n",
          e)
    return
def run_tfjob_with_cleanpod_policy(self, component, clean_pod_policy):
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)

  # Create the TF job
  util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
    api_client,
    self.namespace,
    self.name, ["Running", "Succeeded", "Failed"],
    version=self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
      self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # All pods are deleted.
  if clean_pod_policy == "All":
    pod_labels = tf_job_client.get_labels(self.name)
    pod_selector = tf_job_client.to_selector(pod_labels)
    k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                         pod_selector)
  # Only running pods (PS) are deleted; completed pods are not.
  elif clean_pod_policy == "Running":
    tf_job_client.wait_for_replica_type_in_phases(
      api_client, self.namespace, self.name, "Chief", ["Succeeded"])
    tf_job_client.wait_for_replica_type_in_phases(
      api_client, self.namespace, self.name, "Worker", ["Succeeded"])
    pod_labels = tf_job_client.get_labels(self.name, "PS")
    pod_selector = tf_job_client.to_selector(pod_labels)
    k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                         pod_selector)
  # No pods are deleted.
  elif clean_pod_policy == "None":
    tf_job_client.wait_for_replica_type_in_phases(
      api_client, self.namespace, self.name, "Chief", ["Succeeded"])
    tf_job_client.wait_for_replica_type_in_phases(
      api_client, self.namespace, self.name, "Worker", ["Succeeded"])
    tf_job_client.wait_for_replica_type_in_phases(
      api_client, self.namespace, self.name, "PS", ["Running"])

  # Delete the TFJob.
  tf_job_client.delete_tf_job(
    api_client, self.namespace, self.name, version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
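# For context, a sketch of what a helper like
# k8s_util.wait_for_pods_to_be_deleted plausibly does: poll
# list_namespaced_pod with the TFJob's label selector until no pods remain.
# The unbounded loop is a simplification; the real helper bounds the wait
# with a timeout.
def _wait_for_pods_deleted_sketch(api_client, namespace, pod_selector,
                                  polling_interval=10):
  import time  # local import to keep the sketch self-contained
  core_api = k8s_client.CoreV1Api(api_client)
  while True:
    pods = core_api.list_namespaced_pod(namespace,
                                        label_selector=pod_selector)
    if not pods.items:
      return
    time.sleep(polling_interval)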
def test_pod_names(self):
  api_client = k8s_client.ApiClient()
  component = COMPONENT_NAME + "_" + self.tfjob_version

  ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                       self.params)
  util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
    api_client,
    self.namespace,
    self.name, ["Running", "Succeeded", "Failed"],
    version=self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  job_specs = extract_job_specs(
    results.get("spec", {}).get("tfReplicaSpecs", {}))
  expected_pod_names = []
  for replica_type, replica_num in job_specs.items():
    logging.info("job_type = %s, replica = %s", replica_type, replica_num)
    for i in range(replica_num):
      expected_pod_names.append(
        POD_NAME_FORMAT.format(name=self.name, replica=replica_type,
                               index=i))
  expected_pod_names = set(expected_pod_names)
  actual_pod_names = tf_job_client.get_pod_names(api_client, self.namespace,
                                                 self.name)

  # We cannot guarantee that pods selected by the default namespace and job
  # name belong to this test run only. Therefore we only do a partial check,
  # i.e. make sure the expected set of pod names is a subset of the selected
  # pod names.
  if not (expected_pod_names & actual_pod_names) == expected_pod_names:
    msg = "Actual pod names don't match. Expected: {0} Actual: {1}".format(
      str(expected_pod_names), str(actual_pod_names))
    logging.error(msg)
    raise RuntimeError(msg)

  tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                   "chief", 1)
  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
      self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(
    api_client, self.namespace, self.name, version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
    api_client,
    self.namespace,
    self.name,
    self.tfjob_version,
    status_callback=tf_job_client.log_status)
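# extract_job_specs is defined elsewhere in this file; a sketch of the
# mapping it is expected to produce from the tfReplicaSpecs dict, assuming
# replica types are lower-cased to match POD_NAME_FORMAT:
def _extract_job_specs_sketch(replica_specs):
  specs = {}
  for replica_type, spec in replica_specs.items():
    specs[replica_type.lower()] = spec.get("replicas", 0)
  return specs
#
# For example, {"Worker": {"replicas": 2}} would yield {"worker": 2}, and
# the test above would then expect pods named like "<name>-worker-0" and
# "<name>-worker-1".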