# NOTE: `run` and `Context` come from the invoke library; `eks_utils`, `LOGGER`,
# and `GitHubHandler` are assumed to be provided by the surrounding test
# package, so the exact import paths below may need adjustment.
import os

from invoke import run
from invoke.context import Context

import test.test_utils.eks as eks_utils
from dlc.github_handler import GitHubHandler
from test.test_utils import LOGGER


def _run_eks_multi_node_training_mpijob(namespace, job_name, remote_yaml_file_path):
    """
    Run an EKS multinode training MPIJob from a pre-rendered manifest
    """
    run(f"kubectl create namespace {namespace}")

    try:
        training_job_start = run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}", warn=True)
        if training_job_start.return_code:
            raise RuntimeError(f"Failed to start {job_name}:\n{training_job_start.stderr}")

        LOGGER.info("Check pods")
        run(f"kubectl get pods -n {namespace} -o wide")

        complete_pod_name = eks_utils.is_mpijob_launcher_pod_ready(namespace, job_name)
        _, pod_name = complete_pod_name.split("/")
        LOGGER.info(f"The pods have been created and the name of the launcher pod is {pod_name}")

        LOGGER.info(f"Wait for the {job_name} job to complete")
        if eks_utils.is_eks_multinode_training_complete(remote_yaml_file_path, namespace, pod_name, job_name):
            LOGGER.info(f"Wait for the {pod_name} pod to reach completion")
            distributed_out = run(f"kubectl logs -n {namespace} -f {complete_pod_name}").stdout
            LOGGER.info(distributed_out)
    finally:
        eks_utils.eks_multinode_cleanup(remote_yaml_file_path, namespace)
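
# A minimal usage sketch for the helper above, assuming an MPIJob manifest has
# already been rendered to disk. The namespace suffix, job name, and YAML path
# are hypothetical values for illustration, not fixtures from this module:
#
#     import random
#
#     namespace = f"mx-multi-node-train-{random.randint(1, 10000)}"
#     _run_eks_multi_node_training_mpijob(
#         namespace, "mxnet-mpi-job", "/tmp/mxnet_mpi_job.yaml"
#     )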
def _run_eks_tensorflow_multi_node_training_mpijob(namespace, job_name, remote_yaml_file_path):
    """
    Run TensorFlow distributed training on EKS with Horovod docker images, using an MPIJob

    :param namespace: Kubernetes namespace to create the job in
    :param job_name: name of the MPIJob
    :param remote_yaml_file_path: path to the pre-rendered MPIJob manifest
    :return: None
    """
    run(f"kubectl create namespace {namespace}")

    try:
        training_job_start = run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}", warn=True)
        if training_job_start.return_code:
            raise RuntimeError(f"Failed to start {job_name}:\n{training_job_start.stderr}")

        eks_utils.LOGGER.info("Check pods")
        run(f"kubectl get pods -n {namespace} -o wide")

        complete_pod_name = eks_utils.is_mpijob_launcher_pod_ready(namespace, job_name)
        _, pod_name = complete_pod_name.split("/")
        eks_utils.LOGGER.info(f"The pods have been created and the name of the launcher pod is {pod_name}")

        eks_utils.LOGGER.info(f"Wait for the {job_name} job to complete")
        if eks_utils.is_eks_multinode_training_complete(remote_yaml_file_path, namespace, pod_name, job_name):
            eks_utils.LOGGER.info(f"Wait for the {pod_name} pod to reach completion")
            distributed_out = run(f"kubectl logs -n {namespace} -f {complete_pod_name}").stdout
            eks_utils.LOGGER.info(distributed_out)
    finally:
        eks_utils.eks_multinode_cleanup(remote_yaml_file_path, namespace)
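
# The manifest at remote_yaml_file_path is expected to describe a Kubeflow
# MPIJob. A minimal sketch of its shape (the apiVersion and exact fields depend
# on the mpi-operator version running on the cluster; all values here are
# illustrative):
#
#     apiVersion: kubeflow.org/v1
#     kind: MPIJob
#     metadata:
#       name: tensorflow-mpi-job
#     spec:
#       slotsPerWorker: 1
#       mpiReplicaSpecs:
#         Launcher:
#           replicas: 1
#           template:
#             spec:
#               containers:
#                 - name: tensorflow
#                   image: <horovod training image>
#                   command: ["mpirun", "python", "/examples/train.py"]
#         Worker:
#           replicas: 2
#           template:
#             spec:
#               containers:
#                 - name: tensorflow
#                   image: <horovod training image>
#                   resources:
#                     limits:
#                       nvidia.com/gpu: 1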
def _run_eks_tensorflow_multi_node_training_mpijob(
    namespace,
    app_name,
    custom_image,
    job_name,
    command_to_run,
    args_to_pass,
    path_to_ksonnet_app,
    cluster_size,
    eks_gpus_per_worker,
):
    """
    Run TensorFlow distributed training on EKS with Horovod docker images, using an MPIJob
    generated through ksonnet (legacy ks-based variant)

    :param namespace: Kubernetes namespace to run the job in
    :param app_name: name of the ksonnet application
    :param custom_image: docker image to train with
    :param job_name: name of the MPIJob
    :param command_to_run: command for the launcher container
    :param args_to_pass: arguments for the command
    :param path_to_ksonnet_app: directory in which the ksonnet app is created
    :param cluster_size: number of worker replicas
    :param eks_gpus_per_worker: number of GPUs per worker replica
    :return: None
    """
    KUBEFLOW_VERSION = "v0.5.1"
    pod_name = None
    env = f"{namespace}-env"
    ctx = Context()

    github_handler = GitHubHandler("aws", "kubeflow")
    github_handler.set_ksonnet_env()

    ctx.run(f"kubectl create namespace {namespace}")

    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(path_to_ksonnet_app):
        ctx.run(f"rm -rf {app_name}")
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")

            # Check whether the kubeflow registry exists and create it if not.
            # The registry will be available in each pod.
            registry_not_exist = ctx.run("ks registry list | grep kubeflow", warn=True)
            if registry_not_exist.return_code:
                ctx.run(f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{KUBEFLOW_VERSION}/kubeflow")
                ctx.run(f"ks pkg install kubeflow/common@{KUBEFLOW_VERSION}")
                ctx.run(f"ks pkg install kubeflow/mpi-job@{KUBEFLOW_VERSION}")

            try:
                ctx.run("ks generate mpi-operator mpi-operator")
                # The latest mpi-operator docker image does not accept the gpus-per-node parameter
                # which is specified by the older spec file from v0.5.1.
                ctx.run("ks param set mpi-operator image mpioperator/mpi-operator:0.2.0")
                ctx.run("ks param set mpi-operator kubectlDeliveryImage mpioperator/kubectl-delivery:0.2.0")
                mpi_operator_start = ctx.run(f"ks apply {env} -c mpi-operator", warn=True)
                if mpi_operator_start.return_code:
                    raise RuntimeError(f"Failed to start mpi-operator:\n{mpi_operator_start.stderr}")

                eks_utils.LOGGER.info(
                    f"The mpi-operator package must be applied to the {env} env before we can use MPIJob. "
                    f"Check status before moving on."
                )
                ctx.run("kubectl get crd")

                # Use ksonnet to generate manifest files which are then applied to the default context.
                ctx.run(f"ks generate mpi-job-custom {job_name}")
                ctx.run(f"ks param set {job_name} replicas {cluster_size}")
                ctx.run(f"ks param set {job_name} gpusPerReplica {eks_gpus_per_worker}")
                ctx.run(f"ks param set {job_name} image {custom_image}")
                ctx.run(f"ks param set {job_name} command {command_to_run}")
                ctx.run(f"ks param set {job_name} args {args_to_pass}")

                # Use `ks show default` to see details.
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                eks_utils.LOGGER.info(f"Apply the generated manifest to the {env} env.")
                training_job_start = ctx.run(f"ks apply {env} -c {job_name}", warn=True)
                if training_job_start.return_code:
                    raise RuntimeError(f"Failed to start {job_name}:\n{training_job_start.stderr}")

                eks_utils.LOGGER.info("Check pods")
                ctx.run(f"kubectl get pods -n {namespace} -o wide")

                eks_utils.LOGGER.info(
                    "First the mpi-operator and the n-worker pods will be created and then "
                    "the launcher pod is created in the end. Use retries until the launcher "
                    "pod's name is available to read logs."
                )
                complete_pod_name = eks_utils.is_mpijob_launcher_pod_ready(ctx, namespace, job_name)
                _, pod_name = complete_pod_name.split("/")
                eks_utils.LOGGER.info(f"The pods have been created and the name of the launcher pod is {pod_name}")

                eks_utils.LOGGER.info(f"Wait for the {job_name} job to complete")
                if eks_utils.is_eks_multinode_training_complete(ctx, namespace, env, pod_name, job_name):
                    eks_utils.LOGGER.info(f"Wait for the {pod_name} pod to reach completion")
                    distributed_out = ctx.run(f"kubectl logs -n {namespace} -f {complete_pod_name}").stdout
                    eks_utils.LOGGER.info(distributed_out)
            finally:
                eks_utils.eks_multinode_cleanup(ctx, pod_name, job_name, namespace, env)
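
# A usage sketch for the ksonnet-based variant above. All values are
# hypothetical; note that `command_to_run` and `args_to_pass` are forwarded
# verbatim to `ks param set`, so multi-word values are typically expressed in
# ksonnet's comma-separated list form:
#
#     _run_eks_tensorflow_multi_node_training_mpijob(
#         namespace="tf-multi-node-train-1234",
#         app_name="kubeflow-app",
#         custom_image="<account>.dkr.ecr.us-west-2.amazonaws.com/tf-horovod:latest",
#         job_name="tf-mpi-job",
#         command_to_run="mpirun,python,/examples/train.py",
#         args_to_pass="--epochs,10",
#         path_to_ksonnet_app="/tmp/ks-app",
#         cluster_size=2,
#         eks_gpus_per_worker=4,
#     )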