Example #1
def _run_eks_mxnet_multi_node_training(namespace, job_name,
                                       remote_yaml_file_path):
    """Run MXNet distributed training on EKS using MXNet Operator
    Args:
    namespace, job_name, remote_yaml_file_path
    """

    training_result = False

    # Namespaces will allow parallel runs on the same cluster. Create the namespace if it doesn't exist.
    does_namespace_exist = run(f"kubectl get namespace | grep {namespace}",
                               warn=True)
    if not does_namespace_exist:
        run(f"kubectl create namespace {namespace}")

    try:

        # Delete any old job with the same name if it exists
        run(f"kubectl delete -f {remote_yaml_file_path}", warn=True)
        run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}")
        if is_mxnet_eks_multinode_training_complete(job_name, namespace):
            training_result = True
    finally:
        eks_utils.eks_multinode_cleanup(remote_yaml_file_path, namespace)

    return training_result
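
A minimal usage sketch for the helper above. The namespace, job name, and YAML path are hypothetical placeholders; the job name is expected to match the name declared in the job spec YAML.

# Hypothetical invocation; all values below are placeholders.
success = _run_eks_mxnet_multi_node_training(
    namespace="mxnet-multinode-test",
    job_name="mxnet-dist-training",
    remote_yaml_file_path="/home/ubuntu/mxnet_dist.yaml",
)
assert success, "MXNet multi-node training on EKS failed"
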
def run_eks_pytorch_multi_node_training(namespace, job_name,
                                        remote_yaml_file_path):
    """Run PyTorch distributed training on EKS using PyTorch Operator
    Args:
    namespace, job_name, remote_yaml_file_path
    """

    # Namespaces will allow parallel runs on the same cluster. Create the namespace if it doesn't exist.
    does_namespace_exist = run(f"kubectl get namespace | grep {namespace}",
                               warn=True)
    if does_namespace_exist.return_code != 0:
        run(f"kubectl create namespace {namespace}")

    try:
        run(f"kubectl delete -f {remote_yaml_file_path}", warn=True)
        run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}")
        training_result = is_pytorch_eks_multinode_training_complete(
            job_name, namespace)
        if training_result:
            run_out = run(f"kubectl logs {job_name}-master-0 -n {namespace}",
                          warn=True).stdout
            if "accuracy" in run_out:
                training_result = True
            else:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(run_out)
        assert training_result, "Training for EKS PyTorch multinode failed"
    finally:
        eks_utils.eks_multinode_cleanup(remote_yaml_file_path, namespace)
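
A sketch of driving the PyTorch helper above from a test. The helper asserts on success internally, so the caller only needs to invoke it; every name and path below is a placeholder.

# Hypothetical pytest-style wrapper; values are placeholders.
def test_eks_pytorch_multinode_training():
    run_eks_pytorch_multi_node_training(
        namespace="pytorch-multinode-test",
        job_name="pytorch-dist-mnist",  # expected to match the PyTorchJob name in the YAML
        remote_yaml_file_path="/home/ubuntu/pytorch_dist.yaml",
    )
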
def _run_eks_multi_node_training_mpijob(namespace, job_name, remote_yaml_file_path):
    """
    Function to run eks multinode training MPI job
    """

    run(f"kubectl create namespace {namespace}")

    try:
        training_job_start = run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}", warn=True)
        if training_job_start.return_code:
            raise RuntimeError(f"Failed to start {job_name}:\n{training_job_start.stderr}")

        LOGGER.info("Check pods")
        run(f"kubectl get pods -n {namespace} -o wide")

        complete_pod_name = eks_utils.is_mpijob_launcher_pod_ready(namespace, job_name)

        _, pod_name = complete_pod_name.split("/")
        LOGGER.info(f"The Pods have been created and the name of the launcher pod is {pod_name}")

        LOGGER.info(f"Wait for the {job_name} job to complete")
        if eks_utils.is_eks_multinode_training_complete(remote_yaml_file_path, namespace, pod_name, job_name):
            LOGGER.info(f"Wait for the {pod_name} pod to reach completion")
            distributed_out = run(f"kubectl logs -n {namespace} -f {complete_pod_name}").stdout
            LOGGER.info(distributed_out)
    finally:
        eks_utils.eks_multinode_cleanup(remote_yaml_file_path, namespace)
def _run_eks_tensorflow_multi_node_training_mpijob(namespace, job_name, remote_yaml_file_path):
    """
    Run TensorFlow distributed training on EKS using Horovod Docker images via an MPIJob
    :param namespace: Kubernetes namespace in which to run the job
    :param job_name: Name of the MPIJob
    :param remote_yaml_file_path: Path to the MPIJob spec YAML file
    :return: None
    """
    pod_name = None
    run(f"kubectl create namespace {namespace}")

    try:
        training_job_start = run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}", warn=True)
        if training_job_start.return_code:
            raise RuntimeError(f"Failed to start {job_name}:\n{training_job_start.stderr}")

        eks_utils.LOGGER.info("Check pods")
        run(f"kubectl get pods -n {namespace} -o wide")

        complete_pod_name = eks_utils.is_mpijob_launcher_pod_ready(namespace, job_name)

        _, pod_name = complete_pod_name.split("/")
        eks_utils.LOGGER.info(f"The Pods have been created and the name of the launcher pod is {pod_name}")

        eks_utils.LOGGER.info(f"Wait for the {job_name} job to complete")
        if eks_utils.is_eks_multinode_training_complete(remote_yaml_file_path, namespace, pod_name, job_name):
            eks_utils.LOGGER.info(f"Wait for the {pod_name} pod to reach completion")
            distributed_out = run(f"kubectl logs -n {namespace} -f {complete_pod_name}").stdout
            eks_utils.LOGGER.info(distributed_out)
    finally:
        eks_utils.eks_multinode_cleanup(remote_yaml_file_path, namespace)
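
Both MPIJob helpers above share the same signature; a minimal sketch of a call follows, with every value a placeholder. The same call shape applies to _run_eks_multi_node_training_mpijob.

# Hypothetical invocation; namespace, job name, and YAML path are placeholders.
_run_eks_tensorflow_multi_node_training_mpijob(
    namespace="tf-horovod-test",       # created fresh by the helper
    job_name="tf-horovod-mpijob",      # expected to match the MPIJob name in the YAML
    remote_yaml_file_path="/home/ubuntu/tf_horovod_mpijob.yaml",
)
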
Example #5
def _run_eks_mxnet_multi_node_training(namespace, app_name, job_name, remote_yaml_file_path, unique_id):
    """Run MXNet distributed training on EKS using MXNet Operator
    Args:
    namespace, app_name, job_name, remote_yaml_file_path
    """

    kubeflow_version = "v0.4.1"
    home_dir = run("echo $HOME").stdout.strip("\n")
    path_to_ksonnet_app = os.path.join(home_dir, f"mxnet_multi_node_eks_test-{unique_id}")
    env = f"{namespace}-env"

    training_result = False

    ctx = Context()

    # Namespaces will allow parallel runs on the same cluster. Create the namespace if it doesn't exist.
    does_namespace_exist = ctx.run(f"kubectl get namespace | grep {namespace}", warn=True)
    if not does_namespace_exist:
        ctx.run(f"kubectl create namespace {namespace}")
    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(f"{path_to_ksonnet_app}"):
        ctx.run(f"rm -rf {app_name}")
        github_handler = GitHubHandler("aws", "kubeflow")
        github_token = github_handler.get_auth_token()
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")
            # Check whether the kubeflow registry exists and create it if not. The registry will be available in each pod.
            does_registry_exist = ctx.run("ks registry list | grep kubeflow", warn=True)
            if not does_registry_exist:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{kubeflow_version}/kubeflow",
                    env={"GITHUB_TOKEN": github_token},
                    hide=True,
                )
                ctx.run(
                    f"ks pkg install kubeflow/mxnet-job@{kubeflow_version}",
                    env={"GITHUB_TOKEN": github_token},
                    hide=True,
                )

                ctx.run("ks generate mxnet-operator mxnet-operator", hide=True)

                try:
                    ctx.run(f"kubectl get pods -n {namespace} -o wide")
                    LOGGER.debug(f"ks apply {env} -c mxnet-operator -n {namespace}")
                    ctx.run(f"ks apply {env} -c mxnet-operator -n {namespace}")
                    # Delete old job with same name if exists
                    ctx.run(f"kubectl delete -f {remote_yaml_file_path}", warn=True)
                    ctx.run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}")
                    if is_mxnet_eks_multinode_training_complete(job_name, namespace):
                        training_result = True
                finally:
                    eks_utils.eks_multinode_cleanup("", job_name, namespace, env)

    return training_result
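
A sketch of calling the ksonnet-based variant above. The unique_id keeps parallel runs from sharing a local ksonnet app directory; every value below is a placeholder.

# Hypothetical invocation; all values are placeholders.
import uuid

unique_id = uuid.uuid4().hex[:8]  # keeps the local ksonnet app directory unique per run
succeeded = _run_eks_mxnet_multi_node_training(
    namespace=f"mxnet-multinode-{unique_id}",
    app_name=f"kubeflow-mxnet-app-{unique_id}",
    job_name="mxnet-dist-training",  # expected to match the MXJob name in the YAML
    remote_yaml_file_path="/home/ubuntu/mxnet_dist.yaml",
    unique_id=unique_id,
)
assert succeeded, "MXNet multi-node training on EKS failed"
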
def run_eks_pytorch_multi_node_training(namespace, app_name, job_name,
                                        remote_yaml_file_path, unique_id):
    """Run PyTorch distributed training on EKS using PyTorch Operator
    Args:
    namespace, app_name, job_name, remote_yaml_file_path
    """
    KUBEFLOW_VERSION = "v0.6.1"
    home_dir = run("echo $HOME").stdout.strip("\n")
    path_to_ksonnet_app = os.path.join(
        home_dir, f"pytorch_multi_node_eks_test-{unique_id}")
    env = f"{namespace}-env"

    ctx = Context()

    # Namespaces will allow parallel runs on the same cluster. Create the namespace if it doesn't exist.
    does_namespace_exist = run(f"kubectl get namespace | grep {namespace}",
                               warn=True)
    if not does_namespace_exist:
        run(f"kubectl create namespace {namespace}")

    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(path_to_ksonnet_app):
        ctx.run(f"rm -rf {app_name}")
        # Create a new ksonnet app.
        github_handler = GitHubHandler("aws", "kubeflow")
        github_handler.set_ksonnet_env()
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")

            # Check whether the kubeflow registry exists and create it if not. The registry will be available in each pod.
            does_registry_exist = ctx.run("ks registry list | grep kubeflow",
                                          warn=True)
            if not does_registry_exist:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{KUBEFLOW_VERSION}/kubeflow",
                )
                ctx.run(
                    f"ks pkg install kubeflow/pytorch-job@{KUBEFLOW_VERSION}",
                )
                ctx.run(f"ks generate pytorch-operator pytorch-operator")
                try:
                    # use `$ks show default` to see details.
                    ctx.run(f"kubectl get pods -n {namespace} -o wide")
                    LOGGER.debug(
                        f"ks apply {env} -c pytorch-operator -n {namespace}")
                    ctx.run(
                        f"ks apply {env} -c pytorch-operator -n {namespace}")
                    # Delete old job with same name if exists
                    ctx.run(f"kubectl delete -f {remote_yaml_file_path}",
                            warn=True)
                    ctx.run(
                        f"kubectl create -f {remote_yaml_file_path} -n {namespace}"
                    )
                    training_result = is_pytorch_eks_multinode_training_complete(
                        job_name, namespace)
                    if training_result:
                        run_out = run(
                            f"kubectl logs {job_name}-master-0 -n {namespace}",
                            warn=True).stdout
                        if "accuracy" in run_out:
                            training_result = True
                        else:
                            eks_utils.LOGGER.info("**** training output ****")
                            eks_utils.LOGGER.debug(run_out)
                    assert training_result, f"Training for eks pytorch multinode failed"
                finally:
                    eks_utils.eks_multinode_cleanup(ctx, "", job_name,
                                                    namespace, env)
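
As above, a sketch for the ksonnet-based PyTorch helper; names, paths, and the unique suffix are hypothetical.

# Hypothetical invocation; all values are placeholders.
unique_id = "a1b2c3d4"  # any per-run unique suffix works
run_eks_pytorch_multi_node_training(
    namespace=f"pytorch-multinode-{unique_id}",
    app_name=f"kubeflow-pytorch-app-{unique_id}",
    job_name="pytorch-dist-mnist",  # expected to match the PyTorchJob name in the YAML
    remote_yaml_file_path="/home/ubuntu/pytorch_dist.yaml",
    unique_id=unique_id,
)
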
Example #7
def _run_eks_tensorflow_multi_node_training_mpijob(
        namespace, app_name, custom_image, job_name, command_to_run,
        args_to_pass, path_to_ksonnet_app, cluster_size, eks_gpus_per_worker):
    """
    Run TensorFlow distributed training on EKS using Horovod Docker images via an MPIJob
    :param namespace: Kubernetes namespace in which to run the job
    :param app_name: Name of the ksonnet app to create
    :param custom_image: Docker image URI to use for the training containers
    :param job_name: Name of the MPIJob
    :param command_to_run: Command to run in the training containers
    :param args_to_pass: Arguments passed to the command
    :param path_to_ksonnet_app: Local directory in which the ksonnet app is created
    :param cluster_size: Number of worker replicas
    :param eks_gpus_per_worker: Number of GPUs requested per worker replica
    :return: None
    """
    KUBEFLOW_VERSION = "v0.5.1"
    pod_name = None
    env = f"{namespace}-env"
    ctx = Context()
    github_handler = GitHubHandler("aws", "kubeflow")
    github_handler.set_ksonnet_env()

    ctx.run(f"kubectl create namespace {namespace}")

    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(path_to_ksonnet_app):
        ctx.run(f"rm -rf {app_name}")
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")
            # Check whether the kubeflow registry exists and create it if not. The registry will be available in each pod.
            does_registry_exist = ctx.run("ks registry list | grep kubeflow",
                                          warn=True)

            if does_registry_exist.return_code:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{KUBEFLOW_VERSION}/kubeflow",
                )
                ctx.run(f"ks pkg install kubeflow/common@{KUBEFLOW_VERSION}")
                ctx.run(f"ks pkg install kubeflow/mpi-job@{KUBEFLOW_VERSION}")

            try:
                ctx.run("ks generate mpi-operator mpi-operator")
                # The latest mpi-operator docker image does not accept the gpus-per-node parameter
                # which is specified by the older spec file from v0.5.1.
                ctx.run(
                    "ks param set mpi-operator image mpioperator/mpi-operator:0.2.0"
                )
                ctx.run(
                    "ks param set mpi-operator kubectlDeliveryImage mpioperator/kubectl-delivery:0.2.0"
                )
                mpi_operator_start = ctx.run(f"ks apply {env} -c mpi-operator",
                                             warn=True)
                if mpi_operator_start.return_code:
                    raise RuntimeError(
                        f"Failed to start mpi-operator:\n{mpi_operator_start.stderr}"
                    )

                eks_utils.LOGGER.info(
                    f"The mpi-operator package must be applied to {env} env before we can use mpiJob. "
                    f"Check status before moving on.")
                ctx.run("kubectl get crd")

                # Use Ksonnet to generate manifest files which are then applied to the default context.
                ctx.run(f"ks generate mpi-job-custom {job_name}")
                ctx.run(f"ks param set {job_name} replicas {cluster_size}")
                ctx.run(
                    f"ks param set {job_name} gpusPerReplica {eks_gpus_per_worker}"
                )
                ctx.run(f"ks param set {job_name} image {custom_image}")
                ctx.run(f"ks param set {job_name} command {command_to_run}")
                ctx.run(f"ks param set {job_name} args {args_to_pass}")

                # use `$ks show default` to see details.
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                eks_utils.LOGGER.info(
                    f"Apply the generated manifest to the {env} env.")
                training_job_start = ctx.run(f"ks apply {env} -c {job_name}",
                                             warn=True)
                if training_job_start.return_code:
                    raise RuntimeError(
                        f"Failed to start {job_name}:\n{training_job_start.stderr}"
                    )

                eks_utils.LOGGER.info("Check pods")
                ctx.run(f"kubectl get pods -n {namespace} -o wide")

                eks_utils.LOGGER.info(
                    "First the mpi-operator and the n-worker pods are created, and the "
                    "launcher pod is created last. Retry until the launcher pod's name "
                    "is available to read logs.")
                complete_pod_name = eks_utils.is_mpijob_launcher_pod_ready(
                    ctx, namespace, job_name)

                _, pod_name = complete_pod_name.split("/")
                eks_utils.LOGGER.info(
                    f"The Pods have been created and the name of the launcher pod is {pod_name}"
                )

                eks_utils.LOGGER.info(
                    f"Wait for the {job_name} job to complete")
                if eks_utils.is_eks_multinode_training_complete(
                        ctx, namespace, env, pod_name, job_name):
                    eks_utils.LOGGER.info(
                        f"Wait for the {pod_name} pod to reach completion")
                    distributed_out = ctx.run(
                        f"kubectl logs -n {namespace} -f {complete_pod_name}"
                    ).stdout
                    eks_utils.LOGGER.info(distributed_out)
            finally:
                eks_utils.eks_multinode_cleanup(ctx, pod_name, job_name,
                                                namespace, env)
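
A sketch of invoking the MPIJob-based TensorFlow helper above. Every value is a placeholder, including the image URI, and the command/args format is only an assumption about what the mpi-job-custom prototype expects.

# Hypothetical invocation; all values are placeholders, including the image URI.
_run_eks_tensorflow_multi_node_training_mpijob(
    namespace="tf-horovod-test",
    app_name="kubeflow-tf-hvd-app",
    custom_image="123456789012.dkr.ecr.us-west-2.amazonaws.com/tf-training:example-tag",
    job_name="tf-hvd-resnet",
    command_to_run="mpirun,-np,8,python,train.py",  # placeholder; exact format depends on the prototype
    args_to_pass="''",                              # placeholder for no extra args
    path_to_ksonnet_app="/home/ubuntu/tf_mpijob_eks_test",
    cluster_size=2,                                 # number of worker replicas
    eks_gpus_per_worker=4,                          # GPUs requested per worker replica
)
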