def _run_eks_mxnet_multinode_training_horovod_mpijob(example_image_uri, cluster_size, eks_gpus_per_worker):
    """
    Fill in the MXNet Horovod MNIST MPIJob manifest template and hand it to the multi-node MPIJob runner.

    The namespace, the job name and the rendered manifest path all carry a per-run tag
    made of the current shell user and a random integer.

    :param example_image_uri: image URI substituted for <CONTAINER_IMAGE>; also checked for "py2"
    :param cluster_size: value substituted for <NUM_WORKERS>
    :param eks_gpus_per_worker: value substituted, as a string, for <GPUS>
    :return: None
    """
    LOGGER.info("Starting run_eks_mxnet_multi_node_training on MNIST dataset using horovod")
    LOGGER.info("The test will run on an example image %s", example_image_uri)

    shell_user = Context().run("echo $USER").stdout.strip("\n")

    # Re-seed the module-level RNG from the image URI plus a microsecond timestamp before drawing the tag.
    seed_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    random.seed(f"{example_image_uri}-{seed_timestamp}")
    unique_tag = f"{shell_user}-{random.randint(1, 10000)}"

    python_tag = "py2" if "py2" in example_image_uri else "py3"
    namespace = f"mx-multi-node-train-{python_tag}-{unique_tag}"
    job_name = f"mxnet-mnist-horovod-job-{unique_tag}"
    LOGGER.debug("Namespace: %s", namespace)

    template_parts = ("eks", "eks_manifest_templates", "mxnet", "training", "multi_node_training_horovod_mnist.yaml")
    local_template_file_path = os.path.join(*template_parts)

    # NOTE(review): the rendered file name says "tensorflow" although this is the MXNet job;
    # it looks like a copy/paste leftover. Kept unchanged here.
    rendered_file_name = f"tensorflow_multi_node_training_{unique_tag}.yaml"
    remote_yaml_file_path = os.path.join(os.sep, "tmp", rendered_file_name)

    # Placeholder -> value mapping applied to the template.
    replace_dict = {
        "<JOB_NAME>": job_name,
        "<NUM_WORKERS>": cluster_size,
        "<CONTAINER_IMAGE>": example_image_uri,
        "<GPUS>": str(eks_gpus_per_worker),
    }
    eks_utils.write_eks_yaml_file_from_template(local_template_file_path, remote_yaml_file_path, replace_dict)

    _run_eks_multi_node_training_mpijob(namespace, job_name, remote_yaml_file_path)
示例#2
0
def test_eks_mxnet_single_node_training(mxnet_training):
    """
    Create a pod using kubectl and the given container image, and run MXNet MNIST training in it.

    The test passes only when the pod logs contain "Epoch[19] Validation-accuracy".
    The pod is deleted in a ``finally`` block whether or not the assertion holds.

    Args:
        :param mxnet_training: the ECR URI
    """

    training_result = False

    rand_int = random.randint(4001, 6000)

    # The first "<major>.<minor>" found in the image URI selects the "v<major>.<minor>.x" branch to clone.
    framework_version_search = re.search(r"\d+\.\d+", mxnet_training)
    assert framework_version_search, f"Could not find an MXNet version in image URI: {mxnet_training}"
    framework_version = "v" + framework_version_search.group() + ".x"

    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-{rand_int}"

    # Temporary fix for 503 error while downloading MNIST dataset. See https://github.com/pytorch/vision/issues/3549
    # Raw string: the backslashes are meant for sed, and "\/" / "\." are invalid Python escape
    # sequences in a normal literal (they trigger a warning on current Python versions).
    mnist_dataset_download_config = r'''
      FROM="http:\/\/yann\.lecun\.com\/exdb\/mnist\/" &&
      TO="https:\/\/ossci-datasets\.s3\.amazonaws\.com\/mnist\/" &&
      sed -i -e "s/${FROM}/${TO}/g" /incubator-mxnet/example/image-classification/train_mnist.py
    '''
    args = (
        f"git clone -b {framework_version} https://github.com/apache/incubator-mxnet.git && {mnist_dataset_download_config}  && python "
        f"/incubator-mxnet/example/image-classification/train_mnist.py")

    # GPU images get the extra "--gpus 0" flag for the training script.
    if "gpu" in mxnet_training:
        args = args + " --gpus 0"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    # True division: the rendered value is the string "36.0".
    cpu_limit = str(int(cpu_limit) / 2)

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path,
        search_replace_dict)

    try:
        run("kubectl create -f {}".format(yaml_path))

        if eks_utils.is_eks_training_complete(pod_name):
            mxnet_out = run("kubectl logs {}".format(pod_name)).stdout
            if "Epoch[19] Validation-accuracy" in mxnet_out:
                training_result = True
            else:
                # Marker was not found: record the pod output to help diagnose the failure.
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(mxnet_out)

        assert training_result, "Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))
示例#3
0
def test_eks_mxnet_dgl_single_node_training(mxnet_training, py3_only):
    """
    Create a pod using kubectl and the given container image, and run the
    DGL GCN example with the MXNet backend in it.

    The test passes only when the pod logs contain "Test accuracy". GPU images
    whose URI contains "cu110" are skipped.

    Args:
        :param mxnet_training: the ECR URI
        :param py3_only: not referenced in the body
    """
    rand_int = random.randint(4001, 6000)

    manifest_name = f"mxnet_single_node_training_dgl_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    pod_name = f"mxnet-single-node-training-dgl-{rand_int}"

    dgl_branch = "0.4.x"
    args = (
        f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git && "
        f"cd /dgl/examples/mxnet/gcn/ && DGLBACKEND=mxnet python train.py --dataset cora"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpu_count = 72
    # True division: the rendered value is the string "36.0".
    cpu_limit = str(node_cpu_count / 2)

    # CPU images pass "--gpu -1"; GPU images pass "--gpu 0" unless they are skipped.
    if "gpu" not in mxnet_training:
        args += " --gpu -1"
    else:
        if "cu110" in mxnet_training:
            pytest.skip(
                "Skipping DGL tests for GPU until dgl-cu110 is available.")
        args += " --gpu 0"

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict)

    training_succeeded = False
    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            training_succeeded = "Test accuracy" in pod_logs
            if not training_succeeded:
                # Marker was not found: record the pod output to help diagnose the failure.
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)

        assert training_succeeded, "Training failed"
    finally:
        # The pod is removed whether or not the assertion held.
        run(f"kubectl delete pods {pod_name}")
def test_eks_pytorch_single_node_training(pytorch_training):
    """
    Create a pod using kubectl and the given container image, and run the
    PyTorch MNIST example in it.

    The test passes only when the pod logs contain "Accuracy".

    Args:
        :param pytorch_training: the ECR URI
    """
    rand_int = random.randint(4001, 6000)

    manifest_name = f"pytorch_single_node_training_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    pod_name = f"pytorch-single-node-training-{rand_int}"

    # Shell snippet that rebuilds examples/mnist/main.py: a new prologue that installs a
    # urllib opener with a "Mozilla/5.0" User-agent header, followed by the original file
    # minus its first line.
    user_agent_patch = '''
      FILE=new_main.py &&
      echo "from __future__ import print_function" > $FILE &&
      echo "from six.moves import urllib" >> $FILE &&
      echo "opener = urllib.request.build_opener()" >> $FILE &&
      echo "opener.addheaders = [('User-agent', 'Mozilla/5.0')]" >> $FILE &&
      echo "urllib.request.install_opener(opener)" >> $FILE &&
      sed -i '1d' examples/mnist/main.py &&
      cat examples/mnist/main.py >> $FILE &&
      rm examples/mnist/main.py &&
      mv $FILE examples/mnist/main.py
    '''

    args = f"git clone https://github.com/pytorch/examples.git && {user_agent_patch}  && python examples/mnist/main.py"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpu_count = 72
    # True division: the rendered value is the string "36.0".
    cpu_limit = str(node_cpu_count / 2)

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict)

    training_succeeded = False
    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            training_succeeded = "Accuracy" in pod_logs
            if not training_succeeded:
                # Marker was not found: record the pod output to help diagnose the failure.
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)
        assert training_succeeded, "Training failed"
    finally:
        # The pod is removed whether or not the assertion held.
        run(f"kubectl delete pods {pod_name}")
def test_eks_pytorch_densenet_inference(pytorch_inference):
    """
    Deploy a single-replica PyTorch DenseNet inference service with kubectl and send it a request.

    EIA and Neuron images are skipped. The model archive and server command depend on
    whether the image's inference server type is "ts" or not. The deployment and the
    service are deleted in a ``finally`` block.

    :param pytorch_inference: the inference image URI
    """
    server_type = test_utils.get_inference_server_type(pytorch_inference)

    # pytest.skip raises, so each guard ends the test on its own.
    if "eia" in pytorch_inference:
        pytest.skip("Skipping EKS Test for EIA")
    if "neuron" in pytorch_inference:
        pytest.skip(
            "Neuron specific test is run and so skipping this test for Neuron")

    if server_type == "ts":
        model = "pytorch-densenet=https://torchserve.s3.amazonaws.com/mar_files/densenet161.mar"
        server_cmd = "torchserve"
    else:
        model = "pytorch-densenet=https://dlc-samples.s3.amazonaws.com/pytorch/multi-model-server/densenet/densenet.mar"
        server_cmd = "multi-model-server"

    rand_int = random.randint(4001, 6000)
    processor = "gpu" if "gpu" in pytorch_inference else "cpu"

    manifest_name = f"pytorch_single_node_{processor}_inference_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    selector_name = f"densenet-service-{processor}-{rand_int}"

    search_replace_dict = {
        "<MODELS>": model,
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": selector_name,
        "<DOCKER_IMAGE_BUILD_ID>": pytorch_inference,
        "<SERVER_TYPE>": server_type,
        "<SERVER_CMD>": server_cmd,
    }
    if processor == "gpu":
        search_replace_dict["<NUM_GPUS>"] = "1"

    template_path = eks_utils.get_single_node_inference_template_path("pytorch", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, search_replace_dict)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(
                selector_name, port_to_forward, "8080")

        assert test_utils.request_pytorch_inference_densenet(
            port=port_to_forward, server_type=server_type)
    except ValueError as excp:
        # NOTE(review): a ValueError raised in the try block is only logged, so the test
        # finishes without failing in that case. Confirm this is intended.
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
示例#6
0
def test_eks_mxnet_single_node_training(mxnet_training):
    """
    Create a pod using kubectl and the given container image, and run MXNet MNIST training in it.

    The incubator-mxnet branch to clone is taken from the image URI: a three-component
    version ("X.Y.Z") is used as is, otherwise a two-component version ("X.Y") is
    extended with ".0". The test passes only when the pod logs contain
    "Epoch[19] Validation-accuracy".

    Args:
        :param mxnet_training: the ECR URI
    """

    training_result = False

    rand_int = random.randint(4001, 6000)

    # Check the match object before calling .group(): re.search returns None when nothing
    # matches, and the "X.Y" fallback below must still be reachable in that case.
    full_version_match = re.search(r"\d+(\.\d+){2}", mxnet_training)
    if full_version_match:
        framework_version = full_version_match.group()
    else:
        short_version_match = re.search(r"\d+\.\d+", mxnet_training)
        assert short_version_match, f"Could not find an MXNet version in image URI: {mxnet_training}"
        framework_version = short_version_match.group() + ".0"

    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-{rand_int}"

    args = (
        f"git clone -b {framework_version} https://github.com/apache/incubator-mxnet.git && python "
        f"/incubator-mxnet/example/image-classification/train_mnist.py"
    )

    # GPU images get the extra "--gpus 0" flag for the training script.
    if "gpu" in mxnet_training:
        args = args + " --gpus 0"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    # True division: the rendered value is the string "36.0".
    cpu_limit = str(int(cpu_limit) / 2)

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict
    )

    try:
        run("kubectl create -f {}".format(yaml_path))

        if eks_utils.is_eks_training_complete(pod_name):
            mxnet_out = run("kubectl logs {}".format(pod_name)).stdout
            if "Epoch[19] Validation-accuracy" in mxnet_out:
                training_result = True
            else:
                # Marker was not found: record the pod output to help diagnose the failure.
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(mxnet_out)

        assert training_result, "Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))
示例#7
0
def test_eks_pytorch_neuron_inference(pytorch_inference, neuron_only):
    """
    Deploy a single-replica PyTorch Neuron ResNet inference service with kubectl and send it a request.

    Images whose URI does not contain "neuron" are skipped. Before the deployment is applied,
    the Neuron device plugin manifest is deleted and applied again. The deployment and the
    service are deleted in a ``finally`` block.

    :param pytorch_inference: the inference image URI
    :param neuron_only: not referenced in the body
    """
    server_type = test_utils.get_inference_server_type(pytorch_inference)
    if "neuron" not in pytorch_inference:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    # NOTE(review): `model` is assigned but never placed in the substitution mapping;
    # the same archive URL is embedded directly in the server command below.
    model = "pytorch-resnet-neuron=https://aws-dlc-sample-models.s3.amazonaws.com/pytorch/Resnet50-neuron.mar"
    server_cmd = "/usr/local/bin/entrypoint.sh -m pytorch-resnet-neuron=https://aws-dlc-sample-models.s3.amazonaws.com/pytorch/Resnet50-neuron.mar -t /home/model-server/config.properties"

    rand_int = random.randint(4001, 6000)
    processor = "neuron"

    manifest_name = f"pytorch_single_node_{processor}_inference_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    selector_name = f"resnet-{processor}-{rand_int}"

    search_replace_dict = {
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": selector_name,
        "<DOCKER_IMAGE_BUILD_ID>": pytorch_inference,
        "<SERVER_TYPE>": server_type,
        "<SERVER_CMD>": server_cmd,
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path("pytorch", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, search_replace_dict)
    device_plugin_path = eks_utils.get_device_plugin_path("pytorch", processor)

    try:
        # TODO - once eksctl gets the latest neuron device plugin this can be removed
        run(f"kubectl delete -f {device_plugin_path}")
        sleep(60)
        run(f"kubectl apply -f {device_plugin_path}")
        sleep(10)

        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(
                selector_name, port_to_forward, "8080")

        assert test_utils.request_pytorch_inference_densenet(
            port=port_to_forward)
    except ValueError as excp:
        # NOTE(review): a ValueError raised in the try block is only logged (after a cluster
        # dump), so the test finishes without failing in that case. Confirm this is intended.
        run("kubectl cluster-info dump")
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_pt_s3_plugin_single_node_training(pytorch_training,
                                               pt17_and_above_only):
    """
    Create a pod using kubectl and the given container image, and run the
    amazon-s3-plugin-for-pytorch ImageNet example in it.

    Images whose framework version is below 1.8 are skipped. The test passes only
    when the pod logs contain "Acc".

    Args:
        :param pytorch_training: the ECR URI
        :param pt17_and_above_only: not referenced in the body
    """
    _, image_framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    if Version(image_framework_version) < Version("1.8"):
        pytest.skip("S3 plugin is supported on PyTorch version >=1.8")

    rand_int = random.randint(4001, 6000)

    manifest_name = f"pytorch_s3_single_node_training_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    pod_name = f"pytorch-s3-single-node-training-{rand_int}"

    args = "git clone https://github.com/aws/amazon-s3-plugin-for-pytorch.git && python amazon-s3-plugin-for-pytorch/examples/s3_imagenet_example.py"
    if "gpu" in pytorch_training:
        args += " --gpu 0"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpu_count = 96
    # True division: the rendered value is the string "48.0".
    cpu_limit = str(node_cpu_count / 2)

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict)

    training_succeeded = False
    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            training_succeeded = "Acc" in pod_logs
            if not training_succeeded:
                # Marker was not found: record the pod output to help diagnose the failure.
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)
        assert training_succeeded, "Training failed"
    finally:
        # The pod is removed whether or not the assertion held.
        run(f"kubectl delete pods {pod_name}")
示例#9
0
def test_eks_tensorflow_single_node_training(tensorflow_training):
    """
    Create a pod using kubectl and the given container image, and run the
    Keras mnist_cnn example in it.

    The example is cloned from the keras repository and rewritten with sed so that
    its keras imports come from tensorflow. The test passes only when the pod logs
    contain "Test accuracy".

    Args:
        :param tensorflow_training: the ECR URI
    """
    rand_int = random.randint(4001, 6000)

    manifest_name = f"tensorflow_single_node_training_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    pod_name = f"tensorflow-single-node-training-{rand_int}"

    # Clone, patch the imports in place, then run the example.
    args = (
        "git clone https://github.com/fchollet/keras.git "
        "&& sed -i 's/import keras/from tensorflow import keras/g; "
        "s/from keras/from tensorflow.keras/g' /keras/examples/mnist_cnn.py "
        "&& python /keras/examples/mnist_cnn.py"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpu_count = 72
    # True division: the rendered value is the string "36.0".
    cpu_limit = str(node_cpu_count / 2)

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": tensorflow_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict)

    training_succeeded = False
    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            training_succeeded = "Test accuracy" in pod_logs
            if not training_succeeded:
                # Marker was not found: record the pod output to help diagnose the failure.
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)

        assert training_succeeded, "Training failed"
    finally:
        # The pod is removed whether or not the assertion held.
        run(f"kubectl delete pods {pod_name}")
def _run_eks_tensorflow_multinode_training_resnet50_mpijob(
        example_image_uri, cluster_size, eks_gpus_per_worker):
    """
    Fill in the TensorFlow multi-node GPU training manifest template for a Horovod
    ResNet50 job with synthetic data and hand it to the TensorFlow MPIJob runner.

    The training script and its arguments are chosen by the major version found in
    the image URI ("2" versus anything else).

    :param example_image_uri: image URI substituted for <CONTAINER_IMAGE>; also parsed for the framework version
    :param cluster_size: value substituted for <NUM_WORKERS>
    :param eks_gpus_per_worker: value substituted, as a string, for <GPUS>
    :return: None
    """
    shell_user = Context().run("echo $USER").stdout.strip("\n")

    framework_version = re.search(r"\d+(\.\d+)+", example_image_uri).group()
    major_version = framework_version.split(".")[0]

    # Re-seed the module-level RNG from the image URI plus a microsecond timestamp before drawing the tag.
    seed_timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')
    random.seed(f"{example_image_uri}-{seed_timestamp}")
    unique_tag = f"{shell_user}-{random.randint(1, 10000)}"

    python_tag = "py2" if "py2" in example_image_uri else "py3"
    namespace = f"tf{major_version}-multi-node-train-{python_tag}-{unique_tag}"
    job_name = f"tf-resnet50-horovod-job-{unique_tag}"

    if major_version == "2":
        script_name = "/deep-learning-models/models/resnet/tensorflow2/train_tf2_resnet.py"
        # NOTE(review): "--batch_size,128" is a single list element; the neighbouring flags
        # are written as separate flag/value elements, so this may be a misplaced quote. Confirm.
        args_to_pass = '["--synthetic","--batch_size,128","--num_batches","100","--clear_log","2"]'
    else:
        script_name = "/deep-learning-models/models/resnet/tensorflow/train_imagenet_resnet_hvd.py"
        args_to_pass = '["--num_epochs=1","--synthetic"]'

    template_parts = ("eks", "eks_manifest_templates", "tensorflow", "training", "multi_node_gpu_training.yaml")
    local_template_file_path = os.path.join(*template_parts)

    rendered_file_name = f"tensorflow_multi_node_training_{unique_tag}.yaml"
    remote_yaml_file_path = os.path.join(os.sep, "tmp", rendered_file_name)

    # Placeholder -> value mapping applied to the template.
    replace_dict = {
        "<JOB_NAME>": job_name,
        "<NUM_WORKERS>": cluster_size,
        "<CONTAINER_IMAGE>": example_image_uri,
        "<SCRIPT_NAME>": script_name,
        "<ARGS>": args_to_pass,
        "<GPUS>": str(eks_gpus_per_worker),
    }
    eks_utils.write_eks_yaml_file_from_template(
        local_template_file_path, remote_yaml_file_path, replace_dict)

    _run_eks_tensorflow_multi_node_training_mpijob(
        namespace, job_name, remote_yaml_file_path)
示例#11
0
def test_eks_mxnet_neuron_inference(mxnet_inference, neuron_only):
    """
    Deploy a single-replica MXNet Neuron ResNet50 inference service with kubectl and send it a request.

    EIA images and images whose URI does not contain "neuron" are skipped. Before the
    deployment is applied, the Neuron device plugin manifest is deleted and applied again.
    A cluster dump is taken and the deployment and service are deleted in a ``finally`` block.

    :param mxnet_inference: the inference image URI
    :param neuron_only: not referenced in the body
    """
    if "eia" in mxnet_inference or "neuron" not in mxnet_inference:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    rand_int = random.randint(4001, 6000)
    processor = "neuron"

    manifest_name = f"mxnet_single_node_{processor}_inference_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    selector_name = f"resnet50-{processor}-{rand_int}"

    search_replace_dict = {
        "<MODELS>": "mxnet-resnet50=https://aws-dlc-sample-models.s3.amazonaws.com/mxnet/Resnet50-neuron.mar",
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": selector_name,
        "<DOCKER_IMAGE_BUILD_ID>": mxnet_inference,
        "<NUM_INF1S>": "1",
    }

    device_plugin_path = eks_utils.get_device_plugin_path("mxnet", processor)

    template_path = eks_utils.get_single_node_inference_template_path("mxnet", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, search_replace_dict)

    try:
        # TODO - once eksctl gets the latest neuron device plugin this can be removed
        run(f"kubectl delete -f {device_plugin_path}")
        sleep(60)
        run(f"kubectl apply -f {device_plugin_path}")
        sleep(10)
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8080")

        assert test_utils.request_mxnet_inference(port=port_to_forward, model="mxnet-resnet50")
    except ValueError as excp:
        # NOTE(review): a ValueError raised in the try block is only logged, so the test
        # finishes without failing in that case. Confirm this is intended.
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        run("kubectl cluster-info dump")
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
示例#12
0
def test_eks_pytorch_neuron_inference(pytorch_inference, neuron_only):
    """
    Deploy a single-replica PyTorch Neuron ResNet inference service (served with
    torchserve) using kubectl and send it a request.

    Images whose URI does not contain "neuron" are skipped. The deployment and the
    service are deleted in a ``finally`` block.

    :param pytorch_inference: the inference image URI
    :param neuron_only: not referenced in the body
    """
    server_type = test_utils.get_inference_server_type(pytorch_inference)

    # pytest.skip raises, so everything below only runs for Neuron images.
    if "neuron" not in pytorch_inference:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    model = "pytorch-resnet-neuron=https://aws-dlc-sample-models.s3.amazonaws.com/pytorch/Resnet50-neuron.mar"
    server_cmd = "torchserve"

    rand_int = random.randint(4001, 6000)
    processor = "neuron"

    manifest_name = f"pytorch_single_node_{processor}_inference_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    selector_name = f"resnet-{processor}-{rand_int}"

    search_replace_dict = {
        "<MODELS>": model,
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": selector_name,
        "<DOCKER_IMAGE_BUILD_ID>": pytorch_inference,
        "<SERVER_TYPE>": server_type,
        "<SERVER_CMD>": server_cmd,
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path("pytorch", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, search_replace_dict)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(
                selector_name, port_to_forward, "8080")

        assert test_utils.request_pytorch_inference_densenet(
            port=port_to_forward,
            server_type=server_type,
            model_name="pytorch-resnet-neuron")
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_tensorflow_neuron_inference(tensorflow_inference, neuron_only):
    """
    Deploy a single-replica TensorFlow Neuron MNIST inference service with kubectl and send it a request.

    EIA images and images whose URI does not contain "neuron" are skipped. The
    deployment and the service are deleted in a ``finally`` block.

    :param tensorflow_inference: the inference image URI
    :param neuron_only: not referenced in the body
    """
    if "eia" in tensorflow_inference or "neuron" not in tensorflow_inference:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    rand_int = random.randint(4001, 6000)
    processor = "neuron"
    model_name = "mnist_neuron"

    manifest_name = f"tensorflow_single_node_{processor}_inference_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    selector_name = f"mnist-{processor}-{rand_int}"

    search_replace_dict = {
        "<MODEL_NAME>": model_name,
        "<MODEL_BASE_PATH>": "https://aws-dlc-sample-models.s3.amazonaws.com",
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": selector_name,
        "<DOCKER_IMAGE_BUILD_ID>": tensorflow_inference,
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path("tensorflow", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, search_replace_dict)

    # NOTE(review): the returned path is never used below; the call is kept in case it has side effects.
    secret_yml_path = eks_utils.get_aws_secret_yml_path()

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(
                selector_name, port_to_forward, "8500")

        assert test_utils.request_tensorflow_inference(model_name=model_name,
                                                       port=port_to_forward)
    except ValueError as excp:
        # NOTE(review): a ValueError raised in the try block is only logged (after a cluster
        # dump), so the test finishes without failing in that case. Confirm this is intended.
        run("kubectl cluster-info dump")
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
示例#14
0
def __test_eks_tensorflow_half_plus_two_inference(tensorflow_inference):
    """
    Deploy a single-replica TensorFlow half_plus_two inference service with kubectl and send it a request.

    The processor ("gpu" or "cpu") is taken from the image URI and selects the model name
    and the manifest template. The container command and arguments come from
    get_tensorflow_command_args. The deployment and the service are deleted in a
    ``finally`` block.

    :param tensorflow_inference: the inference image URI
    """
    rand_int = random.randint(4001, 6000)
    processor = "gpu" if "gpu" in tensorflow_inference else "cpu"
    model_name = f"saved_model_half_plus_two_{processor}"

    manifest_name = f"tensorflow_single_node_{processor}_inference_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    selector_name = f"half-plus-two-service-{processor}-{rand_int}"

    model_base_path = get_eks_tensorflow_model_base_path(
        tensorflow_inference, model_name)
    command, args = get_tensorflow_command_args(
        tensorflow_inference, model_name, model_base_path)
    test_type = test_utils.get_eks_k8s_test_type_label(tensorflow_inference)

    search_replace_dict = {
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": selector_name,
        "<DOCKER_IMAGE_BUILD_ID>": tensorflow_inference,
        "<COMMAND>": command,
        "<ARGS>": args,
        "<TEST_TYPE>": test_type,
    }
    if processor == "gpu":
        search_replace_dict["<NUM_GPUS>"] = "1"

    template_path = eks_utils.get_single_node_inference_template_path("tensorflow", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, search_replace_dict)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(
                selector_name, port_to_forward, "8500")

        assert test_utils.request_tensorflow_inference(model_name=model_name,
                                                       port=port_to_forward)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
示例#15
0
def test_eks_tensorflow_neuron_inference(tensorflow_inference_neuron):
    """
    Deploy a single-replica TensorFlow Neuron MNIST inference service with kubectl and
    send it a request whose payload is one instance of 784 zeros.

    The deployment and the service are deleted in a ``finally`` block.

    :param tensorflow_inference_neuron: the Neuron inference image URI
    """
    rand_int = random.randint(4001, 6000)
    processor = "neuron"
    model_name = "mnist_neuron"

    manifest_name = f"tensorflow_single_node_{processor}_inference_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    selector_name = f"mnist-{processor}-{rand_int}"

    search_replace_dict = {
        "<MODEL_NAME>": model_name,
        "<MODEL_BASE_PATH>": "s3://aws-dlc-sample-models",
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": selector_name,
        "<DOCKER_IMAGE_BUILD_ID>": tensorflow_inference_neuron,
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path("tensorflow", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, search_replace_dict)

    # NOTE(review): the returned path is never used below; the call is kept in case it has side effects.
    secret_yml_path = eks_utils.get_aws_secret_yml_path()

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(
                selector_name, port_to_forward, "8501")

        # Single-quoted JSON payload: '{"instances": [[0, 0, ..., 0]]}' with 784 zeros.
        zero_instances = [[0] * 784]
        inference_string = "'" + '{"instances": ' + str(zero_instances) + "}'"
        assert test_utils.request_tensorflow_inference(
            model_name=model_name,
            port=port_to_forward,
            inference_string=inference_string)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_tensorflow_half_plus_two_inference(tensorflow_inference):
    """
    Deploy a single-replica TensorFlow half_plus_two inference service with kubectl and send it a request.

    EIA and Neuron images are skipped. The processor ("gpu" or "cpu") is taken from the
    image URI and selects the model name and the manifest template. The deployment and
    the service are deleted in a ``finally`` block.

    :param tensorflow_inference: the inference image URI
    """
    if "eia" in tensorflow_inference or "neuron" in tensorflow_inference:
        pytest.skip("Skipping EKS Test for EIA and neuron Images")

    rand_int = random.randint(4001, 6000)
    processor = "gpu" if "gpu" in tensorflow_inference else "cpu"
    model_name = f"saved_model_half_plus_two_{processor}"

    manifest_name = f"tensorflow_single_node_{processor}_inference_{rand_int}.yaml"
    yaml_path = os.path.join(os.sep, "tmp", manifest_name)
    selector_name = f"half-plus-two-service-{processor}-{rand_int}"

    search_replace_dict = {
        "<MODEL_NAME>": model_name,
        "<MODEL_BASE_PATH>": f"s3://tensoflow-trained-models/{model_name}",
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": selector_name,
        "<DOCKER_IMAGE_BUILD_ID>": tensorflow_inference,
    }
    if processor == "gpu":
        search_replace_dict["<NUM_GPUS>"] = "1"

    template_path = eks_utils.get_single_node_inference_template_path("tensorflow", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, search_replace_dict)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(
                selector_name, port_to_forward, "8500")

        assert test_utils.request_tensorflow_inference(model_name=model_name,
                                                       port=port_to_forward)
    except ValueError as excp:
        # NOTE(review): a ValueError raised in the try block is only logged, so the test
        # finishes without failing in that case. Confirm this is intended.
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
示例#17
0
def test_eks_mxnet_gluonnlp_inference(mxnet_inference, py3_only):
    """
    Deploy the GluonNLP BERT SST model on EKS and send it an inference request.

    The single-node MXNet inference template is rendered to a YAML file under
    /tmp and applied with kubectl. The request is then sent to a randomly
    chosen local port. The deployment and service are deleted whether or not
    the request succeeds.

    Args:
        :param mxnet_inference: image URI substituted for <DOCKER_IMAGE_BUILD_ID>;
            images whose URI contains "eia" are skipped
        :param py3_only: not referenced in the body
    """
    if "eia" in mxnet_inference:
        pytest.skip("Skipping EKS Test for EIA")
    num_replicas = "1"

    rand_int = random.randint(4001, 6000)

    processor = "gpu" if "gpu" in mxnet_inference else "cpu"

    model = "https://aws-dlc-sample-models.s3.amazonaws.com/bert_sst/bert_sst.mar"
    yaml_path = os.path.join(
        os.sep, "tmp",
        f"mxnet_single_node_gluonnlp_{processor}_inference_{rand_int}.yaml")
    # The same generated name is used for both the service and its selector.
    inference_service_name = selector_name = f"gluonnlp-service-{processor}-{rand_int}"

    search_replace_dict = {
        "<MODELS>": model,
        "<NUM_REPLICAS>": num_replicas,
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": mxnet_inference
    }

    # Only the GPU template variant receives a GPU count.
    if processor == "gpu":
        search_replace_dict["<NUM_GPUS>"] = "1"

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.get_single_node_inference_template_path("mxnet", processor),
        yaml_path, search_replace_dict)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(
                selector_name, port_to_forward, "8080")

        assert test_utils.request_mxnet_inference_gluonnlp(
            port=port_to_forward)
    except ValueError as excp:
        eks_utils.LOGGER.error("Service is not running: %s", excp)
        # Re-raise: the log message treats ValueError as "service is not running",
        # and swallowing it would report a pass although no inference was verified.
        raise
    finally:
        # Always tear down the resources created by the manifest.
        run(f"kubectl delete deployment {selector_name}")
        run(f"kubectl delete service {selector_name}")
示例#18
0
def test_eks_mxnet_multinode_training(mxnet_training, example_only):
    """
    Run MXNet distributed (parameter server) training on EKS with the MNIST dataset.

    A job manifest is rendered from the multi-node GPU training template into
    /tmp and handed to _run_eks_mxnet_multi_node_training; the test fails when
    that helper returns a falsy result.

    Args:
        :param mxnet_training: image URI substituted for <CONTAINER_IMAGE>
        :param example_only: not referenced in the body
    """
    # Seed with the image URI plus a microsecond timestamp before drawing the id.
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    random.seed(f"{mxnet_training}-{timestamp}")
    suffix = random.randint(1, 6000)

    k8s_namespace = f"mxnet-multi-node-training-{suffix}"
    k8s_job = f"kubeflow-mxnet-gpu-dist-job-{suffix}"

    template_path = os.path.join(
        "eks", "eks_manifest_templates", "mxnet", "training",
        "multi_node_gpu_training.yaml")
    rendered_path = os.path.join(
        os.sep, "tmp", f"mxnet_multi_node_training_{suffix}.yaml")

    # TODO: This should either be dynamic or at least global variables
    # Epochs, layers and gpus carry embedded double quotes in their values.
    substitutions = {
        "<JOB_NAME>": k8s_job,
        "<NUM_SERVERS>": "2",
        "<NUM_WORKERS>": "3",
        "<CONTAINER_IMAGE>": mxnet_training,
        "<EPOCHS>": '"20"',
        "<LAYERS>": '"2"',
        "<GPUS>": '"0"',
        "<GPU_LIMIT>": "1",
    }

    eks_utils.write_eks_yaml_file_from_template(
        template_path, rendered_path, substitutions)

    succeeded = _run_eks_mxnet_multi_node_training(
        k8s_namespace, k8s_job, rendered_path)
    assert succeeded, "EKS multinode training failed"
def test_eks_mxnet_neuron_inference(mxnet_inference, neuron_only):
    """
    Deploy the MXNet ResNet50 neuron model server on EKS and send it an inference request.

    Runs only for images whose URI contains "neuron" and does not contain "eia".
    The deployment and service are deleted whether or not the request succeeds.

    Args:
        :param mxnet_inference: image URI substituted for <DOCKER_IMAGE_BUILD_ID>
        :param neuron_only: not referenced in the body
    """
    if "eia" in mxnet_inference or "neuron" not in mxnet_inference:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    suffix = random.randint(4001, 6000)
    processor = "neuron"

    manifest_path = os.path.join(
        os.sep, "tmp", f"mxnet_single_node_{processor}_inference_{suffix}.yaml")
    # The same generated name is used for both the service and its selector.
    service_name = f"resnet50-{processor}-{suffix}"

    substitutions = {
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": service_name,
        "<INFERENCE_SERVICE_NAME>": service_name,
        "<DOCKER_IMAGE_BUILD_ID>": mxnet_inference,
        "<SERVER_CMD>": "/usr/local/bin/entrypoint.sh -m mxnet-resnet50=https://aws-dlc-sample-models.s3.amazonaws.com/mxnet/Resnet50-neuron.mar -t /home/model-server/config.properties",
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path(
        "mxnet", processor)
    eks_utils.write_eks_yaml_file_from_template(
        template_path, manifest_path, substitutions)

    try:
        run(f"kubectl apply -f {manifest_path}")

        local_port = random.randint(49152, 65535)

        if eks_utils.is_service_running(service_name):
            eks_utils.eks_forward_port_between_host_and_container(
                service_name, local_port, "8080")

        assert test_utils.request_mxnet_inference(
            port=local_port, model="mxnet-resnet50")
    finally:
        # Always tear down the resources created by the manifest.
        run(f"kubectl delete deployment {service_name}")
        run(f"kubectl delete service {service_name}")
def test_eks_mxnet_squeezenet_inference(mxnet_inference):
    """
    Deploy the SqueezeNet v1.1 model on EKS and send it an inference request.

    Images whose URI contains "eia" or "neuron" are skipped. The deployment and
    service are deleted whether or not the request succeeds.

    Args:
        :param mxnet_inference: image URI substituted for <DOCKER_IMAGE_BUILD_ID>
    """
    if "eia" in mxnet_inference or "neuron" in mxnet_inference:
        pytest.skip("Skipping EKS Test for EIA and neuron images")

    suffix = random.randint(4001, 6000)
    on_gpu = "gpu" in mxnet_inference
    processor = "gpu" if on_gpu else "cpu"

    manifest_path = os.path.join(
        os.sep, "tmp", f"mxnet_single_node_{processor}_inference_{suffix}.yaml")
    # The same generated name is used for both the service and its selector.
    service_name = f"squeezenet-service-{suffix}"

    substitutions = {
        "<MODELS>": "squeezenet=https://s3.amazonaws.com/model-server/models/squeezenet_v1.1/squeezenet_v1.1.model",
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": service_name,
        "<INFERENCE_SERVICE_NAME>": service_name,
        "<DOCKER_IMAGE_BUILD_ID>": mxnet_inference,
    }
    # Only the GPU template variant receives a GPU count.
    if on_gpu:
        substitutions["<NUM_GPUS>"] = "1"

    template_path = eks_utils.get_single_node_inference_template_path(
        "mxnet", processor)
    eks_utils.write_eks_yaml_file_from_template(
        template_path, manifest_path, substitutions)

    try:
        run(f"kubectl apply -f {manifest_path}")

        local_port = random.randint(49152, 65535)

        if eks_utils.is_service_running(service_name):
            eks_utils.eks_forward_port_between_host_and_container(
                service_name, local_port, "8080")

        assert test_utils.request_mxnet_inference(port=local_port)
    finally:
        # Always tear down the resources created by the manifest.
        run(f"kubectl delete deployment {service_name}")
        run(f"kubectl delete service {service_name}")
def test_eks_pytorch_multinode_node_training(pytorch_training, example_only):
    """
    Function to create multiple pods using kubectl and given container image, and run PyTorch training

    A job manifest is rendered from the multi-node GPU training template into
    /tmp and handed to run_eks_pytorch_multi_node_training.

    Args:
        :param pytorch_training: the ECR URI
        :param example_only: not referenced in the body
    """
    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    # Seed with the image URI plus a microsecond timestamp before drawing the id.
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    random.seed(f"{pytorch_training}-{timestamp}")
    suffix = random.randint(1, 6000)

    k8s_namespace = f"pytorch-multi-node-training-{suffix}"
    k8s_app = f"eks-pytorch-mnist-app-{suffix}"
    k8s_job = f"kubeflow-pytorch-gpu-dist-job-{suffix}"

    template_path = os.path.join(
        "eks", "eks_manifest_templates", "pytorch", "training",
        "multi_node_gpu_training.yaml")
    rendered_path = os.path.join(
        os.sep, "tmp", f"pytorch_multinode_node_training_{suffix}.yaml")

    # The epochs value carries embedded double quotes.
    substitutions = {
        "<JOB_NAME>": k8s_job,
        "<NUM_MASTERS>": "1",
        "<NUM_WORKERS>": "3",
        "<CONTAINER_IMAGE>": pytorch_training,
        "<BACKEND>": "gloo",
        "<EPOCHS>": '"10"',
        "<GPU_LIMIT>": "1",
    }

    eks_utils.write_eks_yaml_file_from_template(
        template_path, rendered_path, substitutions)
    run_eks_pytorch_multi_node_training(
        k8s_namespace, k8s_app, k8s_job, rendered_path, suffix)
def test_eks_pytorch_single_node_training(pytorch_training):
    """
    Function to create a pod using kubectl and given container image, and run PyTorch training

    The pod clones pytorch/examples, patches the MNIST example script and runs
    it. The test passes when the pod logs contain "Accuracy". The pod is
    deleted whether or not training succeeds.

    Args:
        :param pytorch_training: the ECR URI
    """
    suffix = random.randint(4001, 6000)

    manifest_path = os.path.join(
        os.sep, "tmp", f"pytorch_single_node_training_{suffix}.yaml")
    pod_name = f"pytorch-single-node-training-{suffix}"

    # Workaround for https://github.com/pytorch/vision/issues/1938 and https://github.com/pytorch/vision/issues/3549
    # The shell snippet writes a new script header, appends the original
    # main.py to it, and moves the result back over examples/mnist/main.py.
    mnist_patch_cmd = '''
      FILE=new_main.py &&
      echo "from __future__ import print_function" > $FILE &&
      echo "from six.moves import urllib" >> $FILE &&
      echo "from packaging.version import Version" >> $FILE &&
      echo "opener = urllib.request.build_opener()" >> $FILE &&
      echo "opener.addheaders = [('User-agent', 'Mozilla/5.0')]" >> $FILE &&
      echo "urllib.request.install_opener(opener)" >> $FILE &&
      echo "import torchvision" >> $FILE &&
      echo "from torchvision import datasets, transforms" >> $FILE &&
      echo "# from torchvision 0.9.1, 2 candidate mirror website links will be added before resources items automatically" >> $FILE &&
      echo "# Reference PR https://github.com/pytorch/vision/pull/3559" >> $FILE &&
      echo "TORCHVISION_VERSION = '0.9.1'" >> $FILE &&
      echo "if Version(torchvision.__version__) < Version(TORCHVISION_VERSION):" >> $FILE &&
      echo "    datasets.MNIST.resources = [" >> $FILE &&
      echo "          ('https://dlinfra-mnist-dataset.s3-us-west-2.amazonaws.com/mnist/train-images-idx3-ubyte.gz', 'f68b3c2dcbeaaa9fbdd348bbdeb94873')," >> $FILE &&
      echo "          ('https://dlinfra-mnist-dataset.s3-us-west-2.amazonaws.com/mnist/train-labels-idx1-ubyte.gz', 'd53e105ee54ea40749a09fcbcd1e9432')," >> $FILE &&
      echo "          ('https://dlinfra-mnist-dataset.s3-us-west-2.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz', '9fb629c4189551a2d022fa330f9573f3')," >> $FILE &&
      echo "          ('https://dlinfra-mnist-dataset.s3-us-west-2.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz', 'ec29112dd5afa0611ce80d1b7f02629c')" >> $FILE &&
      echo "          ]" >> $FILE &&
      sed -i '1d' examples/mnist/main.py &&
      sed -i '6d' examples/mnist/main.py &&
      cat examples/mnist/main.py >> $FILE &&
      rm examples/mnist/main.py &&
      mv $FILE examples/mnist/main.py
    '''

    pod_args = f"git clone https://github.com/pytorch/examples.git && {mnist_patch_cmd}  && python examples/mnist/main.py"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    # True division yields a float, so the template receives "36.0".
    cpu_limit = str(72 / 2)

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": pod_args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path,
        substitutions)

    succeeded = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            succeeded = "Accuracy" in pod_logs
            if not succeeded:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)
        assert succeeded, "Training failed"
    finally:
        # Always remove the training pod.
        run(f"kubectl delete pods {pod_name}")
def test_eks_pytorch_dgl_single_node_training(pytorch_training, py3_only):
    """
    Function to create a pod using kubectl and given container image, and run
    DGL training with PyTorch backend

    The pod clones a DGL branch chosen from the framework version and runs the
    GCN example on the cora dataset. The test passes when the pod logs contain
    "Test accuracy". The pod is deleted whether or not training succeeds.

    Args:
        :param pytorch_training: the ECR URI
        :param py3_only: not referenced in the body
    """
    _, image_framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    image_cuda_version = get_cuda_version_from_tag(pytorch_training)
    framework_version = Version(image_framework_version)
    if framework_version == Version("1.6") and image_cuda_version == "cu110":
        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")
    # TODO: Remove when DGL gpu test on ecs get fixed
    if framework_version >= Version("1.10"):
        pytest.skip("ecs test for DGL gpu fails since pt 1.10")

    suffix = random.randint(4001, 6000)

    manifest_path = os.path.join(
        os.sep, "tmp", f"pytorch_single_node_training_dgl_{suffix}.yaml")
    pod_name = f"pytorch-single-node-training-dgl-{suffix}"

    # Images below PyTorch 1.7 use the older DGL branch.
    older_framework = is_below_framework_version("1.7", pytorch_training, "pytorch")
    dgl_branch = "0.4.x" if older_framework else "0.7.x"

    # "--gpu 0" is passed for GPU images, "--gpu -1" otherwise.
    gpu_flag = " --gpu 0" if "gpu" in pytorch_training else " --gpu -1"
    pod_args = (
        f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git && "
        "cd /dgl/examples/pytorch/gcn/ && DGLBACKEND=pytorch python train.py --dataset cora"
        f"{gpu_flag}"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    # True division yields a float, so the template receives "36.0".
    cpu_limit = str(72 / 2)

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": pod_args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path,
        substitutions)

    succeeded = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            succeeded = "Test accuracy" in pod_logs
            if not succeeded:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)

        assert succeeded, "Training failed"
    finally:
        # Always remove the training pod.
        run(f"kubectl delete pods {pod_name}")
def test_eks_mxnet_dgl_single_node_training(mxnet_training, py3_only):
    """
    Function to create a pod using kubectl and given container image, and run
    DGL training with MXNet backend

    A local docker container is started from the image to read the installed
    DGL version, which selects the "<major>.<minor>.x" branch of the DGL
    repository to clone inside the pod. The test passes when the pod logs
    contain "Test accuracy". The helper container and the pod are both removed.

    Args:
        :param mxnet_training: the ECR URI
        :param py3_only: not referenced in the body
    """

    # TODO: remove/update this when DGL supports MXNet 1.9
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    if Version(framework_version) >= Version('1.9.0'):
        pytest.skip("Skipping DGL tests as DGL does not yet support MXNet 1.9")

    training_result = False
    rand_int = random.randint(4001, 6000)

    yaml_path = os.path.join(
        os.sep, "tmp", f"mxnet_single_node_training_dgl_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-dgl-{rand_int}"

    ctx = Context()
    # Run container to determine dgl version
    container_name = get_container_name("dgl-mx", mxnet_training)
    ctx.run(f"docker run --name {container_name} -itd {mxnet_training}")
    try:
        dgl_version = ctx.run(
            f"docker exec --user root {container_name} python -c 'import dgl; print(dgl.__version__)'"
        ).stdout.strip()
    finally:
        # The container is only needed for the version query; remove it so it
        # does not keep running. warn=True keeps a cleanup failure from masking
        # the real error.
        ctx.run(f"docker rm -f {container_name}", warn=True)

    # Dots are escaped so only a literal "<major>.<minor>" prefix matches, and a
    # bare two-component version such as "0.6" is accepted as well.
    version_match = re.match(r"(\d+\.\d+)", dgl_version)
    assert version_match is not None, (
        f"Could not parse DGL version from {dgl_version!r}")
    dgl_major_minor = version_match.group(1)
    dgl_branch = f"{dgl_major_minor}.x"

    args = (
        f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git && "
        f"cd /dgl/examples/mxnet/gcn/ && DGLBACKEND=mxnet python train.py --dataset cora"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    # True division yields a float, so the template receives "36.0".
    cpu_limit = 72
    cpu_limit = str(int(cpu_limit) / 2)

    # "--gpu 0" is passed for GPU images, "--gpu -1" otherwise.
    if "gpu" in mxnet_training:
        args = args + " --gpu 0"
    else:
        args = args + " --gpu -1"

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path,
        search_replace_dict)

    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            dgl_out = run(f"kubectl logs {pod_name}").stdout
            if "Test accuracy" in dgl_out:
                training_result = True
            else:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(dgl_out)

        assert training_result, "Training failed"
    finally:
        # Always remove the training pod.
        run(f"kubectl delete pods {pod_name}")
示例#25
0
def test_eks_mxnet_gluonnlp_single_node_training(mxnet_training, py3_only):
    """
    Function to create a pod using kubectl and given container image, and run
    GluonNLP sentiment-analysis training

    The pod checks out gluon-nlp v0.9.0 and runs sentiment_analysis_cnn.py on
    the TREC data. The test passes when the pod logs report a "test acc" value
    of at least 0.75. The pod is deleted whether or not training succeeds.

    Args:
        :param mxnet_training: the ECR URI
        :param py3_only: not referenced in the body
    """
    suffix = random.randint(4001, 6000)

    manifest_path = os.path.join(
        os.sep, "tmp", f"mxnet_single_node_training_gluonnlp_{suffix}.yaml")
    pod_name = f"mxnet-single-node-training-gluonnlp-{suffix}"

    pod_args = (
        "git clone -b master https://github.com/dmlc/gluon-nlp.git && "
        "cd gluon-nlp && git checkout v0.9.0 &&"
        "cd ./scripts/sentiment_analysis/ &&"
        "python sentiment_analysis_cnn.py --batch_size 50 --epochs 20 --dropout 0.5 "
        "--model_mode multichannel --data_name TREC")
    # Only GPU images get an explicit device flag.
    if "gpu" in mxnet_training:
        pod_args += " --gpu 0"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    # True division yields a float, so the template receives "36.0".
    cpu_limit = str(72 / 2)

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": pod_args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path,
        substitutions)

    succeeded = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout

            # Take the first "test acc <number>" occurrence in the logs.
            acc_match = re.search(r"test acc ((?:\d*\.\d+)|\d+)", pod_logs)
            if acc_match is not None:
                accuracy = float(acc_match.group(1))
                succeeded = accuracy >= 0.75

                if succeeded:
                    eks_utils.LOGGER.info(
                        f"GluonNLP EKS test succeeded with accuracy {accuracy} >= 0.75")
                else:
                    eks_utils.LOGGER.info(
                        f"GluonNLP EKS test FAILED with accuracy {accuracy} < 0.75")
                    eks_utils.LOGGER.debug(pod_logs)

        assert succeeded, "Training failed"
    finally:
        # Always remove the training pod.
        run(f"kubectl delete pods {pod_name}")