def _run_eks_mxnet_multinode_training_horovod_mpijob(example_image_uri, cluster_size, eks_gpus_per_worker):
    """
    Render the MXNet Horovod MNIST manifest for this run and launch it as an MPIJob on EKS.

    :param example_image_uri: URI of the example image; substituted for <CONTAINER_IMAGE>
    :param cluster_size: value substituted for <NUM_WORKERS> in the manifest
    :param eks_gpus_per_worker: value substituted (stringified) for <GPUS> in the manifest
    :return: None
    """
    LOGGER.info("Starting run_eks_mxnet_multi_node_training on MNIST dataset using horovod")
    LOGGER.info("The test will run on an example image %s", example_image_uri)

    user = Context().run("echo $USER").stdout.strip("\n")
    # Seed with the image URI plus a microsecond timestamp before drawing the tag suffix.
    random.seed(f"{example_image_uri}-{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_tag = f"{user}-{random.randint(1, 10000)}"

    namespace = f"mx-multi-node-train-{'py2' if 'py2' in example_image_uri else 'py3'}-{unique_tag}"
    job_name = f"mxnet-mnist-horovod-job-{unique_tag}"
    LOGGER.debug("Namespace: %s", namespace)

    local_template_file_path = os.path.join(
        "eks", "eks_manifest_templates", "mxnet", "training", "multi_node_training_horovod_mnist.yaml"
    )
    # The rendered manifest gets an MXNet-specific name. A "tensorflow_multi_node_training_<tag>.yaml"
    # name would share its pattern with the TensorFlow horovod job's manifest, so two runs that draw
    # the same tag could overwrite each other's file.
    remote_yaml_file_path = os.path.join(
        os.sep, "tmp", f"mxnet_multi_node_training_horovod_{unique_tag}.yaml"
    )

    replace_dict = {
        "<JOB_NAME>": job_name,
        "<NUM_WORKERS>": cluster_size,
        "<CONTAINER_IMAGE>": example_image_uri,
        "<GPUS>": str(eks_gpus_per_worker),
    }

    eks_utils.write_eks_yaml_file_from_template(local_template_file_path, remote_yaml_file_path, replace_dict)

    _run_eks_multi_node_training_mpijob(namespace, job_name, remote_yaml_file_path)
def test_eks_mxnet_single_node_training(mxnet_training):
    """
    Create a pod with kubectl from the given container image and run MXNet MNIST training in it.

    The training script comes from the incubator-mxnet branch matching the image's
    major.minor version ("v<major>.<minor>.x"). The test passes when the pod logs contain
    "Epoch[19] Validation-accuracy". The pod is deleted afterwards regardless of the outcome.

    :param mxnet_training: the ECR URI
    :raises ValueError: if no "<major>.<minor>" version can be found in the URI
    """
    training_result = False

    rand_int = random.randint(4001, 6000)

    framework_version_search = re.search(r"\d+\.\d+", mxnet_training)
    if framework_version_search is None:
        # Fail with a clear message instead of an AttributeError on None.group().
        raise ValueError(f"Could not find a framework version in image URI: {mxnet_training}")
    framework_version = "v" + framework_version_search.group() + ".x"

    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-{rand_int}"

    # Temporary fix for 503 error while downloading MNIST dataset. See https://github.com/pytorch/vision/issues/3549
    # Raw strings: "\/" and "\." are not valid Python escapes, so a non-raw literal triggers an
    # invalid-escape warning while producing exactly this same text.
    mnist_dataset_download_config = (
        r' FROM="http:\/\/yann\.lecun\.com\/exdb\/mnist\/" && '
        r'TO="https:\/\/ossci-datasets\.s3\.amazonaws\.com\/mnist\/" && '
        r'sed -i -e "s/${FROM}/${TO}/g" /incubator-mxnet/example/image-classification/train_mnist.py '
    )

    args = (
        f"git clone -b {framework_version} https://github.com/apache/incubator-mxnet.git && {mnist_dataset_download_config} && python "
        f"/incubator-mxnet/example/image-classification/train_mnist.py"
    )

    processor_type = "gpu" if "gpu" in mxnet_training else "cpu"
    if processor_type == "gpu":
        args = args + " --gpus 0"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    # True division: the substituted value is the string "36.0".
    cpu_limit = str(int(cpu_limit) / 2)

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict
    )

    try:
        run("kubectl create -f {}".format(yaml_path))

        if eks_utils.is_eks_training_complete(pod_name):
            mxnet_out = run("kubectl logs {}".format(pod_name)).stdout
            if "Epoch[19] Validation-accuracy" in mxnet_out:
                training_result = True
            else:
                # Dump the pod output to help diagnose a run that finished without the marker.
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(mxnet_out)

        assert training_result, "Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))
def test_eks_mxnet_dgl_single_node_training(mxnet_training, py3_only):
    """
    Launch a single pod from the given image and run the DGL GCN example with the MXNet backend.

    The test passes when the pod logs contain "Test accuracy". GPU images tagged "cu110" are
    skipped. The pod is deleted afterwards regardless of the outcome.

    :param mxnet_training: the ECR URI
    :param py3_only: fixture argument; not referenced in the body
    """
    run_id = random.randint(4001, 6000)
    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_dgl_{run_id}.yaml")
    pod_name = f"mxnet-single-node-training-dgl-{run_id}"

    dgl_branch = "0.4.x"
    train_cmd = (
        f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git && "
        "cd /dgl/examples/mxnet/gcn/ && DGLBACKEND=mxnet python train.py --dataset cora"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = str(72 / 2)

    on_gpu = "gpu" in mxnet_training
    if on_gpu and "cu110" in mxnet_training:
        pytest.skip("Skipping DGL tests for GPU until dgl-cu110 is available.")
    train_cmd += " --gpu 0" if on_gpu else " --gpu -1"

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": train_cmd,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, substitutions
    )

    succeeded = False
    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            succeeded = "Test accuracy" in pod_logs
            if not succeeded:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)

        assert succeeded, "Training failed"
    finally:
        run(f"kubectl delete pods {pod_name}")
def test_eks_pytorch_single_node_training(pytorch_training):
    """
    Launch a single pod from the given image and run the PyTorch MNIST example in it.

    Before training, examples/mnist/main.py is rewritten in the pod so that a urllib opener with
    a "Mozilla/5.0" User-agent header is installed ahead of the original script body. The test
    passes when the pod logs contain "Accuracy". The pod is deleted afterwards.

    :param pytorch_training: the ECR URI
    """
    run_id = random.randint(4001, 6000)
    yaml_path = os.path.join(os.sep, "tmp", f"pytorch_single_node_training_{run_id}.yaml")
    pod_name = f"pytorch-single-node-training-{run_id}"

    # Shell steps, chained with "&&": write a new header into $FILE, drop the first line of the
    # original main.py, append the remainder and move the result back over main.py.
    patch_steps = [
        'FILE=new_main.py',
        'echo "from __future__ import print_function" > $FILE',
        'echo "from six.moves import urllib" >> $FILE',
        'echo "opener = urllib.request.build_opener()" >> $FILE',
        '''echo "opener.addheaders = [('User-agent', 'Mozilla/5.0')]" >> $FILE''',
        'echo "urllib.request.install_opener(opener)" >> $FILE',
        "sed -i '1d' examples/mnist/main.py",
        'cat examples/mnist/main.py >> $FILE',
        'rm examples/mnist/main.py',
        'mv $FILE examples/mnist/main.py',
    ]
    mnist_dataset_download_config = " " + " && ".join(patch_steps) + " "

    train_cmd = f"git clone https://github.com/pytorch/examples.git && {mnist_dataset_download_config} && python examples/mnist/main.py"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = str(72 / 2)

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": train_cmd,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, substitutions
    )

    succeeded = False
    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            succeeded = "Accuracy" in pod_logs
            if not succeeded:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)

        assert succeeded, "Training failed"
    finally:
        run(f"kubectl delete pods {pod_name}")
def test_eks_pytorch_densenet_inference(pytorch_inference):
    """
    Deploy a single-replica PyTorch densenet inference service on EKS and send it a request.

    EIA and neuron images are skipped. The model archive and server command depend on the
    server type reported for the image ("ts" selects torchserve, anything else
    multi-model-server). A ValueError raised inside the try block is logged, not re-raised.
    The deployment and service are deleted afterwards.

    :param pytorch_inference: the ECR URI
    """
    server_type = test_utils.get_inference_server_type(pytorch_inference)

    if "eia" in pytorch_inference:
        pytest.skip("Skipping EKS Test for EIA")
    if "neuron" in pytorch_inference:
        pytest.skip("Neuron specific test is run and so skipping this test for Neuron")

    if server_type == "ts":
        model = "pytorch-densenet=https://torchserve.s3.amazonaws.com/mar_files/densenet161.mar"
        server_cmd = "torchserve"
    else:
        model = "pytorch-densenet=https://dlc-samples.s3.amazonaws.com/pytorch/multi-model-server/densenet/densenet.mar"
        server_cmd = "multi-model-server"

    run_id = random.randint(4001, 6000)
    processor = "gpu" if "gpu" in pytorch_inference else "cpu"

    yaml_path = os.path.join(os.sep, "tmp", f"pytorch_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"densenet-service-{processor}-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<MODELS>": model,
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": pytorch_inference,
        "<SERVER_TYPE>": server_type,
        "<SERVER_CMD>": server_cmd,
    }
    if processor == "gpu":
        substitutions["<NUM_GPUS>"] = "1"

    template_path = eks_utils.get_single_node_inference_template_path("pytorch", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8080")

        assert test_utils.request_pytorch_inference_densenet(port=port_to_forward, server_type=server_type)
    except ValueError as excp:
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_mxnet_single_node_training(mxnet_training):
    """
    Create a pod with kubectl from the given container image and run MXNet MNIST training in it.

    The training script comes from the incubator-mxnet ref named after the image's framework
    version: a three-component version found in the URI is used as-is, otherwise a
    two-component version is extended with ".0". The test passes when the pod logs contain
    "Epoch[19] Validation-accuracy". The pod is deleted afterwards regardless of the outcome.

    :param mxnet_training: the ECR URI
    :raises ValueError: if the URI contains neither a three- nor a two-component version
    """
    training_result = False

    rand_int = random.randint(4001, 6000)

    # Check the match object BEFORE calling .group(): calling it first made the two-component
    # fallback unreachable and raised AttributeError whenever the first pattern did not match.
    framework_version_search = re.search(r"\d+(\.\d+){2}", mxnet_training)
    if framework_version_search:
        framework_version = framework_version_search.group()
    else:
        framework_version_search = re.search(r"\d+\.\d+", mxnet_training)
        if framework_version_search is None:
            raise ValueError(f"Could not find a framework version in image URI: {mxnet_training}")
        framework_version = framework_version_search.group() + ".0"

    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-{rand_int}"

    args = (
        f"git clone -b {framework_version} https://github.com/apache/incubator-mxnet.git && python "
        f"/incubator-mxnet/example/image-classification/train_mnist.py"
    )

    processor_type = "gpu" if "gpu" in mxnet_training else "cpu"
    if processor_type == "gpu":
        args = args + " --gpus 0"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    # True division: the substituted value is the string "36.0".
    cpu_limit = str(int(cpu_limit) / 2)

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict
    )

    try:
        run("kubectl create -f {}".format(yaml_path))

        if eks_utils.is_eks_training_complete(pod_name):
            mxnet_out = run("kubectl logs {}".format(pod_name)).stdout
            if "Epoch[19] Validation-accuracy" in mxnet_out:
                training_result = True
            else:
                # Dump the pod output to help diagnose a run that finished without the marker.
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(mxnet_out)

        assert training_result, "Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))
def test_eks_pytorch_neuron_inference(pytorch_inference, neuron_only):
    """
    Deploy a single-replica PyTorch neuron inference service on EKS and send it a request.

    Non-neuron images are skipped. The neuron device plugin manifest is deleted and re-applied
    (with fixed sleeps) before the service manifest is applied. A ValueError raised inside the
    try block triggers a cluster-info dump and is logged, not re-raised. The deployment and
    service are deleted afterwards.

    :param pytorch_inference: the ECR URI
    :param neuron_only: fixture argument; not referenced in the body
    """
    server_type = test_utils.get_inference_server_type(pytorch_inference)
    if "neuron" not in pytorch_inference:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    model = "pytorch-resnet-neuron=https://aws-dlc-sample-models.s3.amazonaws.com/pytorch/Resnet50-neuron.mar"
    server_cmd = f"/usr/local/bin/entrypoint.sh -m {model} -t /home/model-server/config.properties"

    run_id = random.randint(4001, 6000)
    processor = "neuron"

    yaml_path = os.path.join(os.sep, "tmp", f"pytorch_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"resnet-{processor}-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": pytorch_inference,
        "<SERVER_TYPE>": server_type,
        "<SERVER_CMD>": server_cmd,
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path("pytorch", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    device_plugin_path = eks_utils.get_device_plugin_path("pytorch", processor)

    try:
        # TODO - once eksctl gets the latest neuron device plugin this can be removed
        run(f"kubectl delete -f {device_plugin_path}")
        sleep(60)
        run(f"kubectl apply -f {device_plugin_path}")
        sleep(10)

        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8080")

        assert test_utils.request_pytorch_inference_densenet(port=port_to_forward)
    except ValueError as excp:
        run("kubectl cluster-info dump")
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_pt_s3_plugin_single_node_training(pytorch_training, pt17_and_above_only):
    """
    Launch a single pod from the given image and run the amazon-s3-plugin-for-pytorch
    imagenet example in it.

    Images whose framework version is below 1.8 are skipped. The test passes when the pod logs
    contain "Acc". The pod is deleted afterwards regardless of the outcome.

    :param pytorch_training: the ECR URI
    :param pt17_and_above_only: fixture argument; not referenced in the body
    """
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(image_framework_version) < Version("1.8"):
        pytest.skip("S3 plugin is supported on PyTorch version >=1.8")

    run_id = random.randint(4001, 6000)
    yaml_path = os.path.join(os.sep, "tmp", f"pytorch_s3_single_node_training_{run_id}.yaml")
    pod_name = f"pytorch-s3-single-node-training-{run_id}"

    train_cmd = "git clone https://github.com/aws/amazon-s3-plugin-for-pytorch.git && python amazon-s3-plugin-for-pytorch/examples/s3_imagenet_example.py"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = str(96 / 2)

    if "gpu" in pytorch_training:
        train_cmd += " --gpu 0"

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": train_cmd,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, substitutions
    )

    succeeded = False
    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            succeeded = "Acc" in pod_logs
            if not succeeded:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)

        assert succeeded, "Training failed"
    finally:
        run(f"kubectl delete pods {pod_name}")
def test_eks_tensorflow_single_node_training(tensorflow_training):
    """
    Launch a single pod from the given image and run the Keras mnist_cnn example in it.

    The example's keras imports are rewritten with sed to use tensorflow.keras before it runs.
    The test passes when the pod logs contain "Test accuracy". The pod is deleted afterwards
    regardless of the outcome.

    :param tensorflow_training: the ECR URI
    """
    run_id = random.randint(4001, 6000)
    yaml_path = os.path.join(os.sep, "tmp", f"tensorflow_single_node_training_{run_id}.yaml")
    pod_name = f"tensorflow-single-node-training-{run_id}"

    train_cmd = " && ".join(
        [
            "git clone https://github.com/fchollet/keras.git",
            "sed -i 's/import keras/from tensorflow import keras/g; "
            "s/from keras/from tensorflow.keras/g' /keras/examples/mnist_cnn.py",
            "python /keras/examples/mnist_cnn.py",
        ]
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = str(72 / 2)

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": tensorflow_training,
        "<ARGS>": train_cmd,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, substitutions
    )

    succeeded = False
    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            succeeded = "Test accuracy" in pod_logs
            if not succeeded:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)

        assert succeeded, "Training failed"
    finally:
        run(f"kubectl delete pods {pod_name}")
def _run_eks_tensorflow_multinode_training_resnet50_mpijob(example_image_uri, cluster_size, eks_gpus_per_worker):
    """
    Render the TensorFlow multi-node GPU training manifest for this run and launch it as an MPIJob.

    The training script and its argument list are chosen from the major framework version
    parsed out of the image URI ("2" selects the tensorflow2 resnet script).

    :param example_image_uri: URI of the example image; substituted for <CONTAINER_IMAGE>
    :param cluster_size: value substituted for <NUM_WORKERS> in the manifest
    :param eks_gpus_per_worker: value substituted (stringified) for <GPUS> in the manifest
    :return: None
    """
    user = Context().run("echo $USER").stdout.strip("\n")

    framework_version = re.search(r"\d+(\.\d+)+", example_image_uri).group()
    major_version = framework_version.partition(".")[0]
    python_tag = "py2" if "py2" in example_image_uri else "py3"

    # Seed with the image URI plus a microsecond timestamp before drawing the tag suffix.
    random.seed(f"{example_image_uri}-{datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_tag = f"{user}-{random.randint(1, 10000)}"

    namespace = f"tf{major_version}-multi-node-train-{python_tag}-{unique_tag}"
    job_name = f"tf-resnet50-horovod-job-{unique_tag}"

    if major_version == "2":
        script_name = "/deep-learning-models/models/resnet/tensorflow2/train_tf2_resnet.py"
        # NOTE(review): "--batch_size,128" is a single list element here — possibly meant to be
        # "--batch_size","128". Left unchanged; confirm against the training script.
        args_to_pass = '["--synthetic","--batch_size,128","--num_batches","100","--clear_log","2"]'
    else:
        script_name = "/deep-learning-models/models/resnet/tensorflow/train_imagenet_resnet_hvd.py"
        args_to_pass = '["--num_epochs=1","--synthetic"]'

    template_parts = ("eks", "eks_manifest_templates", "tensorflow", "training", "multi_node_gpu_training.yaml")
    local_template_file_path = os.path.join(*template_parts)
    remote_yaml_file_path = os.path.join(os.sep, "tmp", f"tensorflow_multi_node_training_{unique_tag}.yaml")

    replace_dict = {
        "<JOB_NAME>": job_name,
        "<NUM_WORKERS>": cluster_size,
        "<CONTAINER_IMAGE>": example_image_uri,
        "<SCRIPT_NAME>": script_name,
        "<ARGS>": args_to_pass,
        "<GPUS>": str(eks_gpus_per_worker),
    }

    eks_utils.write_eks_yaml_file_from_template(local_template_file_path, remote_yaml_file_path, replace_dict)

    _run_eks_tensorflow_multi_node_training_mpijob(namespace, job_name, remote_yaml_file_path)
def test_eks_mxnet_neuron_inference(mxnet_inference, neuron_only):
    """
    Deploy a single-replica MXNet neuron inference service on EKS and send it a request.

    EIA and non-neuron images are skipped. The neuron device plugin manifest is deleted and
    re-applied (with fixed sleeps) before the service manifest is applied. A ValueError raised
    inside the try block is logged, not re-raised. A cluster-info dump is always taken before
    the deployment and service are deleted.

    :param mxnet_inference: the ECR URI
    :param neuron_only: fixture argument; not referenced in the body
    """
    is_neuron_image = "eia" not in mxnet_inference and "neuron" in mxnet_inference
    if not is_neuron_image:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    run_id = random.randint(4001, 6000)
    processor = "neuron"

    model = "mxnet-resnet50=https://aws-dlc-sample-models.s3.amazonaws.com/mxnet/Resnet50-neuron.mar"
    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"resnet50-{processor}-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<MODELS>": model,
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": mxnet_inference,
        "<NUM_INF1S>": "1",
    }

    device_plugin_path = eks_utils.get_device_plugin_path("mxnet", processor)

    template_path = eks_utils.get_single_node_inference_template_path("mxnet", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    try:
        # TODO - once eksctl gets the latest neuron device plugin this can be removed
        run(f"kubectl delete -f {device_plugin_path}")
        sleep(60)
        run(f"kubectl apply -f {device_plugin_path}")
        sleep(10)

        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8080")

        assert test_utils.request_mxnet_inference(port=port_to_forward, model="mxnet-resnet50")
    except ValueError as excp:
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        run("kubectl cluster-info dump")
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_pytorch_neuron_inference(pytorch_inference, neuron_only):
    """
    Deploy a single-replica PyTorch neuron inference service (torchserve) on EKS and send a
    request for the "pytorch-resnet-neuron" model.

    Non-neuron images are skipped. The deployment and service are deleted afterwards.

    :param pytorch_inference: the ECR URI
    :param neuron_only: fixture argument; not referenced in the body
    """
    server_type = test_utils.get_inference_server_type(pytorch_inference)
    if "neuron" not in pytorch_inference:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    model = "pytorch-resnet-neuron=https://aws-dlc-sample-models.s3.amazonaws.com/pytorch/Resnet50-neuron.mar"
    server_cmd = "torchserve"

    run_id = random.randint(4001, 6000)
    processor = "neuron"

    yaml_path = os.path.join(os.sep, "tmp", f"pytorch_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"resnet-{processor}-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<MODELS>": model,
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": pytorch_inference,
        "<SERVER_TYPE>": server_type,
        "<SERVER_CMD>": server_cmd,
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path("pytorch", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8080")

        assert test_utils.request_pytorch_inference_densenet(
            port=port_to_forward, server_type=server_type, model_name="pytorch-resnet-neuron"
        )
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_tensorflow_neuron_inference(tensorflow_inference, neuron_only):
    """
    Deploy a single-replica TensorFlow neuron inference service on EKS and send a request for
    the "mnist_neuron" model through container port 8500.

    EIA and non-neuron images are skipped. A ValueError raised inside the try block triggers a
    cluster-info dump and is logged, not re-raised. The deployment and service are deleted
    afterwards.

    :param tensorflow_inference: the ECR URI
    :param neuron_only: fixture argument; not referenced in the body
    """
    is_neuron_image = "eia" not in tensorflow_inference and "neuron" in tensorflow_inference
    if not is_neuron_image:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    run_id = random.randint(4001, 6000)
    processor = "neuron"
    model_name = "mnist_neuron"

    yaml_path = os.path.join(os.sep, "tmp", f"tensorflow_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"mnist-{processor}-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<MODEL_NAME>": model_name,
        "<MODEL_BASE_PATH>": "https://aws-dlc-sample-models.s3.amazonaws.com",
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": tensorflow_inference,
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path("tensorflow", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    # The returned path is not used below; the call is retained in case it has side effects.
    secret_yml_path = eks_utils.get_aws_secret_yml_path()

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8500")

        assert test_utils.request_tensorflow_inference(model_name=model_name, port=port_to_forward)
    except ValueError as excp:
        run("kubectl cluster-info dump")
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def __test_eks_tensorflow_half_plus_two_inference(tensorflow_inference):
    """
    Deploy a single-replica TensorFlow half-plus-two inference service on EKS and send it a
    request through container port 8500.

    The container command and arguments come from get_tensorflow_command_args, using the model
    base path returned by get_eks_tensorflow_model_base_path. The deployment and service are
    deleted afterwards.

    :param tensorflow_inference: the ECR URI
    """
    run_id = random.randint(4001, 6000)
    processor = "gpu" if "gpu" in tensorflow_inference else "cpu"
    model_name = f"saved_model_half_plus_two_{processor}"

    yaml_path = os.path.join(os.sep, "tmp", f"tensorflow_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"half-plus-two-service-{processor}-{run_id}"
    inference_service_name = selector_name

    model_base_path = get_eks_tensorflow_model_base_path(tensorflow_inference, model_name)
    command, args = get_tensorflow_command_args(tensorflow_inference, model_name, model_base_path)
    test_type = test_utils.get_eks_k8s_test_type_label(tensorflow_inference)

    substitutions = {
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": tensorflow_inference,
        "<COMMAND>": command,
        "<ARGS>": args,
        "<TEST_TYPE>": test_type,
    }
    if processor == "gpu":
        substitutions["<NUM_GPUS>"] = "1"

    template_path = eks_utils.get_single_node_inference_template_path("tensorflow", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8500")

        assert test_utils.request_tensorflow_inference(model_name=model_name, port=port_to_forward)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_tensorflow_neuron_inference(tensorflow_inference_neuron):
    """
    Deploy a single-replica TensorFlow neuron inference service on EKS and send a request for
    the "mnist_neuron" model through container port 8501.

    The request payload is a single instance of 784 zeros. The deployment and service are
    deleted afterwards.

    :param tensorflow_inference_neuron: the ECR URI
    """
    run_id = random.randint(4001, 6000)
    processor = "neuron"
    model_name = "mnist_neuron"

    yaml_path = os.path.join(os.sep, "tmp", f"tensorflow_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"mnist-{processor}-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<MODEL_NAME>": model_name,
        "<MODEL_BASE_PATH>": "s3://aws-dlc-sample-models",
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": tensorflow_inference_neuron,
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path("tensorflow", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    # The returned path is not used below; the call is retained in case it has side effects.
    secret_yml_path = eks_utils.get_aws_secret_yml_path()

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8501")

        # Single-quoted JSON body: '{"instances": [[0, 0, ..., 0]]}'
        zero_instance = [[0] * 784]
        inference_string = "'" + '{"instances": ' + str(zero_instance) + "}" + "'"

        assert test_utils.request_tensorflow_inference(
            model_name=model_name, port=port_to_forward, inference_string=inference_string
        )
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_tensorflow_half_plus_two_inference(tensorflow_inference):
    """
    Deploy a single-replica TensorFlow half-plus-two inference service on EKS and send it a
    request through container port 8500.

    EIA and neuron images are skipped. A ValueError raised inside the try block is logged, not
    re-raised. The deployment and service are deleted afterwards.

    :param tensorflow_inference: the ECR URI
    """
    if any(tag in tensorflow_inference for tag in ("eia", "neuron")):
        pytest.skip("Skipping EKS Test for EIA and neuron Images")

    run_id = random.randint(4001, 6000)
    processor = "gpu" if "gpu" in tensorflow_inference else "cpu"
    model_name = f"saved_model_half_plus_two_{processor}"

    yaml_path = os.path.join(os.sep, "tmp", f"tensorflow_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"half-plus-two-service-{processor}-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<MODEL_NAME>": model_name,
        "<MODEL_BASE_PATH>": f"s3://tensoflow-trained-models/{model_name}",
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": tensorflow_inference,
    }
    if processor == "gpu":
        substitutions["<NUM_GPUS>"] = "1"

    template_path = eks_utils.get_single_node_inference_template_path("tensorflow", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8500")

        assert test_utils.request_tensorflow_inference(model_name=model_name, port=port_to_forward)
    except ValueError as excp:
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_mxnet_gluonnlp_inference(mxnet_inference, py3_only):
    """
    Deploy a single-replica MXNet GluonNLP (bert_sst) inference service on EKS and send it a
    request through container port 8080.

    EIA images are skipped. A ValueError raised inside the try block is logged, not re-raised.
    The deployment and service are deleted afterwards.

    :param mxnet_inference: the ECR URI
    :param py3_only: fixture argument; not referenced in the body
    """
    if "eia" in mxnet_inference:
        pytest.skip("Skipping EKS Test for EIA")

    run_id = random.randint(4001, 6000)
    processor = "gpu" if "gpu" in mxnet_inference else "cpu"

    model = "https://aws-dlc-sample-models.s3.amazonaws.com/bert_sst/bert_sst.mar"
    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_gluonnlp_{processor}_inference_{run_id}.yaml")
    selector_name = f"gluonnlp-service-{processor}-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<MODELS>": model,
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": mxnet_inference,
    }
    if processor == "gpu":
        substitutions["<NUM_GPUS>"] = "1"

    template_path = eks_utils.get_single_node_inference_template_path("mxnet", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8080")

        assert test_utils.request_mxnet_inference_gluonnlp(port=port_to_forward)
    except ValueError as excp:
        eks_utils.LOGGER.error("Service is not running: %s", excp)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_mxnet_multinode_training(mxnet_training, example_only):
    """
    Run MXNet distributed (parameter server) training on EKS with the MNIST dataset.

    Renders the multi-node GPU training manifest with fixed worker, server, GPU, epoch and
    layer settings, runs it through _run_eks_mxnet_multi_node_training and asserts on the
    returned result.

    :param mxnet_training: the ECR URI
    :param example_only: fixture argument; not referenced in the body
    """
    # Seed with the image URI plus a microsecond timestamp before drawing the run id.
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    random.seed(f"{mxnet_training}-{timestamp}")
    unique_id = random.randint(1, 6000)

    namespace = f"mxnet-multi-node-training-{unique_id}"
    job_name = f"kubeflow-mxnet-gpu-dist-job-{unique_id}"

    template_parts = ("eks", "eks_manifest_templates", "mxnet", "training", "multi_node_gpu_training.yaml")
    local_template_file_path = os.path.join(*template_parts)
    remote_yaml_file_path = os.path.join(os.sep, "tmp", f"mxnet_multi_node_training_{unique_id}.yaml")

    # TODO: This should either be dynamic or at least global variables
    replace_dict = {
        "<JOB_NAME>": job_name,
        "<NUM_SERVERS>": "2",
        "<NUM_WORKERS>": "3",
        "<CONTAINER_IMAGE>": mxnet_training,
        # The inner double quotes are part of the substituted values.
        "<EPOCHS>": '"20"',
        "<LAYERS>": '"2"',
        "<GPUS>": '"0"',
        "<GPU_LIMIT>": "1",
    }

    eks_utils.write_eks_yaml_file_from_template(local_template_file_path, remote_yaml_file_path, replace_dict)

    training_result = _run_eks_mxnet_multi_node_training(namespace, job_name, remote_yaml_file_path)

    assert training_result, "EKS multinode training failed"
def test_eks_mxnet_neuron_inference(mxnet_inference, neuron_only):
    """
    Deploy a single-replica MXNet neuron inference service on EKS and send a request for the
    "mxnet-resnet50" model through container port 8080.

    EIA and non-neuron images are skipped. The deployment and service are deleted afterwards.

    :param mxnet_inference: the ECR URI
    :param neuron_only: fixture argument; not referenced in the body
    """
    is_neuron_image = "eia" not in mxnet_inference and "neuron" in mxnet_inference
    if not is_neuron_image:
        pytest.skip("Skipping EKS Neuron Test for EIA and Non Neuron Images")

    run_id = random.randint(4001, 6000)
    processor = "neuron"

    server_cmd = (
        "/usr/local/bin/entrypoint.sh "
        "-m mxnet-resnet50=https://aws-dlc-sample-models.s3.amazonaws.com/mxnet/Resnet50-neuron.mar "
        "-t /home/model-server/config.properties"
    )

    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"resnet50-{processor}-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": mxnet_inference,
        "<SERVER_CMD>": server_cmd,
        "<NUM_INF1S>": "1",
    }

    template_path = eks_utils.get_single_node_inference_template_path("mxnet", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8080")

        assert test_utils.request_mxnet_inference(port=port_to_forward, model="mxnet-resnet50")
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_mxnet_squeezenet_inference(mxnet_inference):
    """
    Deploy a single-replica MXNet squeezenet inference service on EKS and send it a request
    through container port 8080.

    EIA and neuron images are skipped. The deployment and service are deleted afterwards.

    :param mxnet_inference: the ECR URI
    """
    if any(tag in mxnet_inference for tag in ("eia", "neuron")):
        pytest.skip("Skipping EKS Test for EIA and neuron images")

    run_id = random.randint(4001, 6000)
    processor = "gpu" if "gpu" in mxnet_inference else "cpu"

    model = "squeezenet=https://s3.amazonaws.com/model-server/models/squeezenet_v1.1/squeezenet_v1.1.model"
    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_{processor}_inference_{run_id}.yaml")
    selector_name = f"squeezenet-service-{run_id}"
    inference_service_name = selector_name

    substitutions = {
        "<MODELS>": model,
        "<NUM_REPLICAS>": "1",
        "<SELECTOR_NAME>": selector_name,
        "<INFERENCE_SERVICE_NAME>": inference_service_name,
        "<DOCKER_IMAGE_BUILD_ID>": mxnet_inference,
    }
    if processor == "gpu":
        substitutions["<NUM_GPUS>"] = "1"

    template_path = eks_utils.get_single_node_inference_template_path("mxnet", processor)
    eks_utils.write_eks_yaml_file_from_template(template_path, yaml_path, substitutions)

    try:
        run(f"kubectl apply -f {yaml_path}")

        port_to_forward = random.randint(49152, 65535)

        if eks_utils.is_service_running(selector_name):
            eks_utils.eks_forward_port_between_host_and_container(selector_name, port_to_forward, "8080")

        assert test_utils.request_mxnet_inference(port=port_to_forward)
    finally:
        for resource_kind in ("deployment", "service"):
            run(f"kubectl delete {resource_kind} {selector_name}")
def test_eks_pytorch_multinode_node_training(pytorch_training, example_only):
    """
    Function to render the multi node GPU training manifest for the given container
    image and hand it to run_eks_pytorch_multi_node_training

    Args:
    :param pytorch_training: the ECR URI
    :param example_only: not referenced in the body (presumably a pytest fixture -- confirm in conftest)
    """
    # Re-seed with the image URI plus a microsecond timestamp before drawing the id
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    random.seed(f"{pytorch_training}-{timestamp}")
    unique_id = random.randint(1, 6000)

    namespace = f"pytorch-multi-node-training-{unique_id}"
    app_name = f"eks-pytorch-mnist-app-{unique_id}"
    job_name = f"kubeflow-pytorch-gpu-dist-job-{unique_id}"

    template_path = os.path.join(
        "eks", "eks_manifest_templates", "pytorch", "training", "multi_node_gpu_training.yaml"
    )
    rendered_yaml_path = os.path.join(os.sep, "tmp", f"pytorch_multinode_node_training_{unique_id}.yaml")

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    placeholders = {
        "<JOB_NAME>": job_name,
        "<NUM_MASTERS>": "1",
        "<NUM_WORKERS>": "3",
        "<CONTAINER_IMAGE>": pytorch_training,
        "<BACKEND>": "gloo",
        # The value carries its own double quotes
        "<EPOCHS>": '"10"',
        "<GPU_LIMIT>": "1",
    }

    eks_utils.write_eks_yaml_file_from_template(template_path, rendered_yaml_path, placeholders)

    run_eks_pytorch_multi_node_training(namespace, app_name, job_name, rendered_yaml_path, unique_id)
def test_eks_pytorch_single_node_training(pytorch_training):
    """
    Function to create a pod using kubectl and given container image, and run the
    PyTorch MNIST example in it. The test passes when the pod logs contain "Accuracy".

    Args:
    :param pytorch_training: the ECR URI
    """
    training_succeeded = False

    suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(os.sep, "tmp", f"pytorch_single_node_training_{suffix}.yaml")
    pod_name = f"pytorch-single-node-training-{suffix}"

    # Workaround for https://github.com/pytorch/vision/issues/1938 and https://github.com/pytorch/vision/issues/3549
    # Shell snippet that writes a preamble into new_main.py, appends examples/mnist/main.py
    # (after two sed line deletions) and moves the result back over examples/mnist/main.py.
    mnist_dataset_download_config = ''' FILE=new_main.py && echo "from __future__ import print_function" > $FILE && echo "from six.moves import urllib" >> $FILE && echo "from packaging.version import Version" >> $FILE && echo "opener = urllib.request.build_opener()" >> $FILE && echo "opener.addheaders = [('User-agent', 'Mozilla/5.0')]" >> $FILE && echo "urllib.request.install_opener(opener)" >> $FILE && echo "import torchvision" >> $FILE && echo "from torchvision import datasets, transforms" >> $FILE && echo "# from torchvision 0.9.1, 2 candidate mirror website links will be added before resources items automatically" >> $FILE && echo "# Reference PR https://github.com/pytorch/vision/pull/3559" >> $FILE && echo "TORCHVISION_VERSION = '0.9.1'" >> $FILE && echo "if Version(torchvision.__version__) < Version(TORCHVISION_VERSION):" >> $FILE && echo " datasets.MNIST.resources = [" >> $FILE && echo " ('https://dlinfra-mnist-dataset.s3-us-west-2.amazonaws.com/mnist/train-images-idx3-ubyte.gz', 'f68b3c2dcbeaaa9fbdd348bbdeb94873')," >> $FILE && echo " ('https://dlinfra-mnist-dataset.s3-us-west-2.amazonaws.com/mnist/train-labels-idx1-ubyte.gz', 'd53e105ee54ea40749a09fcbcd1e9432')," >> $FILE && echo " ('https://dlinfra-mnist-dataset.s3-us-west-2.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz', '9fb629c4189551a2d022fa330f9573f3')," >> $FILE && echo " 
('https://dlinfra-mnist-dataset.s3-us-west-2.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz', 'ec29112dd5afa0611ce80d1b7f02629c')" >> $FILE && echo " ]" >> $FILE && sed -i '1d' examples/mnist/main.py && sed -i '6d' examples/mnist/main.py && cat examples/mnist/main.py >> $FILE && rm examples/mnist/main.py && mv $FILE examples/mnist/main.py '''

    args = f"git clone https://github.com/pytorch/examples.git && {mnist_dataset_download_config} && python examples/mnist/main.py"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpu_count = 72
    # True division on purpose: the rendered limit is the string "36.0"
    cpu_limit = str(node_cpu_count / 2)

    placeholders = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, placeholders
    )

    try:
        run("kubectl create -f {}".format(manifest_path))

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run("kubectl logs {}".format(pod_name)).stdout
            training_succeeded = "Accuracy" in pod_logs
            if not training_succeeded:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)

        assert training_succeeded, "Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))
def test_eks_pytorch_dgl_single_node_training(pytorch_training, py3_only):
    """
    Function to create a pod using kubectl and given container image, and run the DGL
    GCN example with PyTorch backend. The test passes when the pod logs contain
    "Test accuracy".

    Args:
    :param pytorch_training: the ECR URI
    :param py3_only: not referenced in the body (presumably a pytest fixture -- confirm in conftest)
    """
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    image_cuda_version = get_cuda_version_from_tag(pytorch_training)
    parsed_framework_version = Version(image_framework_version)

    if parsed_framework_version == Version("1.6") and image_cuda_version == "cu110":
        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")
    # TODO: Remove when DGL gpu test on ecs get fixed
    if parsed_framework_version >= Version("1.10"):
        pytest.skip("ecs test for DGL gpu fails since pt 1.10")

    training_succeeded = False

    suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(os.sep, "tmp", f"pytorch_single_node_training_dgl_{suffix}.yaml")
    pod_name = f"pytorch-single-node-training-dgl-{suffix}"

    # Images below PyTorch 1.7 use the older DGL branch
    dgl_branch = "0.4.x" if is_below_framework_version("1.7", pytorch_training, "pytorch") else "0.7.x"

    # "--gpu 0" for gpu images, "--gpu -1" otherwise
    gpu_flag = " --gpu 0" if "gpu" in pytorch_training else " --gpu -1"
    args = (
        f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git && "
        f"cd /dgl/examples/pytorch/gcn/ && DGLBACKEND=pytorch python train.py --dataset cora"
    ) + gpu_flag

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpu_count = 72
    # True division on purpose: the rendered limit is the string "36.0"
    cpu_limit = str(node_cpu_count / 2)

    placeholders = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, placeholders
    )

    try:
        run("kubectl create -f {}".format(manifest_path))

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run("kubectl logs {}".format(pod_name)).stdout
            training_succeeded = "Test accuracy" in pod_logs
            if not training_succeeded:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)

        assert training_succeeded, "Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))
def test_eks_mxnet_dgl_single_node_training(mxnet_training, py3_only):
    """
    Function to create a pod using kubectl and given container image, and run the DGL
    GCN example with MXNet backend. The test passes when the pod logs contain
    "Test accuracy".

    The DGL git branch to clone is derived from the dgl version installed in the image,
    which is read by starting a short-lived local docker container from the image.

    Args:
    :param mxnet_training: the ECR URI
    :param py3_only: not referenced in the body (presumably a pytest fixture -- confirm in conftest)
    """
    # TODO: remove/update this when DGL supports MXNet 1.9
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    if Version(framework_version) >= Version('1.9.0'):
        pytest.skip("Skipping DGL tests as DGL does not yet support MXNet 1.9")

    training_result = False
    rand_int = random.randint(4001, 6000)

    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_dgl_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-dgl-{rand_int}"

    ctx = Context()
    # Run container to determine dgl version
    container_name = get_container_name("dgl-mx", mxnet_training)
    ctx.run(f"docker run --name {container_name} -itd {mxnet_training}")
    try:
        dgl_version = ctx.run(
            f"docker exec --user root {container_name} python -c 'import dgl; print(dgl.__version__)'"
        ).stdout.strip()
    finally:
        # The container is only needed for the version query; remove it so it does not
        # keep running on the test host. warn=True keeps a cleanup failure from
        # replacing the real test outcome.
        ctx.run(f"docker rm -f {container_name}", warn=True)

    # Keep "<major>.<minor>" of a "<major>.<minor>.<rest>" version; dots are escaped so
    # they only match a literal "."
    version_match = re.search(r"^(\d+\.\d+)\.", dgl_version)
    assert version_match is not None, f"Unable to parse DGL version from {dgl_version!r}"
    dgl_major_minor = version_match.group(1)
    dgl_branch = f"{dgl_major_minor}.x"

    args = (
        f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git && "
        f"cd /dgl/examples/mxnet/gcn/ && DGLBACKEND=mxnet python train.py --dataset cora"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    # True division: the rendered limit is the string "36.0"
    cpu_limit = str(int(cpu_limit) / 2)

    # "--gpu 0" for gpu images, "--gpu -1" otherwise
    if "gpu" in mxnet_training:
        args = args + " --gpu 0"
    else:
        args = args + " --gpu -1"

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict
    )

    try:
        run("kubectl create -f {}".format(yaml_path))

        if eks_utils.is_eks_training_complete(pod_name):
            dgl_out = run("kubectl logs {}".format(pod_name)).stdout
            if "Test accuracy" in dgl_out:
                training_result = True
            else:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(dgl_out)

        assert training_result, "Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))
def test_eks_mxnet_gluonnlp_single_node_training(mxnet_training, py3_only):
    """
    Function to create a pod using kubectl and given container image, and run the
    GluonNLP sentiment analysis CNN script. The test passes when the first
    "test acc <number>" found in the pod logs is at least 0.75.

    Args:
    :param mxnet_training: the ECR URI
    :param py3_only: not referenced in the body (presumably a pytest fixture -- confirm in conftest)
    """
    training_succeeded = False

    suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_gluonnlp_{suffix}.yaml")
    pod_name = f"mxnet-single-node-training-gluonnlp-{suffix}"

    args = (
        "git clone -b master https://github.com/dmlc/gluon-nlp.git && "
        "cd gluon-nlp && git checkout v0.9.0 &&"
        "cd ./scripts/sentiment_analysis/ &&"
        "python sentiment_analysis_cnn.py --batch_size 50 --epochs 20 --dropout 0.5 "
        "--model_mode multichannel --data_name TREC"
    )
    # Only gpu images get a device flag appended
    if "gpu" in mxnet_training:
        args = args + " --gpu 0"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpu_count = 72
    # True division on purpose: the rendered limit is the string "36.0"
    cpu_limit = str(node_cpu_count / 2)

    placeholders = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, placeholders
    )

    try:
        run("kubectl create -f {}".format(manifest_path))

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run("kubectl logs {}".format(pod_name)).stdout

            accuracy_match = re.search(r"test acc ((?:\d*\.\d+)|\d+)", pod_logs)
            if accuracy_match is not None:
                accuracy = float(accuracy_match.group(1))
                training_succeeded = accuracy >= 0.75

                if training_succeeded:
                    eks_utils.LOGGER.info(
                        "GluonNLP EKS test succeeded with accuracy {} >= 0.75".format(accuracy)
                    )
                else:
                    eks_utils.LOGGER.info(
                        "GluonNLP EKS test FAILED with accuracy {} < 0.75".format(accuracy)
                    )
                    eks_utils.LOGGER.debug(pod_logs)

        assert training_succeeded, "Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))