def test_eks_mxnet_single_node_training(mxnet_training):
    """
    Function to create a pod using kubectl and given container image, and run MXNet training

    The incubator-mxnet repository is cloned at the ``v<major>.<minor>.x`` branch derived
    from the image URI, the MNIST download URL inside ``train_mnist.py`` is rewritten with
    ``sed``, and the test passes only if the pod logs contain
    "Epoch[19] Validation-accuracy". The pod is deleted whether or not the check succeeds.

    Args:
        :param mxnet_training: the ECR URI

    Raises:
        ValueError: if no ``<major>.<minor>`` version can be found in ``mxnet_training``
    """
    training_result = False
    rand_int = random.randint(4001, 6000)

    # Fail with a readable message instead of AttributeError on None when the URI
    # carries no version number.
    framework_version_search = re.search(r"\d+\.\d+", mxnet_training)
    if framework_version_search is None:
        raise ValueError(
            f"Could not determine the MXNet version from image URI: {mxnet_training}"
        )
    framework_version = "v" + framework_version_search.group() + ".x"

    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-{rand_int}"

    # Temporary fix for 503 error while downloading MNIST dataset.
    # See https://github.com/pytorch/vision/issues/3549
    # Raw strings keep the backslashes that sed needs without relying on Python's
    # handling of unrecognised escape sequences such as "\/" and "\.".
    mnist_dataset_download_config = (
        r' FROM="http:\/\/yann\.lecun\.com\/exdb\/mnist\/" &&'
        r' TO="https:\/\/ossci-datasets\.s3\.amazonaws\.com\/mnist\/" &&'
        ' sed -i -e "s/${FROM}/${TO}/g"'
        " /incubator-mxnet/example/image-classification/train_mnist.py "
    )

    args = (
        f"git clone -b {framework_version} https://github.com/apache/incubator-mxnet.git"
        f" && {mnist_dataset_download_config} && python "
        f"/incubator-mxnet/example/image-classification/train_mnist.py"
    )

    processor_type = "gpu" if "gpu" in mxnet_training else "cpu"
    if processor_type == "gpu":
        args = args + " --gpus 0"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    cpu_limit = str(int(cpu_limit) / 2)

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict
    )

    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            mxnet_out = run(f"kubectl logs {pod_name}").stdout
            if "Epoch[19] Validation-accuracy" in mxnet_out:
                training_result = True
            else:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(mxnet_out)

        assert training_result, "Training failed"
    finally:
        # Always remove the pod, even when the assertion above fails.
        run(f"kubectl delete pods {pod_name}")
def test_eks_mxnet_dgl_single_node_training(mxnet_training, py3_only):
    """
    Launch a single-node DGL training pod (MXNet backend) with kubectl and check the result.

    DGL is cloned at branch 0.4.x and its GCN example is trained on the cora dataset.
    The test passes only when "Test accuracy" appears in the pod logs. The pod is deleted
    whether or not the check succeeds. GPU images whose URI contains "cu110" are skipped.

    Args:
        :param mxnet_training: the ECR URI
        :param py3_only: not referenced in the body; presumably a pytest fixture that
            restricts the test to Python 3 images (confirm against conftest)
    """
    pod_suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(
        os.sep, "tmp", f"mxnet_single_node_training_dgl_{pod_suffix}.yaml"
    )
    pod_name = f"mxnet-single-node-training-dgl-{pod_suffix}"

    training_command = " && ".join(
        [
            "git clone -b 0.4.x https://github.com/dmlc/dgl.git",
            "cd /dgl/examples/mxnet/gcn/",
            "DGLBACKEND=mxnet python train.py --dataset cora",
        ]
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpus = 72
    cpu_limit = str(node_cpus / 2)

    if "gpu" not in mxnet_training:
        training_command += " --gpu -1"
    else:
        if "cu110" in mxnet_training:
            pytest.skip("Skipping DGL tests for GPU until dgl-cu110 is available.")
        training_command += " --gpu 0"

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": training_command,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, substitutions
    )

    training_result = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            if "Test accuracy" not in pod_logs:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)
            else:
                training_result = True

        assert training_result, "Training failed"
    finally:
        # The pod is removed regardless of the outcome.
        run(f"kubectl delete pods {pod_name}")
def test_eks_pytorch_single_node_training(pytorch_training):
    """
    Launch a single-node PyTorch MNIST training pod with kubectl and check the result.

    The pytorch/examples repository is cloned and ``examples/mnist/main.py`` is rebuilt so
    that it starts with a prelude installing a urllib opener with a browser-like
    User-agent header. The test passes only when "Accuracy" appears in the pod logs.
    The pod is deleted whether or not the check succeeds.

    Args:
        :param pytorch_training: the ECR URI
    """
    pod_suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(
        os.sep, "tmp", f"pytorch_single_node_training_{pod_suffix}.yaml"
    )
    pod_name = f"pytorch-single-node-training-{pod_suffix}"

    # Python lines written to the top of the patched main.py, in order.
    prelude_lines = [
        "from __future__ import print_function",
        "from six.moves import urllib",
        "opener = urllib.request.build_opener()",
        "opener.addheaders = [('User-agent', 'Mozilla/5.0')]",
        "urllib.request.install_opener(opener)",
    ]
    patch_steps = ["FILE=new_main.py", f'echo "{prelude_lines[0]}" > $FILE']
    patch_steps += [f'echo "{line}" >> $FILE' for line in prelude_lines[1:]]
    patch_steps += [
        # Drop the original first line; the prelude already provides it.
        "sed -i '1d' examples/mnist/main.py",
        "cat examples/mnist/main.py >> $FILE",
        "rm examples/mnist/main.py",
        "mv $FILE examples/mnist/main.py",
    ]
    mnist_dataset_download_config = " " + " && ".join(patch_steps) + " "

    training_command = (
        "git clone https://github.com/pytorch/examples.git && "
        + mnist_dataset_download_config
        + " && python examples/mnist/main.py"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpus = 72
    cpu_limit = str(node_cpus / 2)

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": training_command,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, substitutions
    )

    training_result = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            if "Accuracy" not in pod_logs:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)
            else:
                training_result = True

        assert training_result, "Training failed"
    finally:
        # The pod is removed regardless of the outcome.
        run(f"kubectl delete pods {pod_name}")
def test_eks_mxnet_single_node_training(mxnet_training):
    """
    Function to create a pod using kubectl and given container image, and run MXNet training

    The incubator-mxnet repository is cloned at the ref matching the framework version
    found in the image URI. A ``<major>.<minor>.<patch>`` version is used as-is; if only
    ``<major>.<minor>`` is present, ``.0`` is appended. The test passes only if the pod
    logs contain "Epoch[19] Validation-accuracy". The pod is deleted whether or not the
    check succeeds.

    Args:
        :param mxnet_training: the ECR URI

    Raises:
        ValueError: if no version number can be found in ``mxnet_training``
    """
    training_result = False
    rand_int = random.randint(4001, 6000)

    # Inspect the match object before calling .group(): re.search returns None when
    # the URI has no three-part version, which is exactly when the fallback is needed.
    framework_version_search = re.search(r"\d+(\.\d+){2}", mxnet_training)
    if framework_version_search:
        framework_version = framework_version_search.group()
    else:
        framework_version_search = re.search(r"\d+\.\d+", mxnet_training)
        if framework_version_search is None:
            raise ValueError(
                f"Could not determine the MXNet version from image URI: {mxnet_training}"
            )
        framework_version = framework_version_search.group() + ".0"

    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-{rand_int}"

    args = (
        f"git clone -b {framework_version} https://github.com/apache/incubator-mxnet.git && python "
        f"/incubator-mxnet/example/image-classification/train_mnist.py"
    )

    processor_type = "gpu" if "gpu" in mxnet_training else "cpu"
    if processor_type == "gpu":
        args = args + " --gpus 0"

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    cpu_limit = str(int(cpu_limit) / 2)

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict
    )

    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            mxnet_out = run(f"kubectl logs {pod_name}").stdout
            if "Epoch[19] Validation-accuracy" in mxnet_out:
                training_result = True
            else:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(mxnet_out)

        assert training_result, "Training failed"
    finally:
        # Always remove the pod, even when the assertion above fails.
        run(f"kubectl delete pods {pod_name}")
def test_eks_pt_s3_plugin_single_node_training(pytorch_training, pt17_and_above_only):
    """
    Launch a single-node pod that runs the PyTorch S3 plugin ImageNet example and check it.

    The test is skipped for images whose framework version is below 1.8. Otherwise the
    amazon-s3-plugin-for-pytorch repository is cloned and ``s3_imagenet_example.py`` is
    run. The test passes only when "Acc" appears in the pod logs. The pod is deleted
    whether or not the check succeeds.

    Args:
        :param pytorch_training: the ECR URI
        :param pt17_and_above_only: not referenced in the body; presumably a pytest
            fixture gating on the PyTorch version (confirm against conftest)
    """
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(image_framework_version) < Version("1.8"):
        pytest.skip("S3 plugin is supported on PyTorch version >=1.8")

    pod_suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(
        os.sep, "tmp", f"pytorch_s3_single_node_training_{pod_suffix}.yaml"
    )
    pod_name = f"pytorch-s3-single-node-training-{pod_suffix}"

    training_command = " && ".join(
        [
            "git clone https://github.com/aws/amazon-s3-plugin-for-pytorch.git",
            "python amazon-s3-plugin-for-pytorch/examples/s3_imagenet_example.py",
        ]
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpus = 96
    cpu_limit = str(node_cpus / 2)

    if "gpu" in pytorch_training:
        training_command += " --gpu 0"

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": training_command,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, substitutions
    )

    training_result = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            if "Acc" not in pod_logs:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)
            else:
                training_result = True

        assert training_result, "Training failed"
    finally:
        # The pod is removed regardless of the outcome.
        run(f"kubectl delete pods {pod_name}")
def test_eks_tensorflow_single_node_training(tensorflow_training):
    """
    Launch a single-node TensorFlow training pod with kubectl and check the result.

    The keras repository is cloned, the imports in ``mnist_cnn.py`` are rewritten with
    ``sed`` to use ``tensorflow.keras``, and the example is run. The test passes only
    when "Test accuracy" appears in the pod logs. The pod is deleted whether or not the
    check succeeds.

    Args:
        :param tensorflow_training: the ECR URI
    """
    pod_suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(
        os.sep, "tmp", f"tensorflow_single_node_training_{pod_suffix}.yaml"
    )
    pod_name = f"tensorflow-single-node-training-{pod_suffix}"

    training_command = " && ".join(
        [
            "git clone https://github.com/fchollet/keras.git",
            "sed -i 's/import keras/from tensorflow import keras/g; "
            "s/from keras/from tensorflow.keras/g' /keras/examples/mnist_cnn.py",
            "python /keras/examples/mnist_cnn.py",
        ]
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpus = 72
    cpu_limit = str(node_cpus / 2)

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": tensorflow_training,
        "<ARGS>": training_command,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, substitutions
    )

    training_result = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            if "Test accuracy" not in pod_logs:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)
            else:
                training_result = True

        assert training_result, "Training failed"
    finally:
        # The pod is removed regardless of the outcome.
        run(f"kubectl delete pods {pod_name}")
def test_eks_pytorch_single_node_training(pytorch_training):
    """
    Launch a single-node PyTorch MNIST training pod with kubectl and check the result.

    The pytorch/examples repository is cloned and ``examples/mnist/main.py`` is rebuilt
    with a generated prelude. The prelude installs a urllib opener with a browser-like
    User-agent and, for torchvision older than 0.9.1, points ``datasets.MNIST.resources``
    at an S3 mirror. The test passes only when "Accuracy" appears in the pod logs.
    The pod is deleted whether or not the check succeeds.

    Args:
        :param pytorch_training: the ECR URI
    """
    pod_suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(
        os.sep, "tmp", f"pytorch_single_node_training_{pod_suffix}.yaml"
    )
    pod_name = f"pytorch-single-node-training-{pod_suffix}"

    # Workaround for https://github.com/pytorch/vision/issues/1938 and
    # https://github.com/pytorch/vision/issues/3549
    mirror = "https://dlinfra-mnist-dataset.s3-us-west-2.amazonaws.com/mnist"
    # Python lines written to the top of the patched main.py, in order.
    prelude_lines = [
        "from __future__ import print_function",
        "from six.moves import urllib",
        "from packaging.version import Version",
        "opener = urllib.request.build_opener()",
        "opener.addheaders = [('User-agent', 'Mozilla/5.0')]",
        "urllib.request.install_opener(opener)",
        "import torchvision",
        "from torchvision import datasets, transforms",
        "# from torchvision 0.9.1, 2 candidate mirror website links will be added"
        " before resources items automatically",
        "# Reference PR https://github.com/pytorch/vision/pull/3559",
        "TORCHVISION_VERSION = '0.9.1'",
        "if Version(torchvision.__version__) < Version(TORCHVISION_VERSION):",
        " datasets.MNIST.resources = [",
        f" ('{mirror}/train-images-idx3-ubyte.gz', 'f68b3c2dcbeaaa9fbdd348bbdeb94873'),",
        f" ('{mirror}/train-labels-idx1-ubyte.gz', 'd53e105ee54ea40749a09fcbcd1e9432'),",
        f" ('{mirror}/t10k-images-idx3-ubyte.gz', '9fb629c4189551a2d022fa330f9573f3'),",
        f" ('{mirror}/t10k-labels-idx1-ubyte.gz', 'ec29112dd5afa0611ce80d1b7f02629c')",
        " ]",
    ]
    patch_steps = ["FILE=new_main.py", f'echo "{prelude_lines[0]}" > $FILE']
    patch_steps += [f'echo "{line}" >> $FILE' for line in prelude_lines[1:]]
    patch_steps += [
        # Two lines of the upstream script are deleted before it is appended.
        "sed -i '1d' examples/mnist/main.py",
        "sed -i '6d' examples/mnist/main.py",
        "cat examples/mnist/main.py >> $FILE",
        "rm examples/mnist/main.py",
        "mv $FILE examples/mnist/main.py",
    ]
    mnist_dataset_download_config = " " + " && ".join(patch_steps) + " "

    training_command = (
        "git clone https://github.com/pytorch/examples.git && "
        + mnist_dataset_download_config
        + " && python examples/mnist/main.py"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpus = 72
    cpu_limit = str(node_cpus / 2)

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": training_command,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, substitutions
    )

    training_result = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            if "Accuracy" not in pod_logs:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)
            else:
                training_result = True

        assert training_result, "Training failed"
    finally:
        # The pod is removed regardless of the outcome.
        run(f"kubectl delete pods {pod_name}")
def test_eks_pytorch_dgl_single_node_training(pytorch_training, py3_only):
    """
    Launch a single-node DGL training pod (PyTorch backend) with kubectl and check it.

    The test is skipped for PyTorch 1.6 images built for cu110 and for framework
    versions 1.10 and above. DGL is cloned at branch 0.4.x for frameworks below 1.7 and
    0.7.x otherwise, and the GCN example is trained on the cora dataset. The test passes
    only when "Test accuracy" appears in the pod logs. The pod is deleted whether or not
    the check succeeds.

    Args:
        :param pytorch_training: the ECR URI
        :param py3_only: not referenced in the body; presumably a pytest fixture that
            restricts the test to Python 3 images (confirm against conftest)
    """
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    image_cuda_version = get_cuda_version_from_tag(pytorch_training)
    framework = Version(image_framework_version)

    if framework == Version("1.6") and image_cuda_version == "cu110":
        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")
    # TODO: Remove when DGL gpu test on ecs get fixed
    if framework >= Version("1.10"):
        pytest.skip("ecs test for DGL gpu fails since pt 1.10")

    pod_suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(
        os.sep, "tmp", f"pytorch_single_node_training_dgl_{pod_suffix}.yaml"
    )
    pod_name = f"pytorch-single-node-training-dgl-{pod_suffix}"

    older_framework = is_below_framework_version("1.7", pytorch_training, "pytorch")
    dgl_branch = "0.4.x" if older_framework else "0.7.x"

    training_command = " && ".join(
        [
            f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git",
            "cd /dgl/examples/pytorch/gcn/",
            "DGLBACKEND=pytorch python train.py --dataset cora",
        ]
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpus = 72
    cpu_limit = str(node_cpus / 2)

    training_command += " --gpu 0" if "gpu" in pytorch_training else " --gpu -1"

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": training_command,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, substitutions
    )

    training_result = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout
            if "Test accuracy" not in pod_logs:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(pod_logs)
            else:
                training_result = True

        assert training_result, "Training failed"
    finally:
        # The pod is removed regardless of the outcome.
        run(f"kubectl delete pods {pod_name}")
def test_eks_mxnet_gluonnlp_single_node_training(mxnet_training, py3_only):
    """
    Launch a single-node GluonNLP training pod with kubectl and check the test accuracy.

    gluon-nlp is cloned and checked out at v0.9.0, and the sentiment-analysis CNN script
    is run on the TREC dataset. The test passes only when the pod logs contain
    "test acc <number>" with a value of at least 0.75. The pod is deleted whether or not
    the check succeeds.

    Args:
        :param mxnet_training: the ECR URI
        :param py3_only: not referenced in the body; presumably a pytest fixture that
            restricts the test to Python 3 images (confirm against conftest)
    """
    pod_suffix = random.randint(4001, 6000)
    manifest_path = os.path.join(
        os.sep, "tmp", f"mxnet_single_node_training_gluonnlp_{pod_suffix}.yaml"
    )
    pod_name = f"mxnet-single-node-training-gluonnlp-{pod_suffix}"

    # The missing spaces after some "&&" are kept so the command text is unchanged.
    training_command = "".join(
        [
            "git clone -b master https://github.com/dmlc/gluon-nlp.git && ",
            "cd gluon-nlp && git checkout v0.9.0 &&",
            "cd ./scripts/sentiment_analysis/ &&",
            "python sentiment_analysis_cnn.py --batch_size 50 --epochs 20 --dropout 0.5 ",
            "--model_mode multichannel --data_name TREC",
        ]
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    node_cpus = 72
    cpu_limit = str(node_cpus / 2)

    if "gpu" in mxnet_training:
        training_command += " --gpu 0"

    substitutions = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": training_command,
        "<CPU_LIMIT>": cpu_limit,
    }
    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, manifest_path, substitutions
    )

    training_result = False
    try:
        run(f"kubectl create -f {manifest_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            pod_logs = run(f"kubectl logs {pod_name}").stdout

            accuracy_match = re.search(r"test acc ((?:\d*\.\d+)|\d+)", pod_logs)
            if accuracy_match is not None:
                accuracy = float(accuracy_match.group(1))

                if accuracy < 0.75:
                    eks_utils.LOGGER.info(
                        f"GluonNLP EKS test FAILED with accuracy {accuracy} < 0.75"
                    )
                    eks_utils.LOGGER.debug(pod_logs)
                else:
                    eks_utils.LOGGER.info(
                        f"GluonNLP EKS test succeeded with accuracy {accuracy} >= 0.75"
                    )
                    training_result = True

        assert training_result, "Training failed"
    finally:
        # The pod is removed regardless of the outcome.
        run(f"kubectl delete pods {pod_name}")
def test_eks_mxnet_dgl_single_node_training(mxnet_training, py3_only):
    """
    Function to create a pod using kubectl and given container image, and run DGL training
    with MXNet backend

    The test is skipped for MXNet 1.9.0 and above. A local docker container is started
    from the image to read the installed DGL version, and DGL is then cloned at the
    matching ``<major>.<minor>.x`` branch. The GCN example is trained on the cora dataset
    and the test passes only if the pod logs contain "Test accuracy". Both the helper
    container and the pod are removed whether or not the check succeeds.

    Args:
        :param mxnet_training: the ECR URI
        :param py3_only: not referenced in the body; presumably a pytest fixture that
            restricts the test to Python 3 images (confirm against conftest)

    Raises:
        ValueError: if the DGL version reported by the image does not start with
            ``<major>.<minor>``
    """
    # TODO: remove/update this when DGL supports MXNet 1.9
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    if Version(framework_version) >= Version("1.9.0"):
        pytest.skip("Skipping DGL tests as DGL does not yet support MXNet 1.9")

    training_result = False
    rand_int = random.randint(4001, 6000)

    yaml_path = os.path.join(os.sep, "tmp", f"mxnet_single_node_training_dgl_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-dgl-{rand_int}"

    ctx = Context()
    # Run container to determine dgl version
    container_name = get_container_name("dgl-mx", mxnet_training)
    try:
        ctx.run(f"docker run --name {container_name} -itd {mxnet_training}")
        dgl_version = ctx.run(
            f"docker exec --user root {container_name} python -c 'import dgl; print(dgl.__version__)'"
        ).stdout.strip()
    finally:
        # The helper container is only needed for the version probe; remove it so it
        # does not keep running on the test host. warn=True keeps a cleanup failure
        # from masking the real error.
        ctx.run(f"docker rm -f {container_name}", warn=True, hide=True)

    # Dots are escaped so only a real "<major>.<minor>" prefix matches, and a version
    # without a patch component (e.g. "0.9") is accepted as well.
    dgl_version_match = re.match(r"(\d+\.\d+)", dgl_version)
    if dgl_version_match is None:
        raise ValueError(f"Could not parse DGL version from: {dgl_version!r}")
    dgl_major_minor = dgl_version_match.group(1)
    dgl_branch = f"{dgl_major_minor}.x"

    args = (
        f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git && "
        f"cd /dgl/examples/mxnet/gcn/ && DGLBACKEND=mxnet python train.py --dataset cora"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    cpu_limit = str(int(cpu_limit) / 2)

    if "gpu" in mxnet_training:
        args = args + " --gpu 0"
    else:
        args = args + " --gpu -1"

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path, search_replace_dict
    )

    try:
        run(f"kubectl create -f {yaml_path}")

        if eks_utils.is_eks_training_complete(pod_name):
            dgl_out = run(f"kubectl logs {pod_name}").stdout
            if "Test accuracy" in dgl_out:
                training_result = True
            else:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(dgl_out)

        assert training_result, "Training failed"
    finally:
        # Always remove the pod, even when the assertion above fails.
        run(f"kubectl delete pods {pod_name}")