Example #1
def test_ecs_tensorflow_inference_gpu(tensorflow_inference,
                                      ecs_container_instance, region,
                                      gpu_only):
    worker_instance_id, ecs_cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(worker_instance_id,
                                                region=region)
    num_gpus = ec2_utils.get_instance_num_gpus(worker_instance_id)

    model_name = "saved_model_half_plus_two"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            tensorflow_inference,
            "tensorflow",
            ecs_cluster_arn,
            model_name,
            worker_instance_id,
            num_gpus=num_gpus,
            region=region)
        model_name = get_tensorflow_model_name("gpu", model_name)
        inference_result = request_tensorflow_inference(
            model_name, ip_address=public_ip_address)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"

    finally:
        ecs_utils.tear_down_ecs_inference_service(ecs_cluster_arn,
                                                  service_name, task_family,
                                                  revision)
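The assertion above only checks that `request_tensorflow_inference` returned a truthy result. Purely as an assumption about what that helper boils down to, a request against the standard TensorFlow Serving REST API (port 8501, `/v1/models/<model>:predict`) might look like the following sketch; the function name, port, and payload here are illustrative, not the helper's actual implementation:

import requests

def request_tensorflow_inference_sketch(model_name, ip_address, port=8501):
    # Assumed TF Serving REST endpoint: /v1/models/<model>:predict
    url = f"http://{ip_address}:{port}/v1/models/{model_name}:predict"
    # saved_model_half_plus_two computes y = 0.5 * x + 2 for each input
    payload = {"instances": [1.0, 2.0, 5.0]}
    response = requests.post(url, json=payload, timeout=30)
    response.raise_for_status()
    # Expected response shape: {"predictions": [2.5, 3.0, 4.5]}
    return response.json().get("predictions") is not None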
Example #2
def test_ecs_pytorch_training_dgl_gpu(gpu_only, py3_only,
                                      ecs_container_instance, pytorch_training,
                                      training_cmd, ecs_cluster_name):
    """
    GPU DGL test for PyTorch Training

    Instance Type - p3.8xlarge

    DGL is only supported in py3, hence the "py3_only" fixture is used to ensure py2 images do not run this test.

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    _, image_framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    image_cuda_version = get_cuda_version_from_tag(pytorch_training)
    if Version(image_framework_version) == Version(
            "1.6") and image_cuda_version == "cu110":
        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")
    # TODO: Remove when the DGL GPU test on ECS gets fixed
    if Version(image_framework_version) >= Version(
            "1.10") and image_cuda_version == "cu113":
        pytest.skip("ECS test for DGL GPU fails since PyTorch 1.10")

    instance_id, cluster_arn = ecs_container_instance

    num_gpus = ec2_utils.get_instance_num_gpus(instance_id)

    ecs_utils.ecs_training_test_executor(ecs_cluster_name,
                                         cluster_arn,
                                         training_cmd,
                                         pytorch_training,
                                         instance_id,
                                         num_gpus=num_gpus)
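The docstring above outlines the flow delegated to `ecs_utils.ecs_training_test_executor`: register a task definition named after the test, run the task, and wait for it to stop before teardown. A minimal boto3 sketch of that flow, assuming the EC2 launch type and simplified memory/GPU settings (the helper's real signature and behavior may differ):

import boto3

def run_training_task_sketch(cluster_arn, image_uri, training_cmd, family, num_gpus, region):
    ecs = boto3.client("ecs", region_name=region)
    # Register a task definition whose family is named after the test
    task_def = ecs.register_task_definition(
        family=family,
        containerDefinitions=[{
            "name": family,
            "image": image_uri,
            "command": ["/bin/bash", "-c", training_cmd],
            "memory": 4096,  # assumed container memory limit
            "resourceRequirements": [{"type": "GPU", "value": str(num_gpus)}],
        }],
    )
    revision = task_def["taskDefinition"]["revision"]
    # Run the task and block until it stops
    task_arn = ecs.run_task(cluster=cluster_arn,
                            taskDefinition=f"{family}:{revision}")["tasks"][0]["taskArn"]
    ecs.get_waiter("tasks_stopped").wait(cluster=cluster_arn, tasks=[task_arn])
    return task_arn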
def test_ecs_pytorch_inference_gpu(pytorch_inference, ecs_container_instance,
                                   region, gpu_only):
    worker_instance_id, ecs_cluster_arn = ecs_container_instance
    public_ip_address = ec2_utils.get_public_ip(worker_instance_id,
                                                region=region)
    num_gpus = ec2_utils.get_instance_num_gpus(worker_instance_id,
                                               region=region)

    model_name = "pytorch-densenet"
    service_name = task_family = revision = None
    try:
        service_name, task_family, revision = ecs_utils.setup_ecs_inference_service(
            pytorch_inference,
            "pytorch",
            ecs_cluster_arn,
            model_name,
            worker_instance_id,
            num_gpus=num_gpus,
            region=region)
        inference_result = request_pytorch_inference_densenet(
            public_ip_address)
        assert inference_result, f"Failed to perform inference at IP address: {public_ip_address}"

    finally:
        ecs_utils.tear_down_ecs_inference_service(ecs_cluster_arn,
                                                  service_name, task_family,
                                                  revision)
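As with the TensorFlow example, `request_pytorch_inference_densenet` is only asserted to be truthy. A hedged sketch of what such a request might look like, assuming a TorchServe-style endpoint on port 8080; the endpoint, port, and image path are assumptions for illustration, not the helper's actual implementation:

import requests

def request_densenet_inference_sketch(ip_address, model_name="pytorch-densenet", port=8080):
    # Assumed TorchServe-style endpoint: /predictions/<model_name>
    url = f"http://{ip_address}:{port}/predictions/{model_name}"
    # "test_image.jpg" is a placeholder for a local test image
    with open("test_image.jpg", "rb") as image_file:
        response = requests.post(url, data=image_file, timeout=60)
    response.raise_for_status()
    # A successful response contains class scores for the image
    return bool(response.json())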
def test_ecs_mxnet_training_gluonnlp_gpu(gpu_only, py3_only,
                                         ecs_container_instance,
                                         mxnet_training, training_cmd,
                                         ecs_cluster_name):
    """
    GPU Gluon NLP test for MXNet Training

    Instance Type - p2.16xlarge

    GluonNLP is only supported in py3, hence the "py3_only" fixture is used to ensure py2 images do not run this test.

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    instance_id, cluster_arn = ecs_container_instance

    num_gpus = ec2_utils.get_instance_num_gpus(instance_id)

    ecs_utils.ecs_training_test_executor(ecs_cluster_name,
                                         cluster_arn,
                                         training_cmd,
                                         mxnet_training,
                                         instance_id,
                                         num_gpus=num_gpus)
Example #5
def test_ecs_pytorch_s3_plugin_training_gpu(gpu_only, ecs_container_instance,
                                            pytorch_training, training_cmd,
                                            ecs_cluster_name,
                                            pt17_and_above_only):
    """
    GPU resnet18 test for PyTorch Training using S3 plugin

    Instance Type - p3.8xlarge

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    _, image_framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    if Version(image_framework_version) < Version("1.8"):
        pytest.skip("S3 plugin is supported on PyTorch version >=1.8")
    instance_id, cluster_arn = ecs_container_instance

    num_gpus = ec2_utils.get_instance_num_gpus(instance_id)

    ecs_utils.ecs_training_test_executor(ecs_cluster_name,
                                         cluster_arn,
                                         training_cmd,
                                         pytorch_training,
                                         instance_id,
                                         num_gpus=num_gpus)
def test_eks_tensorflow_multi_node_training_gpu(tensorflow_training, example_only):
    eks_cluster_size = "3"                                                        
    ec2_instance_type = "p3.16xlarge"

    eks_gpus_per_worker = ec2_utils.get_instance_num_gpus(instance_type=ec2_instance_type)

    _run_eks_tensorflow_multinode_training_resnet50_mpijob(tensorflow_training, eks_cluster_size, eks_gpus_per_worker)
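All of these examples lean on `ec2_utils.get_instance_num_gpus` to size GPU counts (for example, 8 GPUs for p3.16xlarge). One plausible implementation, assuming the helper is allowed to call the EC2 DescribeInstanceTypes API; the real helper could just as well use a static lookup table:

import boto3

def get_instance_num_gpus_sketch(instance_type, region="us-west-2"):
    """Return the number of GPUs on an EC2 instance type, e.g. 8 for p3.16xlarge."""
    ec2 = boto3.client("ec2", region_name=region)
    response = ec2.describe_instance_types(InstanceTypes=[instance_type])
    gpu_info = response["InstanceTypes"][0].get("GpuInfo", {})
    # Sum across GPU entries; non-GPU instance types have no GpuInfo block
    return sum(gpu["Count"] for gpu in gpu_info.get("Gpus", []))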
def test_eks_mxnet_multi_node_training_horovod_mnist(mxnet_training, example_only):
    """
    Run MXNet distributed training on EKS using docker images with MNIST dataset (horovod)
    """
    eks_cluster_size = "3"
    ec2_instance_type = "p3.16xlarge"

    eks_gpus_per_worker = ec2_utils.get_instance_num_gpus(instance_type=ec2_instance_type)
    
    _run_eks_mxnet_multinode_training_horovod_mpijob(mxnet_training, eks_cluster_size, eks_gpus_per_worker)
Example #8
def test_eks_tensorflow_multi_node_training_gpu(tensorflow_training,
                                                example_only):
    eks_cluster_size = 3
    ec2_instance_type = "p3.16xlarge"
    cluster_name = eks_utils.PR_EKS_CLUSTER_NAME_TEMPLATE.format("tensorflow")

    assert eks_utils.is_eks_cluster_active(
        cluster_name), f"EKS Cluster {cluster_name} is inactive. Exiting test"

    eks_gpus_per_worker = ec2_utils.get_instance_num_gpus(
        instance_type=ec2_instance_type)

    _run_eks_tensorflow_multinode_training_resnet50_mpijob(
        tensorflow_training, eks_cluster_size, eks_gpus_per_worker)
def test_eks_mxnet_multi_node_training_horovod_mnist(mxnet_training,
                                                     example_only):
    """Run MXNet distributed training on EKS using docker images with MNIST dataset"""

    ctx = Context()

    eks_cluster_size = 3
    ec2_instance_type = "p3.16xlarge"
    cluster_name = eks_utils.PR_EKS_CLUSTER_NAME_TEMPLATE.format("mxnet")

    assert eks_utils.is_eks_cluster_active(
        cluster_name), f"EKS Cluster {cluster_name} is inactive. Exiting test"

    eks_gpus_per_worker = ec2_utils.get_instance_num_gpus(
        instance_type=ec2_instance_type)

    LOGGER.info(
        "Starting run_eks_mxnet_multi_node_training on MNIST dataset using horovod"
    )
    LOGGER.info("The test will run on an example image %s", mxnet_training)

    user = ctx.run("echo $USER").stdout.strip("\n")
    random.seed(
        f"{mxnet_training}-{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}"
    )
    unique_tag = f"{user}-{random.randint(1, 10000)}"

    namespace = f"mx-multi-node-train-{'py2' if 'py2' in mxnet_training else 'py3'}-{unique_tag}"
    app_name = f"kubeflow-mxnet-hvd-mpijob-{unique_tag}"
    job_name = f"mxnet-mnist-horovod-job-{unique_tag}"

    command_to_run = "mpirun,-mca,btl_tcp_if_exclude,lo,-mca,pml,ob1,-mca,btl,^openib,--bind-to,none,-map-by,slot," \
                     "-x,LD_LIBRARY_PATH,-x,PATH,-x,NCCL_SOCKET_IFNAME=eth0,-x,NCCL_DEBUG=INFO,python," \
                     "/horovod/examples/mxnet_mnist.py"
    args_to_pass = "******"
    home_dir = ctx.run("echo $HOME").stdout.strip("\n")
    path_to_ksonnet_app = os.path.join(
        home_dir, f"mxnet_multi_node_hvd_eks_test-{unique_tag}")

    LOGGER.debug(f"Namespace: {namespace}")

    # return training_result
    result = _run_eks_multi_node_training_mpijob(namespace, app_name,
                                                 mxnet_training, job_name,
                                                 command_to_run, args_to_pass,
                                                 path_to_ksonnet_app,
                                                 eks_cluster_size,
                                                 eks_gpus_per_worker)

    return result
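In the example above, `command_to_run` is a single comma-separated string rather than a list. Assuming the downstream MPIJob templating simply splits on commas (an assumption, since that code is not shown here), the launcher container's argv can be recovered like this:

command_to_run = ("mpirun,-mca,btl_tcp_if_exclude,lo,-mca,pml,ob1,-mca,btl,^openib,"
                  "--bind-to,none,-map-by,slot,-x,LD_LIBRARY_PATH,-x,PATH,"
                  "-x,NCCL_SOCKET_IFNAME=eth0,-x,NCCL_DEBUG=INFO,python,"
                  "/horovod/examples/mxnet_mnist.py")

# Splitting on commas yields the argv list for the launcher container
container_command = command_to_run.split(",")
assert container_command[0] == "mpirun"
assert container_command[-1] == "/horovod/examples/mxnet_mnist.py"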
def test_ecs_mxnet_training_mnist_gpu(gpu_only, ecs_container_instance, mxnet_training, training_cmd, ecs_cluster_name):
    """
    GPU mnist test for MXNet Training

    Instance Type - p2.8xlarge

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    instance_id, cluster_arn = ecs_container_instance

    num_gpus = ec2_utils.get_instance_num_gpus(instance_id)

    ecs_utils.ecs_training_test_executor(ecs_cluster_name, cluster_arn, training_cmd, mxnet_training, instance_id,
                                         num_gpus=num_gpus)
    def handle_single_gpu_instances_test_report(self,
                                                function_key,
                                                function_keywords,
                                                processor="gpu"):
        """
        Generally, we do not want tests running on single gpu instances. However, there are exceptions to this rule.
        This method determines whether report generation needs to raise an error, based on whether the given
        test function runs on a single-gpu instance.

        :param function_key: local/path/to/function::function_name
        :param function_keywords: string of keywords associated with the test function
        :param processor: whether the test is for cpu, gpu or both
        :return: processor if not a single-gpu instance, else "single_gpu"; failure messages are recorded in self.failure_conditions
        """

        # Define conditions where we allow a test function to run with a single gpu instance
        whitelist_single_gpu = False
        allowed_single_gpu = self.ALLOWED_SINGLE_GPU_TESTS

        # Regex in order to determine the gpu instance type
        gpu_instance_pattern = re.compile(r"\w+\.\d*xlarge")
        gpu_match = gpu_instance_pattern.search(function_keywords)

        if gpu_match:
            instance_type = gpu_match.group()
            num_gpus = get_instance_num_gpus(instance_type=instance_type)

            for test in allowed_single_gpu:
                if test in function_key:
                    whitelist_single_gpu = True
                    break
            if num_gpus == 1:
                processor = "single_gpu"
                if not whitelist_single_gpu:
                    single_gpu_failure_message = (
                        f"Function uses single-gpu instance type {instance_type}. Please use multi-gpu instance type "
                        f"or add test to ALLOWED_SINGLE_GPU_TESTS. "
                        f"Current allowed tests: {self.ALLOWED_SINGLE_GPU_TESTS}"
                    )
                    if not self.failure_conditions.get(function_key):
                        self.failure_conditions[function_key] = [
                            single_gpu_failure_message
                        ]
                    else:
                        self.failure_conditions[function_key].append(
                            single_gpu_failure_message)

        return processor
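To make the gating logic in `handle_single_gpu_instances_test_report` concrete, here is a self-contained sketch using the same regex; the keyword string, allowlist, and GPU count are hypothetical values chosen for illustration:

import re

# Same pattern as above: matches instance types such as "p3.2xlarge" or "g4dn.xlarge"
gpu_instance_pattern = re.compile(r"\w+\.\d*xlarge")

function_keywords = "gpu sanity p3.2xlarge us-west-2"   # hypothetical keyword string
instance_type = gpu_instance_pattern.search(function_keywords).group()  # "p3.2xlarge"

num_gpus = 1  # p3.2xlarge has a single V100, so the report would flag it
allowed_single_gpu_tests = ("test_telemetry",)           # hypothetical allowlist
function_key = "dlc_tests/sanity/test_gpu.py::test_something"

if num_gpus == 1 and not any(test in function_key for test in allowed_single_gpu_tests):
    print(f"Function uses single-gpu instance type {instance_type}; "
          f"use a multi-gpu instance type or add the test to the allowlist.")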
def test_eks_tensorflow_multi_node_training_gpu(tensorflow_training,
                                                example_only):
    # EKS multi-node tests are failing on the TF1 pipeline due to scheduling issues.
    # TODO: Remove this line and add the required scheduling scheme.
    if is_tf1(tensorflow_training):
        pytest.skip(
            "Skipping on TF1 for now because pod scheduling does not work properly"
        )
    eks_cluster_size = "3"
    ec2_instance_type = "p3.16xlarge"

    eks_gpus_per_worker = ec2_utils.get_instance_num_gpus(
        instance_type=ec2_instance_type)

    _run_eks_tensorflow_multinode_training_resnet50_mpijob(
        tensorflow_training, eks_cluster_size, eks_gpus_per_worker)
def test_ecs_pytorch_s3_plugin_training_gpu(gpu_only, ecs_container_instance,
                                            pytorch_training, training_cmd,
                                            ecs_cluster_name):
    """
    GPU resnet18 test for PyTorch Training using S3 plugin

    Instance Type - p3.8xlarge

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    instance_id, cluster_arn = ecs_container_instance

    num_gpus = ec2_utils.get_instance_num_gpus(instance_id)

    ecs_utils.ecs_training_test_executor(ecs_cluster_name,
                                         cluster_arn,
                                         training_cmd,
                                         pytorch_training,
                                         instance_id,
                                         num_gpus=num_gpus)
def test_ecs_mxnet_training_dgl_gpu(gpu_only, py3_only, ecs_container_instance, mxnet_training, training_cmd,
                                    ecs_cluster_name):
    """
    GPU DGL test for MXNet Training

    Instance Type - p2.xlarge

    DGL is only supported in py3, hence the "py3_only" fixture is used to ensure py2 images do not run this test.

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    # TODO: remove/update this when DGL supports MXNet 1.9
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    if Version(framework_version) >= Version('1.9.0'):
        pytest.skip("Skipping DGL tests as DGL does not yet support MXNet 1.9")
    instance_id, cluster_arn = ecs_container_instance

    num_gpus = ec2_utils.get_instance_num_gpus(instance_id)

    ecs_utils.ecs_training_test_executor(ecs_cluster_name, cluster_arn, training_cmd, mxnet_training, instance_id,
                                         num_gpus=num_gpus)