def test_smdebug_gpu(training, ec2_connection, region, ec2_instance_type,
                     gpu_only, py3_only):
    """
    Invoke run_smdebug_test for a GPU training image using nvidia-docker.

    The test is skipped when the image is incompatible with the instance type. TF1 images are skipped
    outside the nightly context; in the nightly context they run with a 7200 timeout instead of 2400.

    :param training: training image URI under test
    :param ec2_connection: passed through to run_smdebug_test
    :param region: passed through to run_smdebug_test
    :param ec2_instance_type: instance type, used for the compatibility check and passed through
    :param gpu_only: pytest fixture, not referenced in the body
    :param py3_only: pytest fixture, not referenced in the body
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}")

    running_tf1 = is_tf_version("1", training)
    if running_tf1 and not is_nightly_context():
        pytest.skip("TF1 gpu smdebug tests can take up to 2 hours, thus we are only running in nightly context")

    # Reaching this point with a TF1 image implies the nightly context, which gets the longer timeout
    run_smdebug_test(
        training,
        ec2_connection,
        region,
        ec2_instance_type,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
        timeout=7200 if running_tf1 else 2400,
    )
def test_sm_profiler_tf(tensorflow_training):
    """
    Download the sagemaker-tests archive and run the TensorFlow profiler tests for an image.

    Skipped for TF1 images and for any processor other than cpu or gpu.

    :param tensorflow_training: TensorFlow training image URI under test
    """
    if is_tf_version("1", tensorflow_training):
        pytest.skip(
            "Skipping test on TF1, since there are no smprofiler config files for TF1"
        )

    processor = get_processor_from_image_uri(tensorflow_training)
    supported_processors = ("cpu", "gpu")
    if processor not in supported_processors:
        pytest.skip(f"Processor {processor} not supported. Skipping test.")

    shell = Context()

    # Working directory: $CODEBUILD_SRC_DIR/<container name>/smprofiler_tests
    source_dir = os.getenv("CODEBUILD_SRC_DIR")
    container_dir = get_container_name("smprof", tensorflow_training)
    work_dir = os.path.join(source_dir, container_dir, "smprofiler_tests")
    shell.run(f"mkdir -p {work_dir}", hide=True)

    # Copy the archive from the location named by SMPROFILER_TESTS_BUCKET, then unzip it in place
    archive = "sagemaker-tests.zip"
    bucket = os.getenv("SMPROFILER_TESTS_BUCKET")
    shell.run(f"aws s3 cp {bucket}/{archive} {work_dir}/{archive}", hide=True)
    shell.run(f"cd {work_dir} && unzip {archive}", hide=True)

    # Append tensorflow-datasets to the requirements file of the unpacked TF scripts
    requirements_file = f"{work_dir}/sagemaker-tests/tests/scripts/tf_scripts/requirements.txt"
    shell.run(f"echo 'tensorflow-datasets==4.0.1' >> {requirements_file}", hide=True)

    run_sm_profiler_tests(tensorflow_training, work_dir, "test_profiler_tensorflow.py", processor)
def framework_version_within_limit(metafunc_obj, image):
    """
    Check an image against the TensorFlow / MXNet version fixtures requested by a test function.

    :param metafunc_obj: pytest metafunc object whose fixturenames are inspected
    :param image: Image URI for which the validation must be performed
    :return: False if a requested version fixture is not satisfied by the image, else True
    """
    framework, _ = get_framework_and_version_from_tag(image)

    def requested(fixture_name):
        return fixture_name in metafunc_obj.fixturenames

    if framework == "tensorflow":
        tf2_failed = requested("tf2_only") and not is_tf_version("2", image)
        # Every minimum-version fixture is evaluated, in this order, before the results are combined
        tf_minimums = (
            ("tf24_and_above_only", "2.4"),
            ("tf23_and_above_only", "2.3"),
            ("tf21_and_above_only", "2.1"),
        )
        below_minimum = [
            requested(fixture_name) and is_below_tf_version(minimum, image)
            for fixture_name, minimum in tf_minimums
        ]
        if tf2_failed or any(below_minimum):
            return False
    elif framework == "mxnet":
        if requested("mx18_and_above_only") and is_below_mxnet_version("1.8", image):
            return False
    return True
示例#4
0
def ec2_performance_tensorflow_inference(image_uri, processor, ec2_connection,
                                         region, threshold):
    """
    Run the TF serving benchmark script on the EC2 host and validate its log.

    Logs in to ECR, pulls the image, installs the client packages with pip3, runs
    tf<major>_serving_perf.py with its output tee'd to a log file, and hands that log to
    ec2_performance_upload_result_to_s3_and_validate.

    :param image_uri: inference image URI under test
    :param processor: "gpu" selects nvidia-docker; any other value selects docker
    :param ec2_connection: object whose run() executes the shell commands
    :param region: region used for the ECR login
    :param threshold: passed through to ec2_performance_upload_result_to_s3_and_validate
    """
    if processor == "gpu":
        docker_cmd = "nvidia-docker"
    else:
        docker_cmd = "docker"
    scripts_dir = os.path.join("$HOME", "container_tests")
    if is_tf_version("1", image_uri):
        tf_major, serving_api_version = "1", "1.15"
    else:
        tf_major, serving_api_version = "2", "2.3.0"

    # Make sure we are logged into ECR so we can pull the image
    ecr_login = f"$(aws ecr get-login --no-include-email --region {region})"
    ec2_connection.run(ecr_login, hide=True)

    ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ")

    # Install the client-side packages used by the benchmark script
    ec2_connection.run("pip3 install -U pip")
    ec2_connection.run(
        f"pip3 install boto3 grpcio tensorflow-serving-api=={serving_api_version} "
        f"--user --no-warn-script-location"
    )

    # Log file name combines the resolved commit id (from the environment) and a timestamp
    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    log_file = f"synthetic_{commit_id}_{timestamp}.log"

    # Run performance inference command, display benchmark results to console
    benchmark_script = f"{scripts_dir}/bin/benchmark/tf{tf_major}_serving_perf.py"
    benchmark_cmd = (
        f"python3 {benchmark_script} "
        f"--processor {processor} --docker_image_name {image_uri} --run_all_s3 "
        f"--binary /usr/bin/tensorflow_model_server --get_perf --iterations 1000 "
        f"2>&1 | tee {log_file}"
    )
    ec2_connection.run(benchmark_cmd)

    ec2_performance_upload_result_to_s3_and_validate(
        ec2_connection,
        image_uri,
        log_file,
        "synthetic",
        threshold,
        post_process_inference,
        log_file,
    )
示例#5
0
def test_performance_ec2_tensorflow_inference_cpu(tensorflow_inference,
                                                  ec2_connection, region,
                                                  cpu_only):
    """
    Run the TensorFlow inference performance benchmark on CPU.

    :param tensorflow_inference: TensorFlow inference image URI under test
    :param ec2_connection: passed through to ec2_performance_tensorflow_inference
    :param region: passed through to ec2_performance_tensorflow_inference
    :param cpu_only: pytest fixture, not referenced in the body
    """
    # Pick the CPU threshold constant that matches the image's TF major version
    if is_tf_version("1", tensorflow_inference):
        cpu_threshold = TENSORFLOW1_INFERENCE_CPU_THRESHOLD
    else:
        cpu_threshold = TENSORFLOW2_INFERENCE_CPU_THRESHOLD
    ec2_performance_tensorflow_inference(
        tensorflow_inference, "cpu", ec2_connection, region, cpu_threshold)
def test_tensorflow_with_horovod_gpu(tensorflow_training, ec2_instance_type, ec2_connection, gpu_only, tf2_only):
    """
    Run the Horovod training command for a TensorFlow image on a GPU instance.

    :param tensorflow_training: TensorFlow training image URI under test
    :param ec2_instance_type: instance type; p2.8xlarge and g3.16xlarge prefixes enable large_shm
    :param ec2_connection: passed to execute_ec2_training_test as the connection
    :param gpu_only: pytest fixture, not referenced in the body
    :param tf2_only: pytest fixture, not referenced in the body
    """
    if is_tf_version("1", tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD

    # re.match anchors at the start of the string, so this is a prefix test on the instance type
    needs_large_shm = re.match(r"(p2\.8xlarge)|(g3\.16xlarge)", ec2_instance_type) is not None
    execute_ec2_training_test(
        connection=ec2_connection,
        ecr_uri=tensorflow_training,
        test_cmd=hvd_cmd,
        large_shm=needs_large_shm
    )
def get_framework_from_image_uri(image_uri):
    """
    Return the framework name found in an image URI.

    The first of "tensorflow", "mxnet", "pytorch" that occurs as a substring of the URI wins. A TensorFlow
    image for which is_tf_version("2", ...) holds is reported as "tensorflow2".

    :param image_uri: image URI to inspect
    :return: "tensorflow", "tensorflow2", "mxnet" or "pytorch"
    :raises RuntimeError: if none of the framework names occurs in the URI
    """
    frameworks = ("tensorflow", "mxnet", "pytorch")
    matched = next((name for name in frameworks if name in image_uri), None)
    if matched is None:
        raise RuntimeError(f"Could not find any framework {frameworks} in {image_uri}")
    if matched == "tensorflow" and is_tf_version("2", image_uri):
        return "tensorflow2"
    return matched
def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type):
    """
    Run CURAND_CMD against a training image on a GPU instance.

    Skipped when the image is incompatible with the instance type, and for TF1 and MXNet images.

    :param training: training image URI under test
    :param ec2_connection: passed through to execute_ec2_training_test
    :param gpu_only: pytest fixture, not referenced in the body
    :param ec2_instance_type: instance type used for the compatibility check
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}")

    not_configured = is_tf_version("1", training) or "mxnet" in training
    if not_configured:
        pytest.skip("Test is not configured for TF1 and MXNet")

    execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
示例#9
0
def test_tensorflow_with_horovod_gpu(tensorflow_training, ec2_instance_type, ec2_connection, gpu_only, tf2_only):
    """
    Run the Horovod training command, with the instance type appended, for a TensorFlow image.

    Skipped when the image is incompatible with the instance type.

    :param tensorflow_training: TensorFlow training image URI under test
    :param ec2_instance_type: instance type; appended to the command and used to decide large_shm
    :param ec2_connection: passed to execute_ec2_training_test as the connection
    :param gpu_only: pytest fixture, not referenced in the body
    :param tf2_only: pytest fixture, not referenced in the body
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")

    if is_tf_version("1", tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD

    full_cmd = f"{hvd_cmd} {ec2_instance_type}"
    # re.match anchors at the start of the string, so this is a prefix test on the instance type
    needs_large_shm = re.match(r"(p2\.8xlarge)|(g3\.16xlarge)", ec2_instance_type) is not None
    execute_ec2_training_test(
        connection=ec2_connection,
        ecr_uri=tensorflow_training,
        test_cmd=full_cmd,
        large_shm=needs_large_shm,
    )
示例#10
0
def test_tensorflow_standalone_gpu(tensorflow_training, ec2_connection,
                                   gpu_only, ec2_instance_type):
    """
    Run the standalone TensorFlow training command on a GPU instance.

    Skipped when the image is incompatible with the instance type.

    :param tensorflow_training: TensorFlow training image URI under test
    :param ec2_connection: passed through to execute_ec2_training_test
    :param gpu_only: pytest fixture, not referenced in the body
    :param ec2_instance_type: instance type used for the compatibility check
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")

    if is_tf_version("1", tensorflow_training):
        standalone_cmd = TF1_STANDALONE_CMD
    else:
        standalone_cmd = TF2_STANDALONE_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, standalone_cmd)
示例#11
0
def ec2_performance_tensorflow_inference(image_uri, processor, ec2_connection,
                                         ec2_instance_ami, region, threshold):
    """
    Run the TF serving benchmark script on the EC2 host and validate its log.

    Logs in to ECR, pulls the image, installs the client packages (a separate command set is used when
    "graviton" occurs in the image URI), runs tf<major>_serving_perf.py with its output tee'd to a log
    file, and hands that log to ec2_performance_upload_result_to_s3_and_validate.

    :param image_uri: inference image URI under test
    :param processor: "gpu" selects nvidia-docker; any other value selects docker
    :param ec2_connection: object whose run() executes the shell commands
    :param ec2_instance_ami: passed to get_python_invoker to choose the python executable
    :param region: region used for the ECR login
    :param threshold: passed through to ec2_performance_upload_result_to_s3_and_validate
    """
    if processor == "gpu":
        docker_cmd = "nvidia-docker"
    else:
        docker_cmd = "docker"
    scripts_dir = os.path.join("$HOME", "container_tests")
    tf_major = "1" if is_tf_version("1", image_uri) else "2"
    _, serving_api_version = get_framework_and_version_from_tag(image_uri)

    # Fewer benchmark iterations in the PR context
    iterations = 500 if is_pr_context() else 1000

    # Make sure we are logged into ECR so we can pull the image
    ecr_login = f"$(aws ecr get-login --no-include-email --region {region})"
    ec2_connection.run(ecr_login, hide=True)

    ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ")

    if "graviton" in image_uri:
        # TF training binary is used that is compatible for graviton instance type
        graviton_setup = (
            "sudo apt install python3-pip",
            "pip3 install --user --upgrade awscli boto3 && pip3 install --user grpcio",
            f"pip3 install --no-dependencies --user tensorflow-serving-api=={serving_api_version}",
        )
        for setup_cmd in graviton_setup:
            ec2_connection.run(setup_cmd, hide=True)
    else:
        ec2_connection.run("pip3 install -U pip")
        ec2_connection.run(
            f"pip3 install boto3 grpcio 'tensorflow-serving-api<={serving_api_version}' "
            f"--user --no-warn-script-location"
        )

    # Log file name combines the resolved commit id (from the environment) and a timestamp
    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    log_file = f"synthetic_{commit_id}_{timestamp}.log"
    python_invoker = get_python_invoker(ec2_instance_ami)

    # Run performance inference command, display benchmark results to console
    benchmark_script = f"{scripts_dir}/bin/benchmark/tf{tf_major}_serving_perf.py"
    benchmark_cmd = (
        f"{python_invoker} {benchmark_script} "
        f"--processor {processor} --docker_image_name {image_uri} "
        f"--run_all_s3 --binary /usr/bin/tensorflow_model_server --get_perf --iterations {iterations} "
        f"2>&1 | tee {log_file}"
    )
    ec2_connection.run(benchmark_cmd)

    ec2_performance_upload_result_to_s3_and_validate(
        ec2_connection,
        image_uri,
        log_file,
        "synthetic",
        threshold,
        post_process_inference,
        log_file,
    )
示例#12
0
def test_performance_tensorflow_gpu_imagenet(tensorflow_training,
                                             ec2_connection, gpu_only):
    """
    Run the TensorFlow GPU imagenet training performance test.

    :param tensorflow_training: TensorFlow training image URI under test
    :param ec2_connection: passed through to execute_ec2_training_performance_test
    :param gpu_only: pytest fixture, not referenced in the body
    """
    # Pick the throughput threshold constant that matches the image's TF major version
    if is_tf_version("2", tensorflow_training):
        throughput_threshold = TENSORFLOW2_TRAINING_GPU_IMAGENET_THRESHOLD
    else:
        throughput_threshold = TENSORFLOW1_TRAINING_GPU_IMAGENET_THRESHOLD

    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_GPU_IMAGENET_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="imagenet",
        threshold={"Throughput": throughput_threshold},
    )
def test_performance_tensorflow_cpu(tensorflow_training, ec2_connection,
                                    cpu_only):
    """
    Run the TensorFlow CPU synthetic-data training performance test.

    :param tensorflow_training: TensorFlow training image URI under test
    :param ec2_connection: passed through to execute_ec2_training_performance_test
    :param cpu_only: pytest fixture, not referenced in the body
    """
    # Pick the throughput threshold constant that matches the image's TF major version
    if is_tf_version("2", tensorflow_training):
        throughput_threshold = TENSORFLOW2_TRAINING_CPU_SYNTHETIC_THRESHOLD
    else:
        throughput_threshold = TENSORFLOW1_TRAINING_CPU_SYNTHETIC_THRESHOLD

    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_CPU_SYNTHETIC_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="synthetic",
        threshold={"Throughput": throughput_threshold},
    )
def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection, cpu_only, tf2_only):
    """
    Run the Horovod training command for a TensorFlow image on CPU.

    If the training call raises, the container logs are fetched: when they contain the success marker the
    error is only logged as a warning, otherwise TFTrainingTestFailure is raised.

    :param tensorflow_training: TensorFlow training image URI under test
    :param ec2_connection: connection used for the training run and for fetching the docker logs
    :param cpu_only: pytest fixture, not referenced in the body
    :param tf2_only: pytest fixture, not referenced in the body
    :raises TFTrainingTestFailure: if the run raised and the logs lack the success marker
    """
    container_name = "tf_hvd_cpu_test"
    if is_tf_version("1", tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD

    try:
        execute_ec2_training_test(
            ec2_connection, tensorflow_training, hvd_cmd, container_name=container_name, timeout=1800
        )
    except Exception as err:
        container_logs = ec2_connection.run(f"docker logs {container_name}").stdout
        if "TF HVD tests passed!" not in container_logs:
            raise TFTrainingTestFailure(f"TF HVD test failed. Full output:\n{container_logs}") from err
        # The tests themselves passed, so the exception is downgraded to a warning
        LOGGER.warning(
            f"TF HVD tests succeeded, but there is an issue with fabric. Error:\n{err}\nTest output:\n{container_logs}"
        )
def test_smdebug_gpu(training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only):
    """
    Invoke run_smdebug_test for a GPU training image using nvidia-docker.

    TF1 images are skipped outside the nightly context; in the nightly context they run with a 7200
    timeout instead of 2400.

    :param training: training image URI under test
    :param ec2_connection: passed through to run_smdebug_test
    :param region: passed through to run_smdebug_test
    :param ec2_instance_type: passed through to run_smdebug_test
    :param gpu_only: pytest fixture, not referenced in the body
    :param py3_only: pytest fixture, not referenced in the body
    """
    running_tf1 = is_tf_version("1", training)
    if running_tf1 and not is_nightly_context():
        pytest.skip("TF1 gpu smdebug tests can take up to 2 hours, thus we are only running in nightly context")

    # Reaching this point with a TF1 image implies the nightly context, which gets the longer timeout
    run_smdebug_test(
        training,
        ec2_connection,
        region,
        ec2_instance_type,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
        timeout=7200 if running_tf1 else 2400,
    )
示例#16
0
def framework_version_within_limit(metafunc_obj, image):
    """
    Check an image against the framework-version fixtures requested by a test function.

    Covers the TensorFlow, MXNet and PyTorch fixture families (including the huggingface *_trcomp
    framework names for TensorFlow and PyTorch).

    :param metafunc_obj: pytest metafunc object whose fixturenames are inspected
    :param image: Image URI for which the validation must be performed
    :return: False if a requested version fixture is not satisfied by the image, else True
    """
    framework, _ = get_framework_and_version_from_tag(image)

    def requested(fixture_name):
        return fixture_name in metafunc_obj.fixturenames

    def any_below(minimums, framework_name):
        # Every entry is evaluated, in the listed order, before the results are combined
        outcomes = [
            requested(fixture_name) and is_below_framework_version(minimum, image, framework_name)
            for fixture_name, minimum in minimums
        ]
        return any(outcomes)

    if framework in ("tensorflow", "huggingface_tensorflow_trcomp"):
        tf2_failed = requested("tf2_only") and not is_tf_version("2", image)
        tf_below = any_below(
            (
                ("tf25_and_above_only", "2.5"),
                ("tf24_and_above_only", "2.4"),
                ("tf23_and_above_only", "2.3"),
                ("tf21_and_above_only", "2.1"),
            ),
            framework,
        )
        if tf2_failed or tf_below:
            return False
    elif framework == "mxnet":
        if any_below((("mx18_and_above_only", "1.8"),), "mxnet"):
            return False
    elif framework in ("pytorch", "huggingface_pytorch_trcomp"):
        pt_below = any_below(
            (
                ("pt111_and_above_only", "1.11"),
                ("pt17_and_above_only", "1.7"),
                ("pt16_and_above_only", "1.6"),
                ("pt15_and_above_only", "1.5"),
                ("pt14_and_above_only", "1.4"),
            ),
            framework,
        )
        if pt_below:
            return False
    return True
示例#17
0
def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection,
                                     cpu_only):
    """
    Run the Horovod training command for a TensorFlow image on CPU.

    :param tensorflow_training: TensorFlow training image URI under test
    :param ec2_connection: passed through to execute_ec2_training_test
    :param cpu_only: pytest fixture, not referenced in the body
    """
    if is_tf_version("1", tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, hvd_cmd)
def test_dlc_major_version_dockerfiles(image):
    """
    Test to make sure semantic versioning scheme in Dockerfiles is correct

    Walks <framework>/<job_type>/docker below the directory that precedes the first "test" path component
    of the current working directory, collects every Dockerfile.<processor> whose path contains the image's
    framework major.minor version, and checks that the dlc_major_version labels of the Dockerfiles whose
    PYTHON_VERSION matches the image form the expected sequence.

    :param image: <str> ECR image URI
    :raises DLCMajorVersionLabelNotFound: if a collected Dockerfile has no dlc_major_version label
    :raises DLCPythonVersionNotFound: if a collected Dockerfile has no PYTHON_VERSION arg
    """
    dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
    job_type = test_utils.get_job_type_from_image(image)
    framework, fw_version = test_utils.get_framework_and_version_from_tag(
        image)
    processor = test_utils.get_processor_from_image_uri(image)

    # Assign a string of numbers associated with python version in tag. Python major version is not sufficient to
    # define DLC major version
    python_major_minor_version = re.search(r"-py(\d{2,})", image).group(1)

    root_dir = os.path.join(dlc_dir, framework, job_type, "docker")

    # Skip older FW versions that did not use this versioning scheme
    references = {
        "tensorflow2": "2.2.0",
        "tensorflow1": "1.16.0",
        "mxnet": "1.7.0",
        "pytorch": "1.5.0"
    }
    if test_utils.is_tf_version("1", image):
        reference_fw = "tensorflow1"
    elif test_utils.is_tf_version("2", image):
        reference_fw = "tensorflow2"
    else:
        reference_fw = framework
    if processor != "eia" and (
            reference_fw in references
            and Version(fw_version) < Version(references[reference_fw])):
        pytest.skip(
            f"Not enforcing new versioning scheme on old image {image}. "
            f"Started enforcing version scheme on the following: {references}")

    # Find all Dockerfile.<processor> for this framework/job_type's Major.Minor version
    dockerfiles = []
    fw_version_major_minor = re.match(r"(\d+\.\d+)", fw_version).group(1)
    dockerfile_name = f"Dockerfile.{processor}"
    version_path_fragment = f"{os.sep}{fw_version_major_minor}"
    for root, _, filenames in os.walk(root_dir):
        if dockerfile_name not in filenames:
            continue
        # os.walk already yields directory paths that start with root_dir, so it must not be prepended again
        dockerfile_path = os.path.join(root, dockerfile_name)
        if "example" not in dockerfile_path and version_path_fragment in dockerfile_path:
            dockerfiles.append(dockerfile_path)

    # For the collected dockerfiles above, note the DLC major versions in each Dockerfile if python version matches
    # the current image under test
    versions = {}
    dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
    python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")
    for dockerfile in dockerfiles:
        with open(dockerfile, "r") as df:
            dlc_version = None
            python_version = None
            # The last matching line of each kind wins
            for line in df:
                major_version_match = dlc_label_regex.match(line)
                python_version_match = python_version_regex.match(line)
                if major_version_match:
                    dlc_version = int(major_version_match.group(1))
                elif python_version_match:
                    python_version = python_version_match.group(1).replace(
                        ".", "")

            # Raise errors if dlc major version label and python version arg are not found in Dockerfile.
            # Compare against None so that a label value of 0 is reported by the version assertion below
            # rather than as a missing label.
            if dlc_version is None:
                raise DLCMajorVersionLabelNotFound(
                    f"Cannot find dlc_major_version label in {dockerfile}")
            if python_version is None:
                raise DLCPythonVersionNotFound(
                    f"Cannot find PYTHON_VERSION arg in {dockerfile}")
            if python_version == python_major_minor_version:
                versions[dockerfile] = dlc_version

    # Only Dockerfiles whose python version matches the image contribute to `versions`, so the expected
    # sequence is sized by those (not by every collected Dockerfile).
    expected_versions = list(range(1, len(versions) + 1))
    actual_versions = sorted(versions.values())

    # Test case explicitly for TF2.3 gpu, since v1.0 is banned
    if (framework, fw_version_major_minor, processor,
            python_major_minor_version, job_type) == (
                "tensorflow",
                "2.3",
                "gpu",
                "37",
                "training",
            ):
        expected_versions = [v + 1 for v in expected_versions]
        assert 1 not in actual_versions, (
            f"DLC v1.0 is deprecated in TF2.3 gpu containers, but found major version 1 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Test case explicitly for PyTorch 1.6.0 training gpu, since v2.0 is banned
    if (framework, fw_version_major_minor, processor,
            python_major_minor_version, job_type) == (
                "pytorch",
                "1.6",
                "gpu",
                "36",
                "training",
            ):
        expected_versions = [v + 1 for v in expected_versions]
        expected_versions[0] = 1
        assert 2 not in actual_versions, (
            f"DLC v2.0 is deprecated in PyTorch 1.6.0 gpu containers, but found major version 2 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Note: If, for example, we find 3 dockerfiles with the same framework major/minor version, same processor,
    # and same python major/minor version, we will expect DLC major versions 1, 2, and 3. If an exception needs to be
    # made to this rule, please see the above handling of TF2.3 as an example.
    assert actual_versions == expected_versions, (
        f"Found DLC major versions {actual_versions} but expected {expected_versions} for "
        f"{framework} {job_type} {processor}. Full version info: {versions}. Py version: {python_major_minor_version}"
    )
示例#19
0
def test_tensorflow_standalone_cpu(tensorflow_training, ec2_connection, cpu_only):
    """
    Run the standalone TensorFlow training command on CPU.

    :param tensorflow_training: TensorFlow training image URI under test
    :param ec2_connection: passed through to execute_ec2_training_test
    :param cpu_only: pytest fixture, not referenced in the body
    """
    if is_tf_version("1", tensorflow_training):
        standalone_cmd = TF1_STANDALONE_CMD
    else:
        standalone_cmd = TF2_STANDALONE_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, standalone_cmd)
def test_cuda_paths(gpu):
    """
    Test to ensure directory structure for GPU Dockerfiles has cuda version in it

    Also scans the framework's buildspec for a "CUDA_VERSION <cuda version>" entry. A missing entry fails
    the test only in the DLC CI/CD context; in any other context it is logged as a warning.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip(
            "Skipping Example Dockerfiles which are not explicitly tied to a cuda version"
        )

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = next((fw for fw in frameworks if fw in image), "")
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda, framework version, python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_version = re.search(r":(\d+(\.\d+){2})", image).group(1)
    python_version = re.search(r"(py\d+)", image).group(1)

    # Prefer a directory named after the full framework version; fall back to major.minor
    framework_version_path = os.path.join(dlc_path, framework, job_type,
                                          "docker", framework_version)
    if not os.path.exists(framework_version_path):
        framework_short_version = re.match(r"(\d+\.\d+)",
                                           framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework, job_type,
                                              "docker",
                                              framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path,
                                       python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    cuda_in_buildspec_ref = f"CUDA_VERSION {cuda_version}"
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    with open(buildspec_path, "r") as bf:
        cuda_in_buildspec = any(cuda_in_buildspec_ref in line for line in bf)

    if not cuda_in_buildspec:
        failure = f"Can't find {cuda_in_buildspec_ref} in {buildspec_path}"
        if is_dlc_cicd_context():
            raise AssertionError(failure)
        LOGGER.warning(
            f"{failure} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
        )

    # Check that a Dockerfile exists in the right directory
    dockerfile_path = os.path.join(framework_version_path, python_version,
                                   cuda_version, "Dockerfile.gpu")

    assert os.path.exists(
        dockerfile_path
    ), f"Cannot find dockerfile for image {image} in {dockerfile_path}"
def test_curand_gpu(training, ec2_connection, gpu_only):
    """
    Run CURAND_CMD against a training image; skipped for TF1 and MXNet images.

    :param training: training image URI under test
    :param ec2_connection: passed through to execute_ec2_training_test
    :param gpu_only: pytest fixture, not referenced in the body
    """
    not_configured = is_tf_version("1", training) or "mxnet" in training
    if not_configured:
        pytest.skip("Test is not configured for TF1 and MXNet")

    execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
示例#22
0
def test_cuda_paths(gpu):
    """
    Test to ensure that:
    a. buildspec contains an entry to create the same image as the image URI
    b. directory structure for GPU Dockerfiles has framework version, python version, and cuda version in it

    A missing buildspec entry fails the test only in the DLC CI/CD context. In any other context it is
    logged as a warning and the remaining checks are not run, because without an entry there is no
    Dockerfile path to validate.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip(
            "Skipping Example Dockerfiles which are not explicitly tied to a cuda version"
        )

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    framework, framework_version = get_framework_and_version_from_tag(image)

    # Get cuda, framework version, python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_short_version = None
    python_version = re.search(r"(py\d+)", image).group(1)
    short_python_version = None
    image_tag = re.search(
        r":(\d+(\.\d+){2}(-transformers\d+(\.\d+){2})?-(gpu)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)((-e3)?-example|-e3|-sagemaker)?)",
        image,
    ).group(1)

    # replacing '_' by '/' to handle huggingface_<framework> case
    framework_path = framework.replace("_", "/")
    framework_version_path = os.path.join(dlc_path, framework_path, job_type,
                                          "docker", framework_version)
    if not os.path.exists(framework_version_path):
        framework_short_version = re.match(r"(\d+\.\d+)",
                                           framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework_path,
                                              job_type, "docker",
                                              framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path,
                                       python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        short_python_version = python_version[:3]

    # Check buildspec for an image entry with the same tag
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    dockerfile_spec_abs_path = None
    buildspec_path = os.path.join(dlc_path, framework_path, buildspec)
    buildspec_def = Buildspec()
    buildspec_def.load(buildspec_path)

    docker_dir_prefix = "docker/"
    for image_spec in buildspec_def["images"].values():
        if image_spec["device_type"] == "gpu" and image_spec["tag"] == image_tag:
            docker_file = image_spec["docker_file"]
            # Remove only the leading "docker/" component. str.lstrip("docker/") would strip any run of
            # the characters d, o, c, k, e, r and "/" and could eat into the next path component.
            if docker_file.startswith(docker_dir_prefix):
                docker_file = docker_file[len(docker_dir_prefix):]
            dockerfile_spec_abs_path = os.path.join(
                os.path.dirname(framework_version_path), docker_file)
            break

    if dockerfile_spec_abs_path is None:
        failure = f"Image tag {image_tag} not found in {buildspec_path}"
        if is_dlc_cicd_context():
            raise AssertionError(failure)
        LOGGER.warning(
            f"{failure} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
        )
        # No buildspec entry means there is no Dockerfile path left to check
        return

    image_properties_expected_in_dockerfile_path = [
        framework_short_version or framework_version,
        short_python_version or python_version,
        cuda_version,
    ]
    assert all(
        prop in dockerfile_spec_abs_path
        for prop in image_properties_expected_in_dockerfile_path
    ), (f"Dockerfile location {dockerfile_spec_abs_path} does not contain all the image properties in "
        f"{image_properties_expected_in_dockerfile_path}")

    assert os.path.exists(
        dockerfile_spec_abs_path
    ), f"Cannot find dockerfile for {image} in {dockerfile_spec_abs_path}"
def test_cuda_paths(gpu):
    """
    Test to ensure that:
    a. buildspec contains an entry to create the same image as the image URI
    b. directory structure for GPU Dockerfiles has framework version, python version, and cuda version in it

    A missing buildspec entry fails the test only in the DLC CI/CD context. In any other context it is
    logged as a warning and the remaining checks are not run, because without an entry there is no
    Dockerfile path to validate.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip(
            "Skipping Example Dockerfiles which are not explicitly tied to a cuda version"
        )

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = next((fw for fw in frameworks if fw in image), "")
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda, framework version, python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_version = re.search(r":(\d+(\.\d+){2})", image).group(1)
    framework_short_version = None
    python_version = re.search(r"(py\d+)", image).group(1)
    short_python_version = None
    image_tag = re.search(
        r":(\d+(\.\d+){2}-(cpu|gpu|neuron)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)(-example)?)",
        image).group(1)

    framework_version_path = os.path.join(dlc_path, framework, job_type,
                                          "docker", framework_version)
    if not os.path.exists(framework_version_path):
        framework_short_version = re.match(r"(\d+\.\d+)",
                                           framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework, job_type,
                                              "docker",
                                              framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path,
                                       python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        short_python_version = python_version[:3]

    # Check buildspec for an image entry with the same tag
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    dockerfile_spec_abs_path = None
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    buildspec_def = Buildspec()
    buildspec_def.load(buildspec_path)

    docker_dir_prefix = "docker/"
    for image_spec in buildspec_def["images"].values():
        if image_spec["device_type"] == "gpu" and image_spec["tag"] == image_tag:
            docker_file = image_spec["docker_file"]
            # Remove only the leading "docker/" component. str.lstrip("docker/") would strip any run of
            # the characters d, o, c, k, e, r and "/" and could eat into the next path component.
            if docker_file.startswith(docker_dir_prefix):
                docker_file = docker_file[len(docker_dir_prefix):]
            dockerfile_spec_abs_path = os.path.join(
                os.path.dirname(framework_version_path), docker_file)
            break

    if dockerfile_spec_abs_path is None:
        # The lookup above matches on the image tag, so the message reports the tag
        failure = f"Image tag {image_tag} not found in {buildspec_path}"
        if is_dlc_cicd_context():
            raise AssertionError(failure)
        LOGGER.warning(
            f"{failure} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
        )
        # No buildspec entry means there is no Dockerfile path left to check
        return

    image_properties_expected_in_dockerfile_path = [
        framework_short_version or framework_version,
        short_python_version or python_version,
        cuda_version,
    ]
    assert all(
        prop in dockerfile_spec_abs_path
        for prop in image_properties_expected_in_dockerfile_path
    ), (f"Dockerfile location {dockerfile_spec_abs_path} does not contain all the image properties in "
        f"{image_properties_expected_in_dockerfile_path}")

    assert os.path.exists(
        dockerfile_spec_abs_path
    ), f"Cannot find dockerfile for {image} in {dockerfile_spec_abs_path}"