def test_smdebug_gpu(training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only):
    """
    Run the smdebug test for a training image using nvidia-docker.

    The test is skipped when the image is incompatible with the instance type, and for TF1 images
    outside of the nightly context. TF1 nightly runs use a 7200 second timeout; everything else
    uses 2400 seconds.

    :param training: training image URI
    :param ec2_connection: connection object forwarded to run_smdebug_test
    :param region: region forwarded to run_smdebug_test
    :param ec2_instance_type: EC2 instance type the test runs on
    :param gpu_only: not used in the body (presumably a pytest selection fixture)
    :param py3_only: not used in the body (presumably a pytest selection fixture)
    """
    if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type):
        pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}")

    timeout_seconds = 2400
    if is_tf_version("1", training):
        # pytest.skip raises, so the longer timeout is only applied in nightly context
        if not is_nightly_context():
            pytest.skip("TF1 gpu smdebug tests can take up to 2 hours, thus we are only running in nightly context")
        timeout_seconds = 7200

    run_smdebug_test(
        training,
        ec2_connection,
        region,
        ec2_instance_type,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
        timeout=timeout_seconds,
    )
def test_sm_profiler_tf(tensorflow_training):
    """
    Prepare the smprofiler test bundle locally and run the TensorFlow profiler tests.

    Skips TF1 images and any processor other than cpu/gpu. Otherwise it creates a working
    directory under CODEBUILD_SRC_DIR, copies sagemaker-tests.zip from the location given by
    SMPROFILER_TESTS_BUCKET, unzips it, appends a tensorflow-datasets pin to the TF scripts'
    requirements file, and calls run_sm_profiler_tests.

    :param tensorflow_training: TensorFlow training image URI
    """
    if is_tf_version("1", tensorflow_training):
        pytest.skip("Skipping test on TF1, since there are no smprofiler config files for TF1")

    device = get_processor_from_image_uri(tensorflow_training)
    if device not in ("cpu", "gpu"):
        pytest.skip(f"Processor {device} not supported. Skipping test.")

    shell = Context()
    work_dir = os.path.join(
        os.getenv("CODEBUILD_SRC_DIR"),
        get_container_name("smprof", tensorflow_training),
        "smprofiler_tests",
    )
    shell.run(f"mkdir -p {work_dir}", hide=True)

    # Fetch and unpack the sagemaker-tests archive
    archive = "sagemaker-tests.zip"
    download_cmd = f"aws s3 cp {os.getenv('SMPROFILER_TESTS_BUCKET')}/{archive} {work_dir}/{archive}"
    shell.run(download_cmd, hide=True)
    shell.run(f"cd {work_dir} && unzip {archive}", hide=True)

    # Add tensorflow-datasets to the requirements of the TF test scripts
    requirements_file = f"{work_dir}/sagemaker-tests/tests/scripts/tf_scripts/requirements.txt"
    shell.run(f"echo 'tensorflow-datasets==4.0.1' >> {requirements_file}", hide=True)

    run_sm_profiler_tests(tensorflow_training, work_dir, "test_profiler_tensorflow.py", device)
def framework_version_within_limit(metafunc_obj, image):
    """
    Check the version-limit fixtures requested by a test against the image's framework version.

    Only TensorFlow and MXNet images are checked; any other framework always passes.

    :param metafunc_obj: pytest metafunc object from which fixture names used by test function will be obtained
    :param image: Image URI for which the validation must be performed
    :return: True if all validation succeeds, else False
    """
    framework, _ = get_framework_and_version_from_tag(image)

    if framework == "tensorflow":
        requested = metafunc_obj.fixturenames
        # Each entry is truthy when the fixture is requested and the image does not satisfy it
        failed_requirements = [
            "tf2_only" in requested and not is_tf_version("2", image),
            "tf24_and_above_only" in requested and is_below_tf_version("2.4", image),
            "tf23_and_above_only" in requested and is_below_tf_version("2.3", image),
            "tf21_and_above_only" in requested and is_below_tf_version("2.1", image),
        ]
        if any(failed_requirements):
            return False
    elif framework == "mxnet":
        requested = metafunc_obj.fixturenames
        if "mx18_and_above_only" in requested and is_below_mxnet_version("1.8", image):
            return False

    return True
def ec2_performance_tensorflow_inference(image_uri, processor, ec2_connection, region, threshold):
    """
    Run the TF serving synthetic benchmark over the EC2 connection and validate the results.

    Logs into ECR, pulls the image, installs the client-side pip packages, runs the
    tf<major>_serving_perf.py benchmark while tee-ing output to a log file, and hands that log to
    ec2_performance_upload_result_to_s3_and_validate.

    :param image_uri: TensorFlow inference image URI
    :param processor: "gpu" selects nvidia-docker; any other value selects docker
    :param ec2_connection: connection object whose run() executes the commands
    :param region: region used for the ECR login
    :param threshold: threshold forwarded to the validation helper
    """
    docker_binary = "docker"
    if processor == "gpu":
        docker_binary = "nvidia-docker"
    tests_dir = os.path.join("$HOME", "container_tests")

    # TF major version picks both the benchmark script and the tensorflow-serving-api pin
    if is_tf_version("1", image_uri):
        tf_major, serving_api_version = "1", "1.15"
    else:
        tf_major, serving_api_version = "2", "2.3.0"

    # Make sure we are logged into ECR so we can pull the image
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    ec2_connection.run(f"{docker_binary} pull -q {image_uri} ")

    # Install what the benchmark script needs
    ec2_connection.run("pip3 install -U pip")
    ec2_connection.run(
        f"pip3 install boto3 grpcio tensorflow-serving-api=={serving_api_version} --user --no-warn-script-location"
    )

    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    log_file = f"synthetic_{commit}_{timestamp}.log"

    # Run performance inference command, display benchmark results to console
    benchmark_cmd = " ".join(
        [
            f"python3 {tests_dir}/bin/benchmark/tf{tf_major}_serving_perf.py",
            f"--processor {processor}",
            f"--docker_image_name {image_uri}",
            "--run_all_s3",
            "--binary /usr/bin/tensorflow_model_server",
            "--get_perf",
            "--iterations 1000",
            f"2>&1 | tee {log_file}",
        ]
    )
    ec2_connection.run(benchmark_cmd)

    ec2_performance_upload_result_to_s3_and_validate(
        ec2_connection,
        image_uri,
        log_file,
        "synthetic",
        threshold,
        post_process_inference,
        log_file,
    )
def test_performance_ec2_tensorflow_inference_cpu(tensorflow_inference, ec2_connection, region, cpu_only):
    """
    Run the TensorFlow inference performance benchmark on CPU.

    The threshold is the TF1 or TF2 CPU inference constant, chosen from the image's TF major version.

    :param tensorflow_inference: TensorFlow inference image URI
    :param ec2_connection: connection object forwarded to the benchmark helper
    :param region: region forwarded to the benchmark helper
    :param cpu_only: not used in the body (presumably a pytest selection fixture)
    """
    if is_tf_version("1", tensorflow_inference):
        cpu_threshold = TENSORFLOW1_INFERENCE_CPU_THRESHOLD
    else:
        cpu_threshold = TENSORFLOW2_INFERENCE_CPU_THRESHOLD

    ec2_performance_tensorflow_inference(tensorflow_inference, "cpu", ec2_connection, region, cpu_threshold)
def test_tensorflow_with_horovod_gpu(tensorflow_training, ec2_instance_type, ec2_connection, gpu_only, tf2_only):
    """
    Run the TensorFlow Horovod test command on a GPU instance.

    :param tensorflow_training: TensorFlow training image URI
    :param ec2_instance_type: EC2 instance type; p2.8xlarge and g3.16xlarge enable large_shm
    :param ec2_connection: connection object forwarded to execute_ec2_training_test
    :param gpu_only: not used in the body (presumably a pytest selection fixture)
    :param tf2_only: not used in the body (presumably a pytest selection fixture)
    """
    if is_tf_version("1", tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD

    # re.match anchors at the start of the instance type string
    needs_large_shm = re.match(r"(p2\.8xlarge)|(g3\.16xlarge)", ec2_instance_type) is not None

    execute_ec2_training_test(
        connection=ec2_connection,
        ecr_uri=tensorflow_training,
        test_cmd=hvd_cmd,
        large_shm=needs_large_shm,
    )
def get_framework_from_image_uri(image_uri):
    """
    Return the framework named in an image URI.

    Frameworks are searched as substrings in the order tensorflow, mxnet, pytorch; the first hit
    wins. A TensorFlow image for which is_tf_version("2", ...) is true is reported as "tensorflow2".

    :param image_uri: image URI to inspect
    :return: "tensorflow", "tensorflow2", "mxnet" or "pytorch"
    :raises RuntimeError: if none of the known framework names occurs in the URI
    """
    known_frameworks = ("tensorflow", "mxnet", "pytorch")
    detected = next((name for name in known_frameworks if name in image_uri), None)

    if detected is None:
        raise RuntimeError(f"Could not find any framework {known_frameworks} in {image_uri}")

    if detected == "tensorflow" and is_tf_version("2", image_uri):
        return "tensorflow2"
    return detected
def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type):
    """
    Run CURAND_CMD against a training image on a GPU instance.

    Skipped when the image is incompatible with the instance type, and for TF1 and MXNet images.

    :param training: training image URI
    :param ec2_connection: connection object forwarded to execute_ec2_training_test
    :param gpu_only: not used in the body (presumably a pytest selection fixture)
    :param ec2_instance_type: EC2 instance type the test runs on
    """
    if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type):
        pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}")

    unsupported_image = is_tf_version("1", training) or "mxnet" in training
    if unsupported_image:
        pytest.skip("Test is not configured for TF1 and MXNet")

    execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
def test_tensorflow_with_horovod_gpu(tensorflow_training, ec2_instance_type, ec2_connection, gpu_only, tf2_only):
    """
    Run the TensorFlow Horovod test command on a GPU instance, passing the instance type to it.

    Skipped when the image is incompatible with the instance type.

    :param tensorflow_training: TensorFlow training image URI
    :param ec2_instance_type: EC2 instance type; appended to the test command, and p2.8xlarge /
        g3.16xlarge enable large_shm
    :param ec2_connection: connection object forwarded to execute_ec2_training_test
    :param gpu_only: not used in the body (presumably a pytest selection fixture)
    :param tf2_only: not used in the body (presumably a pytest selection fixture)
    """
    if test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")

    if is_tf_version("1", tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD

    # re.match anchors at the start of the instance type string
    needs_large_shm = re.match(r"(p2\.8xlarge)|(g3\.16xlarge)", ec2_instance_type) is not None

    execute_ec2_training_test(
        connection=ec2_connection,
        ecr_uri=tensorflow_training,
        test_cmd=f"{hvd_cmd} {ec2_instance_type}",
        large_shm=needs_large_shm,
    )
def test_tensorflow_standalone_gpu(tensorflow_training, ec2_connection, gpu_only, ec2_instance_type):
    """
    Run the standalone TensorFlow test command on a GPU instance.

    Skipped when the image is incompatible with the instance type.

    :param tensorflow_training: TensorFlow training image URI
    :param ec2_connection: connection object forwarded to execute_ec2_training_test
    :param gpu_only: not used in the body (presumably a pytest selection fixture)
    :param ec2_instance_type: EC2 instance type the test runs on
    """
    if test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")

    if is_tf_version("1", tensorflow_training):
        standalone_cmd = TF1_STANDALONE_CMD
    else:
        standalone_cmd = TF2_STANDALONE_CMD

    execute_ec2_training_test(ec2_connection, tensorflow_training, standalone_cmd)
def ec2_performance_tensorflow_inference(image_uri, processor, ec2_connection, ec2_instance_ami, region, threshold):
    """
    Run the TF serving synthetic benchmark over the EC2 connection and validate the results.

    Logs into ECR, pulls the image, installs the client-side pip packages (with a separate install
    path when "graviton" appears in the image URI), runs the tf<major>_serving_perf.py benchmark
    while tee-ing output to a log file, and hands that log to
    ec2_performance_upload_result_to_s3_and_validate. The benchmark runs 500 iterations in PR
    context and 1000 otherwise.

    :param image_uri: TensorFlow inference image URI
    :param processor: "gpu" selects nvidia-docker; any other value selects docker
    :param ec2_connection: connection object whose run() executes the commands
    :param ec2_instance_ami: AMI passed to get_python_invoker to choose the python command
    :param region: region used for the ECR login
    :param threshold: threshold forwarded to the validation helper
    """
    docker_binary = "docker"
    if processor == "gpu":
        docker_binary = "nvidia-docker"
    tests_dir = os.path.join("$HOME", "container_tests")

    tf_major = "1" if is_tf_version("1", image_uri) else "2"
    _, serving_api_version = get_framework_and_version_from_tag(image_uri)

    if is_pr_context():
        iterations = 500
    else:
        iterations = 1000

    # Make sure we are logged into ECR so we can pull the image
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    ec2_connection.run(f"{docker_binary} pull -q {image_uri} ")

    # Install what the benchmark script needs
    if "graviton" in image_uri:
        # TF training binary is used that is compatible for graviton instance type
        graviton_setup_cmds = (
            "sudo apt install python3-pip",
            "pip3 install --user --upgrade awscli boto3 && pip3 install --user grpcio",
            f"pip3 install --no-dependencies --user tensorflow-serving-api=={serving_api_version}",
        )
        for setup_cmd in graviton_setup_cmds:
            ec2_connection.run(setup_cmd, hide=True)
    else:
        ec2_connection.run("pip3 install -U pip")
        ec2_connection.run(
            f"pip3 install boto3 grpcio 'tensorflow-serving-api<={serving_api_version}' "
            "--user --no-warn-script-location"
        )

    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    log_file = f"synthetic_{commit}_{timestamp}.log"
    python_cmd = get_python_invoker(ec2_instance_ami)

    # Run performance inference command, display benchmark results to console
    benchmark_cmd = " ".join(
        [
            f"{python_cmd} {tests_dir}/bin/benchmark/tf{tf_major}_serving_perf.py",
            f"--processor {processor}",
            f"--docker_image_name {image_uri}",
            "--run_all_s3",
            "--binary /usr/bin/tensorflow_model_server",
            "--get_perf",
            f"--iterations {iterations}",
            f"2>&1 | tee {log_file}",
        ]
    )
    ec2_connection.run(benchmark_cmd)

    ec2_performance_upload_result_to_s3_and_validate(
        ec2_connection,
        image_uri,
        log_file,
        "synthetic",
        threshold,
        post_process_inference,
        log_file,
    )
def test_performance_tensorflow_gpu_imagenet(tensorflow_training, ec2_connection, gpu_only):
    """
    Run the TensorFlow GPU training performance test with the imagenet data source.

    The throughput threshold is the TF2 or TF1 imagenet constant, chosen from the image's TF
    major version.

    :param tensorflow_training: TensorFlow training image URI
    :param ec2_connection: connection object forwarded to the performance test helper
    :param gpu_only: not used in the body (presumably a pytest selection fixture)
    """
    if is_tf_version("2", tensorflow_training):
        throughput_threshold = TENSORFLOW2_TRAINING_GPU_IMAGENET_THRESHOLD
    else:
        throughput_threshold = TENSORFLOW1_TRAINING_GPU_IMAGENET_THRESHOLD

    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_GPU_IMAGENET_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="imagenet",
        threshold={"Throughput": throughput_threshold},
    )
def test_performance_tensorflow_cpu(tensorflow_training, ec2_connection, cpu_only):
    """
    Run the TensorFlow CPU training performance test with the synthetic data source.

    The throughput threshold is the TF2 or TF1 CPU synthetic constant, chosen from the image's TF
    major version.

    :param tensorflow_training: TensorFlow training image URI
    :param ec2_connection: connection object forwarded to the performance test helper
    :param cpu_only: not used in the body (presumably a pytest selection fixture)
    """
    if is_tf_version("2", tensorflow_training):
        throughput_threshold = TENSORFLOW2_TRAINING_CPU_SYNTHETIC_THRESHOLD
    else:
        throughput_threshold = TENSORFLOW1_TRAINING_CPU_SYNTHETIC_THRESHOLD

    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_CPU_SYNTHETIC_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="synthetic",
        threshold={"Throughput": throughput_threshold},
    )
def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection, cpu_only, tf2_only):
    """
    Run the TensorFlow Horovod test command on a CPU instance with a 1800 second timeout.

    If the run raises, the container logs are inspected: when they contain "TF HVD tests passed!"
    the failure is only logged as a warning, otherwise TFTrainingTestFailure is raised.

    :param tensorflow_training: TensorFlow training image URI
    :param ec2_connection: connection object used to run the test and to fetch docker logs
    :param cpu_only: not used in the body (presumably a pytest selection fixture)
    :param tf2_only: not used in the body (presumably a pytest selection fixture)
    :raises TFTrainingTestFailure: if the run fails and the logs lack the success marker
    """
    container_name = "tf_hvd_cpu_test"
    if is_tf_version("1", tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD

    try:
        execute_ec2_training_test(
            ec2_connection,
            tensorflow_training,
            hvd_cmd,
            container_name=container_name,
            timeout=1800,
        )
    except Exception as e:
        # Decide from the container output whether the tests themselves passed
        container_logs = ec2_connection.run(f"docker logs {container_name}").stdout
        if "TF HVD tests passed!" not in container_logs:
            raise TFTrainingTestFailure(f"TF HVD test failed. Full output:\n{container_logs}") from e
        LOGGER.warning(
            f"TF HVD tests succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{container_logs}"
        )
def test_smdebug_gpu(training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only):
    """
    Run the smdebug test for a training image using nvidia-docker.

    TF1 images are skipped outside of the nightly context. TF1 nightly runs use a 7200 second
    timeout; everything else uses 2400 seconds.

    :param training: training image URI
    :param ec2_connection: connection object forwarded to run_smdebug_test
    :param region: region forwarded to run_smdebug_test
    :param ec2_instance_type: EC2 instance type forwarded to run_smdebug_test
    :param gpu_only: not used in the body (presumably a pytest selection fixture)
    :param py3_only: not used in the body (presumably a pytest selection fixture)
    """
    timeout_seconds = 2400
    if is_tf_version("1", training):
        # pytest.skip raises, so the longer timeout is only applied in nightly context
        if not is_nightly_context():
            pytest.skip("TF1 gpu smdebug tests can take up to 2 hours, thus we are only running in nightly context")
        timeout_seconds = 7200

    run_smdebug_test(
        training,
        ec2_connection,
        region,
        ec2_instance_type,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
        timeout=timeout_seconds,
    )
def framework_version_within_limit(metafunc_obj, image):
    """
    Check the version-limit fixtures requested by a test against the image's framework version.

    TensorFlow, MXNet and PyTorch images (including the huggingface *_trcomp variants of
    TensorFlow and PyTorch) are checked; any other framework always passes.

    :param metafunc_obj: pytest metafunc object from which fixture names used by test function will be obtained
    :param image: Image URI for which the validation must be performed
    :return: True if all validation succeeds, else False
    """
    framework, _ = get_framework_and_version_from_tag(image)

    # (fixture name, minimum framework version) pairs, in the order they are evaluated
    tf_minimums = (
        ("tf25_and_above_only", "2.5"),
        ("tf24_and_above_only", "2.4"),
        ("tf23_and_above_only", "2.3"),
        ("tf21_and_above_only", "2.1"),
    )
    pt_minimums = (
        ("pt111_and_above_only", "1.11"),
        ("pt17_and_above_only", "1.7"),
        ("pt16_and_above_only", "1.6"),
        ("pt15_and_above_only", "1.5"),
        ("pt14_and_above_only", "1.4"),
    )

    if framework in ("tensorflow", "huggingface_tensorflow_trcomp"):
        requested = metafunc_obj.fixturenames
        failed_requirements = ["tf2_only" in requested and not is_tf_version("2", image)]
        failed_requirements += [
            fixture in requested and is_below_framework_version(minimum, image, framework)
            for fixture, minimum in tf_minimums
        ]
        if any(failed_requirements):
            return False
    elif framework == "mxnet":
        requested = metafunc_obj.fixturenames
        if "mx18_and_above_only" in requested and is_below_framework_version("1.8", image, "mxnet"):
            return False
    elif framework in ("pytorch", "huggingface_pytorch_trcomp"):
        requested = metafunc_obj.fixturenames
        failed_requirements = [
            fixture in requested and is_below_framework_version(minimum, image, framework)
            for fixture, minimum in pt_minimums
        ]
        if any(failed_requirements):
            return False

    return True
def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection, cpu_only):
    """
    Run the TensorFlow Horovod test command on a CPU instance.

    :param tensorflow_training: TensorFlow training image URI
    :param ec2_connection: connection object forwarded to execute_ec2_training_test
    :param cpu_only: not used in the body (presumably a pytest selection fixture)
    """
    if is_tf_version("1", tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD

    execute_ec2_training_test(ec2_connection, tensorflow_training, hvd_cmd)
def test_dlc_major_version_dockerfiles(image):
    """
    Test to make sure semantic versioning scheme in Dockerfiles is correct.

    Collects every Dockerfile.<processor> under <framework>/<job_type>/docker whose path contains
    the image's framework major.minor version (excluding "example" paths), reads the
    dlc_major_version label and PYTHON_VERSION arg from each, and asserts that the Dockerfiles
    matching the image's python version carry the DLC major versions 1..N.

    :param image: <str> ECR image URI
    :raises DLCMajorVersionLabelNotFound: if a collected Dockerfile has no dlc_major_version label
    :raises DLCPythonVersionNotFound: if a collected Dockerfile has no PYTHON_VERSION arg
    """
    dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
    job_type = test_utils.get_job_type_from_image(image)
    framework, fw_version = test_utils.get_framework_and_version_from_tag(image)
    processor = test_utils.get_processor_from_image_uri(image)

    # Assign a string of numbers associated with python version in tag. Python major version is not sufficient to
    # define DLC major version
    python_major_minor_version = re.search(r"-py(\d{2,})", image).group(1)

    root_dir = os.path.join(dlc_dir, framework, job_type, "docker")

    # Skip older FW versions that did not use this versioning scheme
    references = {"tensorflow2": "2.2.0", "tensorflow1": "1.16.0", "mxnet": "1.7.0", "pytorch": "1.5.0"}
    if test_utils.is_tf_version("1", image):
        reference_fw = "tensorflow1"
    elif test_utils.is_tf_version("2", image):
        reference_fw = "tensorflow2"
    else:
        reference_fw = framework
    if processor != "eia" and (
        reference_fw in references and Version(fw_version) < Version(references[reference_fw])
    ):
        pytest.skip(
            f"Not enforcing new versioning scheme on old image {image}. "
            f"Started enforcing version scheme on the following: {references}"
        )

    # Find all Dockerfile.<processor> for this framework/job_type's Major.Minor version
    dockerfiles = []
    fw_version_major_minor = re.match(r"(\d+\.\d+)", fw_version).group(1)
    dockerfile_name = f"Dockerfile.{processor}"
    version_path_marker = f"{os.sep}{fw_version_major_minor}"
    for root, _, filenames in os.walk(root_dir):
        if dockerfile_name not in filenames:
            continue
        # os.walk already yields roots prefixed with root_dir
        dockerfile_path = os.path.join(root, dockerfile_name)
        if "example" not in dockerfile_path and version_path_marker in dockerfile_path:
            dockerfiles.append(dockerfile_path)

    # For the collected dockerfiles above, note the DLC major versions in each Dockerfile if python version matches
    # the current image under test
    versions = {}
    dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
    python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")
    for dockerfile in dockerfiles:
        with open(dockerfile, "r") as df:
            dlc_version = None
            python_version = None
            for line in df:
                major_version_match = dlc_label_regex.match(line)
                python_version_match = python_version_regex.match(line)
                if major_version_match:
                    dlc_version = int(major_version_match.group(1))
                elif python_version_match:
                    python_version = python_version_match.group(1).replace(".", "")

        # Raise errors if dlc major version label and python version arg are not found in Dockerfile.
        # Compare against None so that a label value of 0 is not mistaken for a missing label.
        if dlc_version is None:
            raise DLCMajorVersionLabelNotFound(f"Cannot find dlc_major_version label in {dockerfile}")
        if python_version is None:
            raise DLCPythonVersionNotFound(f"Cannot find PYTHON_VERSION arg in {dockerfile}")
        if python_version == python_major_minor_version:
            versions[dockerfile] = dlc_version

    # Only Dockerfiles whose python version matches the image contribute to `versions`, so the
    # expected sequence must be sized from `versions`, not from every collected Dockerfile.
    expected_versions = list(range(1, len(versions) + 1))
    actual_versions = sorted(versions.values())

    image_identity = (framework, fw_version_major_minor, processor, python_major_minor_version, job_type)

    # Test case explicitly for TF2.3 gpu, since v1.0 is banned
    if image_identity == ("tensorflow", "2.3", "gpu", "37", "training"):
        expected_versions = [v + 1 for v in expected_versions]
        assert 1 not in actual_versions, (
            f"DLC v1.0 is deprecated in TF2.3 gpu containers, but found major version 1 "
            f"in one of the Dockerfiles. Please inspect {versions}"
        )

    # Test case explicitly for PyTorch 1.6.0 training gpu, since v2.0 is banned
    if image_identity == ("pytorch", "1.6", "gpu", "36", "training"):
        expected_versions = [v + 1 for v in expected_versions]
        # Guard against an empty list so a missing Dockerfile surfaces in the final assertion
        if expected_versions:
            expected_versions[0] = 1
        assert 2 not in actual_versions, (
            f"DLC v2.0 is deprecated in PyTorch 1.6.0 gpu containers, but found major version 2 "
            f"in one of the Dockerfiles. Please inspect {versions}"
        )

    # Note: If, for example, we find 3 dockerfiles with the same framework major/minor version, same processor,
    # and same python major/minor version, we will expect DLC major versions 1, 2, and 3. If an exception needs to be
    # made to this rule, please see the above handling of TF2.3 as an example.
    assert actual_versions == expected_versions, (
        f"Found DLC major versions {actual_versions} but expected {expected_versions} for "
        f"{framework} {job_type} {processor}. Full version info: {versions}. Py version: {python_major_minor_version}"
    )
def test_tensorflow_standalone_cpu(tensorflow_training, ec2_connection, cpu_only):
    """
    Run the standalone TensorFlow test command on a CPU instance.

    :param tensorflow_training: TensorFlow training image URI
    :param ec2_connection: connection object forwarded to execute_ec2_training_test
    :param cpu_only: not used in the body (presumably a pytest selection fixture)
    """
    if is_tf_version("1", tensorflow_training):
        standalone_cmd = TF1_STANDALONE_CMD
    else:
        standalone_cmd = TF2_STANDALONE_CMD

    execute_ec2_training_test(ec2_connection, tensorflow_training, standalone_cmd)
def test_cuda_paths(gpu):
    """
    Test to ensure directory structure for GPU Dockerfiles has cuda version in it.

    Checks that the framework buildspec contains a "CUDA_VERSION <cuXXX>" entry for the image's
    cuda version (only enforced in the DLC CI/CD context; a warning is logged otherwise), and that
    <framework>/<job_type>/docker/<version>/<python>/<cuda>/Dockerfile.gpu exists.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip("Skipping Example Dockerfiles which are not explicitly tied to a cuda version")

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = next((fw for fw in frameworks if fw in image), "")
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda, framework version, python version through regex.
    # Dots are escaped so that only a literal "." separates version components.
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_version = re.search(r":(\d+(\.\d+){2})", image).group(1)
    python_version = re.search(r"(py\d+)", image).group(1)

    framework_version_path = os.path.join(dlc_path, framework, job_type, "docker", framework_version)
    if not os.path.exists(framework_version_path):
        # Fall back to the major.minor directory when there is no directory for the full version
        framework_short_version = re.match(r"(\d+\.\d+)", framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework, job_type, "docker", framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path, python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    cuda_in_buildspec_ref = f"CUDA_VERSION {cuda_version}"
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    with open(buildspec_path, "r") as bf:
        cuda_in_buildspec = any(cuda_in_buildspec_ref in line for line in bf)

    try:
        assert cuda_in_buildspec, f"Can't find {cuda_in_buildspec_ref} in {buildspec_path}"
    except AssertionError as e:
        if not is_dlc_cicd_context():
            # Logger.warn is a deprecated alias; use warning()
            LOGGER.warning(
                f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
            )
        else:
            raise

    # Check that a Dockerfile exists in the right directory
    dockerfile_path = os.path.join(framework_version_path, python_version, cuda_version, "Dockerfile.gpu")

    assert os.path.exists(dockerfile_path), f"Cannot find dockerfile for image {image} in {dockerfile_path}"
def test_curand_gpu(training, ec2_connection, gpu_only):
    """
    Run CURAND_CMD against a training image on a GPU instance.

    Skipped for TF1 and MXNet images.

    :param training: training image URI
    :param ec2_connection: connection object forwarded to execute_ec2_training_test
    :param gpu_only: not used in the body (presumably a pytest selection fixture)
    """
    unsupported_image = is_tf_version("1", training) or "mxnet" in training
    if unsupported_image:
        pytest.skip("Test is not configured for TF1 and MXNet")

    execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
def test_cuda_paths(gpu):
    """
    Test to ensure that:
    a. buildspec contains an entry to create the same image as the image URI
    b. directory structure for GPU Dockerfiles has framework version, python version, and cuda version in it

    A missing buildspec entry fails the test only in the DLC CI/CD context. In any other build
    context a warning is logged and the test ends, because there is no Dockerfile path to verify.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip("Skipping Example Dockerfiles which are not explicitly tied to a cuda version")

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    framework, framework_version = get_framework_and_version_from_tag(image)

    # Get cuda, framework version, python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_short_version = None
    python_version = re.search(r"(py\d+)", image).group(1)
    short_python_version = None
    image_tag = re.search(
        r":(\d+(\.\d+){2}(-transformers\d+(\.\d+){2})?-(gpu)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)((-e3)?-example|-e3|-sagemaker)?)",
        image,
    ).group(1)

    # replacing '_' by '/' to handle huggingface_<framework> case
    framework_path = framework.replace("_", "/")
    framework_version_path = os.path.join(dlc_path, framework_path, job_type, "docker", framework_version)
    if not os.path.exists(framework_version_path):
        # Fall back to the major.minor directory; the dot is escaped to match a literal "."
        framework_short_version = re.match(r"(\d+\.\d+)", framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework_path, job_type, "docker", framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path, python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        short_python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    image_tag_in_buildspec = False
    dockerfile_spec_abs_path = None
    buildspec_path = os.path.join(dlc_path, framework_path, buildspec)
    buildspec_def = Buildspec()
    buildspec_def.load(buildspec_path)

    docker_dir_prefix = "docker/"
    for _, image_spec in buildspec_def["images"].items():
        if image_spec["device_type"] == "gpu" and image_spec["tag"] == image_tag:
            image_tag_in_buildspec = True
            # Remove the leading "docker/" as a prefix. str.lstrip would strip any run of the
            # characters d, o, c, k, e, r and "/" and could eat into the following path component.
            docker_file = image_spec["docker_file"]
            if docker_file.startswith(docker_dir_prefix):
                docker_file = docker_file[len(docker_dir_prefix):]
            dockerfile_spec_abs_path = os.path.join(os.path.dirname(framework_version_path), docker_file)
            break

    try:
        assert image_tag_in_buildspec, f"Image tag {image_tag} not found in {buildspec_path}"
    except AssertionError as e:
        if not is_dlc_cicd_context():
            LOGGER.warning(
                f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
            )
            # No buildspec entry means no Dockerfile path; stop here instead of failing with a
            # TypeError on the membership checks below.
            return
        else:
            raise

    image_properties_expected_in_dockerfile_path = [
        framework_short_version or framework_version,
        short_python_version or python_version,
        cuda_version,
    ]
    assert all(prop in dockerfile_spec_abs_path for prop in image_properties_expected_in_dockerfile_path), (
        f"Dockerfile location {dockerfile_spec_abs_path} does not contain all the image properties in "
        f"{image_properties_expected_in_dockerfile_path}"
    )

    assert os.path.exists(dockerfile_spec_abs_path), f"Cannot find dockerfile for {image} in {dockerfile_spec_abs_path}"
def test_cuda_paths(gpu):
    """
    Test to ensure that:
    a. buildspec contains an entry to create the same image as the image URI
    b. directory structure for GPU Dockerfiles has framework version, python version, and cuda version in it

    A missing buildspec entry fails the test only in the DLC CI/CD context. In any other build
    context a warning is logged and the test ends, because there is no Dockerfile path to verify.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip("Skipping Example Dockerfiles which are not explicitly tied to a cuda version")

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = next((fw for fw in frameworks if fw in image), "")
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda, framework version, python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_version = re.search(r":(\d+(\.\d+){2})", image).group(1)
    framework_short_version = None
    python_version = re.search(r"(py\d+)", image).group(1)
    short_python_version = None
    image_tag = re.search(
        r":(\d+(\.\d+){2}-(cpu|gpu|neuron)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)(-example)?)", image
    ).group(1)

    framework_version_path = os.path.join(dlc_path, framework, job_type, "docker", framework_version)
    if not os.path.exists(framework_version_path):
        # Fall back to the major.minor directory; the dot is escaped to match a literal "."
        framework_short_version = re.match(r"(\d+\.\d+)", framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework, job_type, "docker", framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path, python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        short_python_version = python_version[:3]

    # Check buildspec for an image entry with the same tag as the image under test
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    image_tag_in_buildspec = False
    dockerfile_spec_abs_path = None
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    buildspec_def = Buildspec()
    buildspec_def.load(buildspec_path)

    docker_dir_prefix = "docker/"
    for _, image_spec in buildspec_def["images"].items():
        if image_spec["device_type"] == "gpu" and image_spec["tag"] == image_tag:
            image_tag_in_buildspec = True
            # Remove the leading "docker/" as a prefix. str.lstrip would strip any run of the
            # characters d, o, c, k, e, r and "/" and could eat into the following path component.
            docker_file = image_spec["docker_file"]
            if docker_file.startswith(docker_dir_prefix):
                docker_file = docker_file[len(docker_dir_prefix):]
            dockerfile_spec_abs_path = os.path.join(os.path.dirname(framework_version_path), docker_file)
            break

    try:
        # The lookup is by image tag, so the message reports the tag rather than a CUDA_VERSION entry
        assert image_tag_in_buildspec, f"Image tag {image_tag} not found in {buildspec_path}"
    except AssertionError as e:
        if not is_dlc_cicd_context():
            LOGGER.warning(
                f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
            )
            # No buildspec entry means no Dockerfile path; stop here instead of failing with a
            # TypeError on the membership checks below.
            return
        else:
            raise

    image_properties_expected_in_dockerfile_path = [
        framework_short_version or framework_version,
        short_python_version or python_version,
        cuda_version,
    ]
    assert all(prop in dockerfile_spec_abs_path for prop in image_properties_expected_in_dockerfile_path), (
        f"Dockerfile location {dockerfile_spec_abs_path} does not contain all the image properties in "
        f"{image_properties_expected_in_dockerfile_path}"
    )

    assert os.path.exists(dockerfile_spec_abs_path), f"Cannot find dockerfile for {image} in {dockerfile_spec_abs_path}"