def ec2_performance_tensorflow_inference(image_uri, processor, ec2_connection, region):
    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"
    python_version = "py2" if "py2" in image_uri else "py3"
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    tf_version = "1" if is_tf1(image_uri) else "2"
    tf_api_version = "1.15" if tf_version == "1" else "2.1.0rc1"
    tf_version_folder = "1.15" if tf_version == "1" else "2.1"
    processor_folder = "CPU-WITH-MKL" if processor == "cpu" else "GPU"

    # Make sure we are logged into ECR so we can pull the image
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    # Run performance inference command, display benchmark results to console
    ec2_connection.run(
        f"pip install boto3 grpcio tensorflow-serving-api=={tf_api_version} --user --no-warn-script-location"
    )
    ec2_connection.sudo(
        f"aws s3 cp s3://tensorflow-aws/{tf_version_folder}/Serving/{processor_folder}/tensorflow_model_server /usr/bin/"
    )
    ec2_connection.sudo("chmod +x /usr/bin/tensorflow_model_server")
    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    log_file = f"inference_benchmark_results_{commit_info}_{time_str}.log"
    ec2_connection.run(
        f"python {container_test_local_dir}/bin/benchmark/tf{tf_version}_serving_perf.py "
        f"--processor {processor} --docker_image_name {image_uri} --run_all_s3 "
        f"--binary /usr/bin/tensorflow_model_server --get_perf --iterations 1000 "
        f"2>&1 | tee {log_file}"
    )
    ec2_connection.run(
        f"echo Benchmark Results: >&2;"
        f"echo Tensorflow{tf_version} Inference {processor} {python_version} >&2"
    )
    ec2_connection.run(f"tail {log_file} >&2")
    ec2_connection.run(
        f"aws s3 cp {log_file} "
        f"{BENCHMARK_RESULTS_S3_BUCKET}/tensorflow{tf_version}/ec2/inference/{processor}/{python_version}/{log_file}"
    )
    ec2_connection.run(
        f"echo To retrieve complete benchmark log, check "
        f"s3://dlinfra-dlc-cicd-performance/tensorflow{tf_version}/ec2/inference/{processor}/{python_version}/{log_file} >&2"
    )

def ec2_performance_tensorflow_inference(image_uri, processor, ec2_connection, region, threshold):
    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    tf_version = "1" if is_tf1(image_uri) else "2"
    tf_api_version = "1.15" if tf_version == "1" else "2.3.0"

    # Make sure we are logged into ECR so we can pull the image
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    # Run performance inference command, display benchmark results to console
    ec2_connection.run("pip3 install -U pip")
    ec2_connection.run(
        f"pip3 install boto3 grpcio tensorflow-serving-api=={tf_api_version} --user --no-warn-script-location"
    )
    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    log_file = f"synthetic_{commit_info}_{time_str}.log"
    ec2_connection.run(
        f"python3 {container_test_local_dir}/bin/benchmark/tf{tf_version}_serving_perf.py "
        f"--processor {processor} --docker_image_name {image_uri} --run_all_s3 "
        f"--binary /usr/bin/tensorflow_model_server --get_perf --iterations 1000 "
        f"2>&1 | tee {log_file}"
    )
    ec2_performance_upload_result_to_s3_and_validate(
        ec2_connection,
        image_uri,
        log_file,
        "synthetic",
        threshold,
        post_process_inference,
        log_file,
    )

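# The refactored helper above delegates result handling to
# ec2_performance_upload_result_to_s3_and_validate, passing post_process_inference
# as a log-parsing callback. The real callback is defined elsewhere in the repo;
# the version below is only a minimal hypothetical sketch, assuming the callback
# receives the remote connection and the log path and returns a {metric: value}
# dict that the validator compares against the threshold.
import re


def post_process_inference(connection, log_location):
    # Hypothetical sketch: pull "Throughput: <n>" lines out of the benchmark log
    # and report the mean. The repo's actual parsing logic may differ.
    log_content = connection.run(f"cat {log_location}").stdout
    throughputs = [float(m) for m in re.findall(r"Throughput:\s*([\d.]+)", log_content)]
    return {"Throughput": sum(throughputs) / len(throughputs) if throughputs else 0.0}
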
def test_smdebug_gpu(training, ec2_connection, region, gpu_only, py3_only):
    # TODO: Remove this once test timeout has been debugged (failures especially on p2.8xlarge)
    if is_tf2(training) and "2.3.0" in training and "p2.8xlarge" in SMDEBUG_EC2_GPU_INSTANCE_TYPE:
        pytest.skip("Currently skipping for TF2.3.0 on p2.8xlarge until the issue is fixed")
    if is_tf1(training):
        pytest.skip("Currently skipping for TF1 until the issue is fixed")
    run_smdebug_test(
        training,
        ec2_connection,
        region,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
    )

def test_tensorflow_keras_horovod_amp(tensorflow_training, ec2_connection, gpu_only):
    if is_tf1(tensorflow_training) or is_tf20(tensorflow_training):
        # https://github.com/tensorflow/tensorflow/issues/33484#issuecomment-555299647
        pytest.skip("This test is for TF2.1 and later only")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_KERAS_HVD_CMD_AMP)

def test_performance_ec2_tensorflow_inference_cpu(tensorflow_inference, ec2_connection, region, cpu_only):
    threshold = (
        TENSORFLOW1_INFERENCE_CPU_THRESHOLD
        if is_tf1(tensorflow_inference)
        else TENSORFLOW2_INFERENCE_CPU_THRESHOLD
    )
    ec2_performance_tensorflow_inference(tensorflow_inference, "cpu", ec2_connection, region, threshold)

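# is_tf1/is_tf2 gate nearly every test in this section. Their real implementations
# live in the shared test_utils module; the sketch below is only a hypothetical
# approximation, assuming the image URI carries the framework version in its tag
# (e.g. ...tensorflow-training:1.15.2-gpu-py3).
import re


def is_tf1(image_uri):
    # Hypothetical sketch: treat any tensorflow image whose tag major version is 1 as TF1
    if "tensorflow" not in image_uri:
        return False
    version_match = re.search(r":(\d+)\.(\d+)", image_uri)
    return bool(version_match) and version_match.group(1) == "1"
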
def test_tensorflow_with_horovod_gpu(tensorflow_training, ec2_connection, gpu_only):
    test_script = TF1_HVD_CMD if is_tf1(tensorflow_training) else TF2_HVD_CMD
    execute_ec2_training_test(
        ec2_connection,
        tensorflow_training,
        test_script,
        large_shm="p2.8xlarge" in TF_EC2_GPU_INSTANCE_TYPE,
    )

def test_eks_tensorflow_multi_node_training_gpu(tensorflow_training, example_only):
    # EKS multinode tests are failing on the TF1 pipeline due to scheduling issues.
    # TODO: Remove this skip and add the required scheduling scheme.
    if is_tf1(tensorflow_training):
        pytest.skip("Skipping on TF1 currently as it is not able to schedule the pods properly")
    eks_cluster_size = "3"
    ec2_instance_type = "p3.16xlarge"
    eks_gpus_per_worker = ec2_utils.get_instance_num_gpus(instance_type=ec2_instance_type)
    _run_eks_tensorflow_multinode_training_resnet50_mpijob(
        tensorflow_training, eks_cluster_size, eks_gpus_per_worker
    )

def test_smdebug_cpu(training, ec2_connection, region, cpu_only, py3_only): # TODO: Remove this once test timeout has been debugged (failures especially on m4.16xlarge) if is_tf1(training): pytest.skip("Currently skipping for TF1 until the issue is fixed") test_script = SMDEBUG_SCRIPT framework = get_framework_from_image_uri(training) container_test_local_dir = os.path.join("$HOME", "container_tests") ec2_connection.run( f"$(aws ecr get-login --no-include-email --region {region})", hide=True) ec2_connection.run( f"docker run --name smdebug-cpu -v {container_test_local_dir}:{os.path.join(os.sep, 'test')} -itd {training}", hide=True, ) ec2_connection.run( f"docker exec --user root smdebug-cpu /bin/bash -c '{test_script} {framework}'", hide=True, )
def test_smdebug_gpu(training, ec2_connection, region, gpu_only, py3_only):
    # p2.8xlarge and m4.16xlarge TF1 pipeline tests are failing for an unknown reason.
    # TODO: Remove this skip and provide the required solution.
    if is_tf1(training) and SMDEBUG_EC2_GPU_INSTANCE_TYPE == "p2.8xlarge":
        pytest.skip("Currently skipping for TF1 until the issue is fixed")
    test_script = SMDEBUG_SCRIPT
    framework = get_framework_from_image_uri(training)
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    ec2_connection.run(
        f"nvidia-docker run --name smdebug-gpu -v "
        f"{container_test_local_dir}:{os.path.join(os.sep, 'test')} -itd {training}",
        hide=True,
    )
    ec2_connection.run(
        f"nvidia-docker exec --user root smdebug-gpu /bin/bash -c '{test_script} {framework}'",
        hide=True,
    )

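# The two inlined smdebug tests above share everything except the docker
# executable and container name, which is what the refactored run_smdebug_test
# helper (called by the slimmer smdebug variants earlier in this section) factors
# out. The sketch below is a plausible reconstruction assembled from the two
# bodies above, not necessarily the repo's exact implementation.
def run_smdebug_test(image_uri, ec2_connection, region, docker_executable="docker", container_name="smdebug"):
    test_script = SMDEBUG_SCRIPT
    framework = get_framework_from_image_uri(image_uri)
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    ec2_connection.run(
        f"{docker_executable} run --name {container_name} "
        f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} -itd {image_uri}",
        hide=True,
    )
    ec2_connection.run(
        f"{docker_executable} exec --user root {container_name} /bin/bash -c '{test_script} {framework}'",
        hide=True,
    )
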
def test_tensorflow_dataservice_gpu(tensorflow_training, ec2_connection, gpu_only):
    if below_tf23(tensorflow_training) or is_tf1(tensorflow_training):
        pytest.skip("This test is for TF2.3 and higher")
    run_data_service_test(ec2_connection, tensorflow_training)

def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection, cpu_only):
    test_script = TF1_HVD_CMD if is_tf1(tensorflow_training) else TF2_HVD_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, test_script)

def test_tensorflow_opencv_cpu(tensorflow_training, ec2_connection, cpu_only):
    if is_tf1(tensorflow_training):
        pytest.skip("This test is for TF2 only")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_OPENCV_CMD)

def test_tensorflow_keras_horovod_fp32(tensorflow_training, ec2_connection, gpu_only):
    if is_tf1(tensorflow_training):
        pytest.skip("This test is for TF2 and later only")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_KERAS_HVD_CMD_FP32)

def test_tensorflow_standalone_cpu(tensorflow_training, ec2_connection, cpu_only):
    test_script = TF1_STANDALONE_CMD if is_tf1(tensorflow_training) else TF2_STANDALONE_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, test_script)

def test_smdebug_cpu(training, ec2_connection, region, cpu_only, py3_only): # TODO: Remove this once test timeout has been debugged (failures especially on m4.16xlarge) if is_tf1(training): pytest.skip("Currently skipping for TF1 until the issue is fixed") run_smdebug_test(training, ec2_connection, region)
def test_cuda_paths(gpu):
    """
    Test to ensure directory structure for GPU Dockerfiles has the cuda version in it

    :param gpu: gpu image uri
    """
    image = gpu
    if "example" in image:
        pytest.skip("Skipping Example Dockerfiles which are not explicitly tied to a cuda version")

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = ""
    for fw in frameworks:
        if fw in image:
            framework = fw
            break
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda version, framework version, and python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_version = re.search(r":(\d+(\.\d+){2})", image).group(1)
    python_version = re.search(r"(py\d+)", image).group(1)

    framework_version_path = os.path.join(dlc_path, framework, job_type, "docker", framework_version)
    if not os.path.exists(os.path.join(framework_version_path, python_version)):
        # Use the pyX version as opposed to the pyXY version if the pyXY path does not exist
        python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf1(image):
        buildspec = "buildspec-tf1.yml"

    cuda_in_buildspec = False
    cuda_in_buildspec_ref = f"CUDA_VERSION {cuda_version}"
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    with open(buildspec_path, "r") as bf:
        for line in bf:
            if cuda_in_buildspec_ref in line:
                cuda_in_buildspec = True
                break

    try:
        assert cuda_in_buildspec, f"Can't find {cuda_in_buildspec_ref} in {buildspec_path}"
    except AssertionError as e:
        if not is_dlc_cicd_context():
            LOGGER.warning(
                f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
            )
        else:
            raise

    # Check that a Dockerfile exists in the right directory
    dockerfile_path = os.path.join(framework_version_path, python_version, cuda_version, "Dockerfile.gpu")
    assert os.path.exists(dockerfile_path), f"Cannot find dockerfile for image {image} in {dockerfile_path}"

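# A quick illustration of what the three regexes in test_cuda_paths extract, run
# against a representative (hypothetical) image URI:
import re

uri = "123456789012.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:2.3.0-gpu-py37-cu102-ubuntu18.04"
print(re.search(r"-(cu\d+)-", uri).group(1))         # cu102
print(re.search(r":(\d+(\.\d+){2})", uri).group(1))  # 2.3.0
print(re.search(r"(py\d+)", uri).group(1))           # py37
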
def test_tensorflow_dataservice_gpu(tensorflow_training, ec2_connection, gpu_only):
    if is_tf1(tensorflow_training):
        pytest.skip("This test is for TF2 only")
    run_data_service_test(ec2_connection, tensorflow_training)

def test_curand_gpu(training, ec2_connection, gpu_only):
    if is_tf1(training) or "mxnet" in training:
        pytest.skip("Test is not configured for TF1 and MXNet")
    execute_ec2_training_test(ec2_connection, training, CURAND_CMD)

def test_tensorflow_tensorboard_cpu(tensorflow_training, ec2_connection, cpu_only):
    if is_tf1(tensorflow_training):
        pytest.skip("This test is for TF2 only")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_TENSORBOARD_CMD)

def test_dlc_major_version_dockerfiles(image):
    """
    Test to make sure semantic versioning scheme in Dockerfiles is correct

    :param image: <str> ECR image URI
    """
    dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
    job_type = test_utils.get_job_type_from_image(image)
    framework, fw_version = test_utils.get_framework_and_version_from_tag(image)
    processor = test_utils.get_processor_from_image_uri(image)

    # Assign a string of numbers associated with python version in tag. Python major version is not sufficient to
    # define DLC major version
    python_major_minor_version = re.search(r"-py(\d{2,})", image).group(1)

    root_dir = os.path.join(dlc_dir, framework, job_type, "docker")

    # Skip older FW versions that did not use this versioning scheme
    references = {
        "tensorflow2": "2.2.0",
        "tensorflow1": "1.16.0",
        "mxnet": "1.7.0",
        "pytorch": "1.5.0",
    }
    if test_utils.is_tf1(image):
        reference_fw = "tensorflow1"
    elif test_utils.is_tf2(image):
        reference_fw = "tensorflow2"
    else:
        reference_fw = framework
    if processor != "eia" and packaging.version.parse(fw_version) < packaging.version.parse(references[reference_fw]):
        pytest.skip(
            f"Not enforcing new versioning scheme on old image {image}. "
            f"Started enforcing version scheme on the following: {references}"
        )

    # Find all Dockerfile.<processor> for this framework/job_type's Major.Minor version
    dockerfiles = []
    fw_version_major_minor = re.match(r"(\d+\.\d+)", fw_version).group(1)
    for root, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename == f"Dockerfile.{processor}":
                dockerfile_path = os.path.join(root_dir, root, filename)
                if "example" not in dockerfile_path and f"{os.sep}{fw_version_major_minor}" in dockerfile_path:
                    dockerfiles.append(dockerfile_path)

    # For the collected dockerfiles above, note the DLC major versions in each Dockerfile if python version matches
    # the current image under test
    versions = {}
    dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
    python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")
    for dockerfile in dockerfiles:
        with open(dockerfile, "r") as df:
            dlc_version = None
            python_version = None
            for line in df:
                major_version_match = dlc_label_regex.match(line)
                python_version_match = python_version_regex.match(line)
                if major_version_match:
                    dlc_version = int(major_version_match.group(1))
                elif python_version_match:
                    python_version = python_version_match.group(1).replace(".", "")

        # Raise errors if dlc major version label and python version arg are not found in Dockerfile
        if not dlc_version:
            raise DLCMajorVersionLabelNotFound(f"Cannot find dlc_major_version label in {dockerfile}")
        if not python_version:
            raise DLCPythonVersionNotFound(f"Cannot find PYTHON_VERSION arg in {dockerfile}")
        if python_version == python_major_minor_version:
            versions[dockerfile] = dlc_version

    expected_versions = list(range(1, len(dockerfiles) + 1))
    actual_versions = sorted(versions.values())

    # Test case explicitly for TF2.3 gpu, since v1.0 is banned
    if (framework, fw_version_major_minor, processor, python_major_minor_version, job_type) == (
        "tensorflow",
        "2.3",
        "gpu",
        "37",
        "training",
    ):
        expected_versions = [v + 1 for v in expected_versions]
        assert 1 not in actual_versions, (
            f"DLC v1.0 is deprecated in TF2.3 gpu containers, but found major version 1 "
            f"in one of the Dockerfiles. Please inspect {versions}"
        )

    # Note: If, for example, we find 3 dockerfiles with the same framework major/minor version, same processor,
    # and same python major/minor version, we will expect DLC major versions 1, 2, and 3. If an exception needs to be
    # made to this rule, please see the above handling of TF2.3 as an example.
    assert actual_versions == expected_versions, (
        f"Found DLC major versions {actual_versions} but expected {expected_versions} for "
        f"{framework} {job_type} {processor}. Full version info: {versions}. Py version: {python_major_minor_version}"
    )

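# The skip gate in test_dlc_major_version_dockerfiles relies on
# packaging.version.parse for proper semantic-version comparison rather than
# string comparison. A minimal standalone illustration:
import packaging.version

# Version-aware comparison: 1.15.2 is older than 1.16.0 ...
assert packaging.version.parse("1.15.2") < packaging.version.parse("1.16.0")
# ... but newer than 1.2.0, which naive string comparison gets backwards.
assert packaging.version.parse("1.15.2") > packaging.version.parse("1.2.0")
assert "1.15.2" < "1.2.0"  # lexicographic ordering is wrong here
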
def test_tensorflow_addons_cpu(tensorflow_training, ec2_connection, cpu_only):
    if is_tf1(tensorflow_training):
        pytest.skip("This test is for TF2 only")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_ADDONS_CMD)