def ec2_performance_tensorflow_inference(image_uri, processor, ec2_connection, region):
    """
    Run the TensorFlow Serving performance benchmark on a remote host and upload the log to S3

    All work is done through shell commands issued on ``ec2_connection``: the image is pulled,
    the client dependencies and the ``tensorflow_model_server`` binary are installed, the
    ``tf<version>_serving_perf.py`` script is run, and its log is copied to the benchmark bucket.

    :param image_uri: <str> ECR image URI of the inference image under test
    :param processor: <str> "gpu" selects nvidia-docker; "cpu" selects the CPU-WITH-MKL model server
    :param ec2_connection: object exposing ``run`` and ``sudo`` for executing remote shell commands
    :param region: <str> AWS region passed to the ECR login command
    """
    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"
    python_version = "py2" if "py2" in image_uri else "py3"
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    tf_version = "1" if is_tf1(image_uri) else "2"
    tf_api_version = "1.15" if tf_version == "1" else "2.1.0rc1"
    tf_version_folder = "1.15" if tf_version == "1" else "2.1"
    processor_folder = "CPU-WITH-MKL" if processor == "cpu" else "GPU"

    # Make sure we are logged into ECR so we can pull the image
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)

    ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ")

    # Install the serving client libraries and the model server binary used by the benchmark script
    ec2_connection.run(
        f"pip install boto3 grpcio tensorflow-serving-api=={tf_api_version} --user --no-warn-script-location"
    )
    ec2_connection.sudo(
        f"aws s3 cp s3://tensorflow-aws/{tf_version_folder}/Serving/{processor_folder}/tensorflow_model_server "
        f"/usr/bin/"
    )
    ec2_connection.sudo("chmod +x /usr/bin/tensorflow_model_server")

    # The log name embeds the commit (the literal "None" when the env var is unset) and a timestamp
    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    log_file = f"inference_benchmark_results_{commit_info}_{time_str}.log"

    # Run performance inference command, display benchmark results to console
    ec2_connection.run(
        f"python {container_test_local_dir}/bin/benchmark/tf{tf_version}_serving_perf.py "
        f"--processor {processor} --docker_image_name {image_uri} --run_all_s3 "
        f"--binary /usr/bin/tensorflow_model_server --get_perf --iterations 1000 "
        f"2>&1 | tee {log_file}"
    )
    ec2_connection.run(
        f"echo Benchmark Results: >&2;"
        f"echo Tensorflow{tf_version} Inference {processor} {python_version} >&2"
    )
    ec2_connection.run(f"tail {log_file} >&2")

    # Build the destination once so the upload and the retrieval hint can never point at different buckets
    s3_location = (
        f"{BENCHMARK_RESULTS_S3_BUCKET}/tensorflow{tf_version}/ec2/inference/"
        f"{processor}/{python_version}/{log_file}"
    )
    ec2_connection.run(f"aws s3 cp {log_file} {s3_location}")
    ec2_connection.run(f"echo To retrieve complete benchmark log, check {s3_location} >&2")
Пример #2
0
def ec2_performance_tensorflow_inference(image_uri, processor, ec2_connection,
                                         region, threshold):
    """
    Run the synthetic TensorFlow Serving benchmark remotely, then upload and validate the result

    :param image_uri: <str> ECR image URI of the inference image under test
    :param processor: <str> "gpu" selects nvidia-docker, anything else plain docker
    :param ec2_connection: object exposing ``run`` for executing remote shell commands
    :param region: <str> AWS region passed to the ECR login command
    :param threshold: forwarded unchanged to ec2_performance_upload_result_to_s3_and_validate
    """
    if processor == "gpu":
        docker_cmd = "nvidia-docker"
    else:
        docker_cmd = "docker"
    tests_dir = os.path.join("$HOME", "container_tests")
    tf_version = "1" if is_tf1(image_uri) else "2"
    serving_api_version = "1.15" if tf_version == "1" else "2.3.0"

    # Log into ECR first, otherwise the pull below is rejected
    ecr_login_cmd = f"$(aws ecr get-login --no-include-email --region {region})"
    ec2_connection.run(ecr_login_cmd, hide=True)

    ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ")

    # Install the client-side dependencies of the benchmark script
    ec2_connection.run("pip3 install -U pip")
    ec2_connection.run(
        f"pip3 install boto3 grpcio tensorflow-serving-api=={serving_api_version}"
        f" --user --no-warn-script-location")

    # The log name embeds the commit (from the environment) and a timestamp
    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    log_file = f"synthetic_{commit}_{timestamp}.log"

    # Run the benchmark; tee keeps the output on the console and in the log file
    benchmark_cmd = (
        f"python3 {tests_dir}/bin/benchmark/tf{tf_version}_serving_perf.py"
        f" --processor {processor} --docker_image_name {image_uri}"
        f" --run_all_s3 --binary /usr/bin/tensorflow_model_server"
        f" --get_perf --iterations 1000"
        f" 2>&1 | tee {log_file}")
    ec2_connection.run(benchmark_cmd)

    # log_file is deliberately passed twice, exactly as the helper is called elsewhere
    ec2_performance_upload_result_to_s3_and_validate(
        ec2_connection, image_uri, log_file, "synthetic", threshold,
        post_process_inference, log_file)
def test_smdebug_gpu(training, ec2_connection, region, gpu_only, py3_only):
    """
    Run the smdebug test through nvidia-docker in a container named "smdebug-gpu"

    :param training: <str> training image URI
    :param ec2_connection: remote connection forwarded to run_smdebug_test
    :param region: <str> AWS region forwarded to run_smdebug_test
    :param gpu_only: not referenced in the body (presumably a selection fixture)
    :param py3_only: not referenced in the body (presumably a selection fixture)
    """
    # TODO: Remove this once test timeout has been debugged (failures especially on p2.8xlarge)
    skip_reason = None
    if is_tf2(training) and "2.3.0" in training and "p2.8xlarge" in SMDEBUG_EC2_GPU_INSTANCE_TYPE:
        skip_reason = "Currently skipping for TF2.3.0 on p2.8xlarge until the issue is fixed"
    elif is_tf1(training):
        skip_reason = "Currently skipping for TF1 until the issue is fixed"
    if skip_reason is not None:
        pytest.skip(skip_reason)

    run_smdebug_test(
        training,
        ec2_connection,
        region,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
    )
def test_tensorflow_keras_horovod_amp(tensorflow_training, ec2_connection,
                                      gpu_only):
    """
    Execute TF_KERAS_HVD_CMD_AMP against the training image; skipped for TF1 and TF2.0 images

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to execute_ec2_training_test
    :param gpu_only: not referenced in the body (presumably a selection fixture)
    """
    # See https://github.com/tensorflow/tensorflow/issues/33484#issuecomment-555299647
    too_old = is_tf1(tensorflow_training) or is_tf20(tensorflow_training)
    if too_old:
        pytest.skip("This test is for TF2.1 and later only")

    execute_ec2_training_test(
        ec2_connection, tensorflow_training, TF_KERAS_HVD_CMD_AMP)
Пример #5
0
def test_performance_ec2_tensorflow_inference_cpu(tensorflow_inference,
                                                  ec2_connection, region,
                                                  cpu_only):
    """
    Run the CPU inference benchmark with the threshold matching the image's TF major version

    :param tensorflow_inference: <str> TensorFlow inference image URI
    :param ec2_connection: remote connection forwarded to the benchmark runner
    :param region: <str> AWS region forwarded to the benchmark runner
    :param cpu_only: not referenced in the body (presumably a selection fixture)
    """
    if is_tf1(tensorflow_inference):
        threshold = TENSORFLOW1_INFERENCE_CPU_THRESHOLD
    else:
        threshold = TENSORFLOW2_INFERENCE_CPU_THRESHOLD

    ec2_performance_tensorflow_inference(
        tensorflow_inference, "cpu", ec2_connection, region, threshold)
Пример #6
0
def test_tensorflow_with_horovod_gpu(tensorflow_training, ec2_connection,
                                     gpu_only):
    """
    Execute the TF1 or TF2 Horovod command, chosen by the image's TF major version

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to execute_ec2_training_test
    :param gpu_only: not referenced in the body (presumably a selection fixture)
    """
    if is_tf1(tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD

    # large_shm is requested only when the configured GPU instance type mentions p2.8xlarge
    needs_large_shm = "p2.8xlarge" in TF_EC2_GPU_INSTANCE_TYPE
    execute_ec2_training_test(
        ec2_connection, tensorflow_training, hvd_cmd, large_shm=needs_large_shm)
def test_eks_tensorflow_multi_node_training_gpu(tensorflow_training,
                                                example_only):
    """
    Launch the multi-node ResNet50 MPIJob for the training image; skipped for TF1 images

    :param tensorflow_training: <str> TensorFlow training image URI
    :param example_only: not referenced in the body (presumably a selection fixture)
    """
    # EKS multinode are failing on TF1 Pipeline due to scheduling issues.
    # TODO: Remove this line and add the required scheduling scheme.
    if is_tf1(tensorflow_training):
        pytest.skip("Skipping it on TF1 currently as it is not able to do the pods scheduling properly")

    cluster_size = "3"
    # GPUs per worker are looked up for this instance type
    gpus_per_worker = ec2_utils.get_instance_num_gpus(instance_type="p3.16xlarge")

    _run_eks_tensorflow_multinode_training_resnet50_mpijob(
        tensorflow_training, cluster_size, gpus_per_worker)
def test_smdebug_cpu(training, ec2_connection, region, cpu_only, py3_only):
    """
    Start a detached "smdebug-cpu" container from the image and run SMDEBUG_SCRIPT inside it as root

    :param training: <str> training image URI
    :param ec2_connection: object exposing ``run`` for executing remote shell commands
    :param region: <str> AWS region passed to the ECR login command
    :param cpu_only: not referenced in the body (presumably a selection fixture)
    :param py3_only: not referenced in the body (presumably a selection fixture)
    """
    # TODO: Remove this once test timeout has been debugged (failures especially on m4.16xlarge)
    if is_tf1(training):
        pytest.skip("Currently skipping for TF1 until the issue is fixed")

    smdebug_script = SMDEBUG_SCRIPT
    framework = get_framework_from_image_uri(training)
    host_tests_dir = os.path.join("$HOME", "container_tests")
    # The host test directory is mounted at /test (os.sep + "test") inside the container
    container_mount = os.path.join(os.sep, "test")

    login_cmd = f"$(aws ecr get-login --no-include-email --region {region})"
    ec2_connection.run(login_cmd, hide=True)

    start_cmd = f"docker run --name smdebug-cpu -v {host_tests_dir}:{container_mount} -itd {training}"
    ec2_connection.run(start_cmd, hide=True)

    exec_cmd = f"docker exec --user root smdebug-cpu /bin/bash -c '{smdebug_script} {framework}'"
    ec2_connection.run(exec_cmd, hide=True)
def test_smdebug_gpu(training, ec2_connection, region, gpu_only, py3_only):
    """
    Start a detached "smdebug-gpu" container via nvidia-docker and run SMDEBUG_SCRIPT inside it as root

    :param training: <str> training image URI
    :param ec2_connection: object exposing ``run`` for executing remote shell commands
    :param region: <str> AWS region passed to the ECR login command
    :param gpu_only: not referenced in the body (presumably a selection fixture)
    :param py3_only: not referenced in the body (presumably a selection fixture)
    """
    # p2.8xlarge and m4.16xlarge TF1 Pipeline Test are failing for unknown reason.
    # TODO: Remove this line and provide the required solution.
    tf1_on_p2 = is_tf1(training) and SMDEBUG_EC2_GPU_INSTANCE_TYPE == "p2.8xlarge"
    if tf1_on_p2:
        pytest.skip("Currently skipping for TF1 until the issue is fixed")

    smdebug_script = SMDEBUG_SCRIPT
    framework = get_framework_from_image_uri(training)
    host_tests_dir = os.path.join("$HOME", "container_tests")
    # The host test directory is mounted at /test (os.sep + "test") inside the container
    container_mount = os.path.join(os.sep, "test")

    login_cmd = f"$(aws ecr get-login --no-include-email --region {region})"
    ec2_connection.run(login_cmd, hide=True)

    start_cmd = f"nvidia-docker run --name smdebug-gpu -v {host_tests_dir}:{container_mount} -itd {training}"
    ec2_connection.run(start_cmd, hide=True)

    exec_cmd = f"nvidia-docker exec --user root smdebug-gpu /bin/bash -c '{smdebug_script} {framework}'"
    ec2_connection.run(exec_cmd, hide=True)
Пример #10
0
def test_tensorflow_dataservice_gpu(tensorflow_training, ec2_connection,
                                    gpu_only):
    """
    Run the data service test for the training image; skipped when below_tf23 or is_tf1 reports true

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to run_data_service_test
    :param gpu_only: not referenced in the body (presumably a selection fixture)
    """
    unsupported = below_tf23(tensorflow_training) or is_tf1(tensorflow_training)
    if unsupported:
        pytest.skip("This test is for TF2.3 and higher")

    run_data_service_test(ec2_connection, tensorflow_training)
def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection,
                                     cpu_only):
    """
    Execute the TF1 or TF2 Horovod command, chosen by the image's TF major version

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to execute_ec2_training_test
    :param cpu_only: not referenced in the body (presumably a selection fixture)
    """
    if is_tf1(tensorflow_training):
        hvd_cmd = TF1_HVD_CMD
    else:
        hvd_cmd = TF2_HVD_CMD

    execute_ec2_training_test(ec2_connection, tensorflow_training, hvd_cmd)
def test_tensorflow_opencv_cpu(tensorflow_training, ec2_connection, cpu_only):
    """
    Execute TF_OPENCV_CMD against the training image; skipped for TF1 images

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to execute_ec2_training_test
    :param cpu_only: not referenced in the body (presumably a selection fixture)
    """
    tf1_image = is_tf1(tensorflow_training)
    if tf1_image:
        pytest.skip("This test is for TF2 only")

    execute_ec2_training_test(
        ec2_connection, tensorflow_training, TF_OPENCV_CMD)
def test_tensorflow_keras_horovod_fp32(tensorflow_training, ec2_connection,
                                       gpu_only):
    """
    Execute TF_KERAS_HVD_CMD_FP32 against the training image; skipped for TF1 images

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to execute_ec2_training_test
    :param gpu_only: not referenced in the body (presumably a selection fixture)
    """
    tf1_image = is_tf1(tensorflow_training)
    if tf1_image:
        pytest.skip("This test is for TF2 and later only")

    execute_ec2_training_test(
        ec2_connection, tensorflow_training, TF_KERAS_HVD_CMD_FP32)
def test_tensorflow_standalone_cpu(tensorflow_training, ec2_connection,
                                   cpu_only):
    """
    Execute the TF1 or TF2 standalone command, chosen by the image's TF major version

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to execute_ec2_training_test
    :param cpu_only: not referenced in the body (presumably a selection fixture)
    """
    if is_tf1(tensorflow_training):
        standalone_cmd = TF1_STANDALONE_CMD
    else:
        standalone_cmd = TF2_STANDALONE_CMD

    execute_ec2_training_test(ec2_connection, tensorflow_training, standalone_cmd)
def test_smdebug_cpu(training, ec2_connection, region, cpu_only, py3_only):
    """
    Run the smdebug test with run_smdebug_test's default docker settings; skipped for TF1 images

    :param training: <str> training image URI
    :param ec2_connection: remote connection forwarded to run_smdebug_test
    :param region: <str> AWS region forwarded to run_smdebug_test
    :param cpu_only: not referenced in the body (presumably a selection fixture)
    :param py3_only: not referenced in the body (presumably a selection fixture)
    """
    # TODO: Remove this once test timeout has been debugged (failures especially on m4.16xlarge)
    tf1_image = is_tf1(training)
    if tf1_image:
        pytest.skip("Currently skipping for TF1 until the issue is fixed")

    run_smdebug_test(
        training,
        ec2_connection,
        region,
    )
Пример #16
0
def test_cuda_paths(gpu):
    """
    Test to ensure directory structure for GPU Dockerfiles has cuda version in it

    Two things are checked for the image's cuda version (e.g. "cu102", parsed from the URI):
    the framework buildspec mentions "CUDA_VERSION <cuda>", and a Dockerfile.gpu exists under
    <framework>/<job_type>/docker/<framework_version>/<python_version>/<cuda>/.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip(
            "Skipping Example Dockerfiles which are not explicitly tied to a cuda version"
        )

    # Repository root is everything before the first "/test/" component of the working directory
    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = next((fw for fw in frameworks if fw in image), "")
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda, framework version, python version through regex
    cuda_match = re.search(r"-(cu\d+)-", image)
    # The dots are escaped so that only a literal X.Y.Z tag is accepted
    framework_version_match = re.search(r":(\d+(\.\d+){2})", image)
    python_match = re.search(r"(py\d+)", image)
    # Fail with a readable message instead of an AttributeError on a None match
    assert cuda_match and framework_version_match and python_match, (
        f"Cannot parse cuda, framework and python versions from image uri {image}"
    )
    cuda_version = cuda_match.group(1)
    framework_version = framework_version_match.group(1)
    python_version = python_match.group(1)

    framework_version_path = os.path.join(dlc_path, framework, job_type,
                                          "docker", framework_version)
    if not os.path.exists(os.path.join(framework_version_path,
                                       python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec-tf1.yml" if is_tf1(image) else "buildspec.yml"
    cuda_in_buildspec_ref = f"CUDA_VERSION {cuda_version}"
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    with open(buildspec_path, "r") as bf:
        cuda_in_buildspec = any(cuda_in_buildspec_ref in line for line in bf)

    if not cuda_in_buildspec:
        failure = f"Can't find {cuda_in_buildspec_ref} in {buildspec_path}"
        # A missing buildspec entry only fails the test inside the DLC CI/CD context;
        # in every other build context it is downgraded to a warning.
        if is_dlc_cicd_context():
            raise AssertionError(failure)
        LOGGER.warning(
            f"{failure} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
        )

    # Check that a Dockerfile exists in the right directory
    dockerfile_path = os.path.join(framework_version_path, python_version,
                                   cuda_version, "Dockerfile.gpu")

    assert os.path.exists(
        dockerfile_path
    ), f"Cannot find dockerfile for image {image} in {dockerfile_path}"
def test_tensorflow_dataservice_gpu(tensorflow_training, ec2_connection,
                                    gpu_only):
    """
    Run the data service test for the training image; skipped for TF1 images

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to run_data_service_test
    :param gpu_only: not referenced in the body (presumably a selection fixture)
    """
    tf1_image = is_tf1(tensorflow_training)
    if tf1_image:
        pytest.skip("This test is for TF2 only")

    run_data_service_test(
        ec2_connection,
        tensorflow_training,
    )
Пример #18
0
def test_curand_gpu(training, ec2_connection, gpu_only):
    """
    Execute CURAND_CMD against the training image; skipped for TF1 images and URIs containing "mxnet"

    :param training: <str> training image URI
    :param ec2_connection: remote connection forwarded to execute_ec2_training_test
    :param gpu_only: not referenced in the body (presumably a selection fixture)
    """
    not_configured = is_tf1(training) or "mxnet" in training
    if not_configured:
        pytest.skip("Test is not configured for TF1 and MXNet")

    execute_ec2_training_test(
        ec2_connection,
        training,
        CURAND_CMD,
    )
Пример #19
0
def test_tensorflow_tensorboard_cpu(tensorflow_training, ec2_connection,
                                    cpu_only):
    """
    Execute TF_TENSORBOARD_CMD against the training image; skipped for TF1 images

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to execute_ec2_training_test
    :param cpu_only: not referenced in the body (presumably a selection fixture)
    """
    tf1_image = is_tf1(tensorflow_training)
    if tf1_image:
        pytest.skip("This test is for TF2 only")

    execute_ec2_training_test(
        ec2_connection, tensorflow_training, TF_TENSORBOARD_CMD)
Пример #20
0
def test_dlc_major_version_dockerfiles(image):
    """
    Test to make sure semantic versioning scheme in Dockerfiles is correct

    Every Dockerfile.<processor> for the image's framework Major.Minor version is collected, its
    dlc_major_version label and PYTHON_VERSION arg are read, and the labels of the Dockerfiles whose
    python version matches the image are expected to be exactly 1..N.

    :param image: <str> ECR image URI
    :raises DLCMajorVersionLabelNotFound: a collected Dockerfile has no dlc_major_version label
    :raises DLCPythonVersionNotFound: a collected Dockerfile has no PYTHON_VERSION arg
    """
    dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
    job_type = test_utils.get_job_type_from_image(image)
    framework, fw_version = test_utils.get_framework_and_version_from_tag(
        image)
    processor = test_utils.get_processor_from_image_uri(image)

    # Assign a string of numbers associated with python version in tag. Python major version is not sufficient to
    # define DLC major version
    python_major_minor_version = re.search(r"-py(\d{2,})", image).group(1)

    root_dir = os.path.join(dlc_dir, framework, job_type, "docker")

    # Skip older FW versions that did not use this versioning scheme
    references = {
        "tensorflow2": "2.2.0",
        "tensorflow1": "1.16.0",
        "mxnet": "1.7.0",
        "pytorch": "1.5.0",
    }
    if test_utils.is_tf1(image):
        reference_fw = "tensorflow1"
    elif test_utils.is_tf2(image):
        reference_fw = "tensorflow2"
    else:
        reference_fw = framework
    if processor != "eia" and packaging.version.parse(
            fw_version) < packaging.version.parse(references[reference_fw]):
        pytest.skip(
            f"Not enforcing new versioning scheme on old image {image}. "
            f"Started enforcing version scheme on the following: {references}")

    # Find all Dockerfile.<processor> for this framework/job_type's Major.Minor version
    fw_version_major_minor = re.match(r"(\d+\.\d+)", fw_version).group(1)
    dockerfile_name = f"Dockerfile.{processor}"
    version_dir_marker = f"{os.sep}{fw_version_major_minor}"
    dockerfiles = []
    for root, _, filenames in os.walk(root_dir):
        if dockerfile_name not in filenames:
            continue
        # os.walk already yields `root` prefixed with root_dir, so it must not be joined onto root_dir again
        dockerfile_path = os.path.join(root, dockerfile_name)
        if "example" not in dockerfile_path and version_dir_marker in dockerfile_path:
            dockerfiles.append(dockerfile_path)

    # For the collected dockerfiles above, note the DLC major versions in each Dockerfile if python version matches
    # the current image under test
    versions = {}
    dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
    python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")
    for dockerfile in dockerfiles:
        dlc_version = None
        python_version = None
        with open(dockerfile, "r") as df:
            # If a label or arg appears more than once, the last occurrence wins
            for line in df:
                major_version_match = dlc_label_regex.match(line)
                python_version_match = python_version_regex.match(line)
                if major_version_match:
                    dlc_version = int(major_version_match.group(1))
                elif python_version_match:
                    python_version = python_version_match.group(1).replace(
                        ".", "")

        # Raise errors if dlc major version label and python version arg are not found in Dockerfile
        if dlc_version is None:
            raise DLCMajorVersionLabelNotFound(
                f"Cannot find dlc_major_version label in {dockerfile}")
        if python_version is None:
            raise DLCPythonVersionNotFound(
                f"Cannot find PYTHON_VERSION arg in {dockerfile}")
        if python_version == python_major_minor_version:
            versions[dockerfile] = dlc_version

    # Only Dockerfiles matching the image's python version take part in the 1..N sequence, so the expectation
    # is sized from `versions`; sizing it from every collected Dockerfile could never match when a
    # Major.Minor directory holds Dockerfiles for more than one python version.
    expected_versions = list(range(1, len(versions) + 1))
    actual_versions = sorted(versions.values())

    # Test case explicitly for TF2.3 gpu, since v1.0 is banned
    image_key = (framework, fw_version_major_minor, processor,
                 python_major_minor_version, job_type)
    if image_key == ("tensorflow", "2.3", "gpu", "37", "training"):
        expected_versions = [v + 1 for v in expected_versions]
        assert 1 not in actual_versions, (
            f"DLC v1.0 is deprecated in TF2.3 gpu containers, but found major version 1 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Note: If, for example, we find 3 dockerfiles with the same framework major/minor version, same processor,
    # and same python major/minor version, we will expect DLC major versions 1, 2, and 3. If an exception needs to be
    # made to this rule, please see the above handling of TF2.3 as an example.
    assert actual_versions == expected_versions, (
        f"Found DLC major versions {actual_versions} but expected {expected_versions} for "
        f"{framework} {job_type} {processor}. Full version info: {versions}. Py version: {python_major_minor_version}"
    )
Пример #21
0
def test_tensorflow_addons_cpu(tensorflow_training, ec2_connection, cpu_only):
    """
    Execute TF_ADDONS_CMD against the training image; skipped for TF1 images

    :param tensorflow_training: <str> TensorFlow training image URI
    :param ec2_connection: remote connection forwarded to execute_ec2_training_test
    :param cpu_only: not referenced in the body (presumably a selection fixture)
    """
    tf1_image = is_tf1(tensorflow_training)
    if tf1_image:
        pytest.skip("This test is for TF2 only")

    execute_ec2_training_test(
        ec2_connection, tensorflow_training, TF_ADDONS_CMD)