def test_performance_ec2_pytorch_inference_graviton_cpu(pytorch_inference_graviton, ec2_connection, region, cpu_only):
    _, framework_version = get_framework_and_version_from_tag(pytorch_inference_graviton)
    threshold = get_threshold_for_image(framework_version, PYTORCH_INFERENCE_CPU_THRESHOLD)
    ec2_performance_pytorch_inference(
        pytorch_inference_graviton, "cpu", ec2_connection, region, PT_PERFORMANCE_INFERENCE_CPU_CMD, threshold
    )
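# For context on the `get_threshold_for_image(framework_version, <THRESHOLD_TABLE>)` pattern used
# throughout these tests: a minimal sketch of what such a lookup could look like, assuming the
# threshold tables are dicts keyed by version-specifier strings. The helper name and table layout
# below are illustrative assumptions, not this module's actual implementation.
def _get_threshold_for_image_sketch(framework_version, threshold_table):
    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    # Return the threshold of the first entry whose specifier matches the image's framework version.
    for specifier, threshold in threshold_table.items():
        if Version(framework_version) in SpecifierSet(specifier):
            return threshold
    raise KeyError(f"No threshold configured for framework version {framework_version}")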
def test_performance_ec2_tensorflow_inference_graviton_cpu(
    tensorflow_inference_graviton, ec2_connection, ec2_instance_ami, region, cpu_only
):
    _, framework_version = get_framework_and_version_from_tag(tensorflow_inference_graviton)
    threshold = get_threshold_for_image(framework_version, TENSORFLOW_INFERENCE_CPU_THRESHOLD)
    ec2_performance_tensorflow_inference(
        tensorflow_inference_graviton, "cpu", ec2_connection, ec2_instance_ami, region, threshold
    )
def test_performance_tensorflow_gpu_imagenet(tensorflow_training, ec2_connection, gpu_only, tf2_only):
    _, framework_version = get_framework_and_version_from_tag(tensorflow_training)
    threshold = get_threshold_for_image(framework_version, TENSORFLOW_TRAINING_GPU_IMAGENET_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_GPU_IMAGENET_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="imagenet",
        threshold={"Throughput": threshold},
    )
def test_performance_tensorflow_cpu(tensorflow_training, ec2_connection, cpu_only):
    _, framework_version = get_framework_and_version_from_tag(tensorflow_training)
    threshold = get_threshold_for_image(framework_version, TENSORFLOW_TRAINING_CPU_SYNTHETIC_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_CPU_SYNTHETIC_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="synthetic",
        threshold={"Throughput": threshold},
    )
def test_performance_ec2_tensorflow_inference_cpu(tensorflow_inference, ec2_connection, ec2_instance_ami, region, cpu_only):
    _, framework_version = get_framework_and_version_from_tag(tensorflow_inference)
    if Version(framework_version) == Version("2.4.1"):
        pytest.skip("This test times out, and needs to be run manually.")
    threshold = get_threshold_for_image(framework_version, TENSORFLOW_INFERENCE_CPU_THRESHOLD)
    ec2_performance_tensorflow_inference(
        tensorflow_inference, "cpu", ec2_connection, ec2_instance_ami, region, threshold
    )
def test_performance_ec2_mxnet_inference_cpu(mxnet_inference, ec2_connection, cpu_only, py3_only):
    _, framework_version = get_framework_and_version_from_tag(mxnet_inference)
    threshold = get_threshold_for_image(framework_version, MXNET_INFERENCE_CPU_IMAGENET_THRESHOLD)
    execute_ec2_inference_performance_test(
        ec2_connection,
        mxnet_inference,
        MX_PERFORMANCE_INFERENCE_CPU_CMD,
        post_process=post_process_mxnet_ec2_performance,
        data_source="imagenet",
        threshold={"Throughput": threshold},
    )
def test_performance_ec2_mxnet_training_cpu(mxnet_training, ec2_connection, cpu_only):
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    threshold = get_threshold_for_image(framework_version, MXNET_TRAINING_CPU_CIFAR_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        mxnet_training,
        MX_PERFORMANCE_TRAINING_CPU_CMD,
        post_process=post_process_mxnet_ec2_performance,
        data_source="cifar10",
        threshold={"Throughput": threshold},
    )
def test_performance_pytorch_gpu_synthetic(pytorch_training, ec2_connection, gpu_only, py3_only):
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)
    threshold = get_threshold_for_image(framework_version, PYTORCH_TRAINING_GPU_SYNTHETIC_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        pytorch_training,
        PT_PERFORMANCE_TRAINING_GPU_SYNTHETIC_CMD,
        post_process=post_process_pytorch_gpu_py3_synthetic_ec2_training_performance,
        data_source="synthetic",
        threshold={"Throughput": threshold},
    )
def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test(connection, ecr_uri, test_cmd, region=DEFAULT_REGION):
    _, framework_version = get_framework_and_version_from_tag(ecr_uri)
    threshold = get_threshold_for_image(framework_version, PYTORCH_TRAINING_GPU_IMAGENET_THRESHOLD)
    repo_name, image_tag = ecr_uri.split("/")[-1].split(":")
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    container_name = f"{repo_name}-performance-{image_tag}-ec2"

    # Make sure we are logged into ECR so we can pull the image
    connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    connection.run(f"nvidia-docker pull -q {ecr_uri}")

    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    log_name = f"imagenet_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt"
    log_location = os.path.join(container_test_local_dir, "benchmark", "logs", log_name)

    # Run training command, display benchmark results to console
    try:
        connection.run(
            f"nvidia-docker run --user root "
            f"-e LOG_FILE={os.path.join(os.sep, 'test', 'benchmark', 'logs', log_name)} "
            f"-e PR_CONTEXT={1 if is_pr_context() else 0} "
            f"--shm-size 8G --env OMP_NUM_THREADS=1 --name {container_name} "
            f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} "
            f"-v /home/ubuntu/:/root/:delegated "
            f"{ecr_uri} {os.path.join(os.sep, 'bin', 'bash')} -c {test_cmd}"
        )
    finally:
        connection.run(f"docker rm -f {container_name}", warn=True, hide=True)

    ec2_performance_upload_result_to_s3_and_validate(
        connection,
        ecr_uri,
        log_location,
        "imagenet",
        {"Cost": threshold},
        post_process_pytorch_gpu_py3_imagenet_ec2_training_performance,
        log_name,
    )
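# A minimal, hypothetical caller for the helper above, mirroring the other EC2 performance tests in
# this module. The fixtures follow this module's conventions; the command below is a placeholder
# assumption standing in for whichever PyTorch imagenet benchmark command the suite actually defines.
def test_performance_pytorch_gpu_imagenet_sketch(pytorch_training, ec2_connection, gpu_only, py3_only):
    placeholder_imagenet_cmd = "<pytorch-imagenet-benchmark-command>"  # assumption, not a real command
    execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test(
        ec2_connection, pytorch_training, placeholder_imagenet_cmd
    )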
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor
    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    # Upload the benchmark log to S3 regardless of pass/fail, so results are always inspectable.
    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    result_statement, throughput = _print_results_of_test(os.path.join(test_dir, log_file), processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    threshold_table = (
        (TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD if num_nodes == 1 else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD)
        if processor == "cpu"
        else TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD
        if num_nodes == 1
        else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD
    )
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region, gpu_only, py3_only):
    """
    Run MX sagemaker training performance test

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_mx_train_env' -- this installs
    some prerequisite packages, pulls the required script, and creates a virtualenv called sm_benchmark_venv.

    The training script mxnet_imagenet_resnet50.py is invoked via a shell script smtrain-resnet50-imagenet.sh.
    The shell script sets num-epochs to 40. This parameter is configurable.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file
    TODO: Change latency [time/epoch] metric to Throughput metric

    :param mxnet_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
    py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
    ec2_instance_type = "p3.16xlarge"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 90m python mx_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {mxnet_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not run_out.ok:
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    # Upload the benchmark log to S3 regardless of pass/fail, so results are always inspectable.
    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}",
        warn=True,
        echo=True,
    )

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, time_val, accuracy = _print_results_of_test(os.path.join(test_dir, log_file))

    accuracy_threshold = get_threshold_for_image(framework_version, MXNET_TRAINING_GPU_IMAGENET_ACCURACY_THRESHOLD)
    assert accuracy > accuracy_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {accuracy} does not reach the threshold accuracy {accuracy_threshold}"
    )

    time_threshold = get_threshold_for_image(framework_version, MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD)
    assert time_val < time_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {time_val} exceeds the threshold latency {time_threshold}"
    )