def test_dataclasses_check(image):
    """
    Ensure the dataclasses pip package is not installed for Python 3.7 and above.
    The Python version retrieved from the ECR image URI is expected in the format `py<major_version><minor_version>`

    :param image: ECR image URI
    """
    ctx = Context()
    pip_package = "dataclasses"

    container_name = get_container_name("dataclasses-check", image)

    python_version = get_python_version_from_image_uri(image).replace("py", "")
    python_version = int(python_version)

    if python_version >= 37:
        start_container(container_name, image, ctx)
        output = run_cmd_on_container(container_name, ctx, f"pip show {pip_package}", warn=True)
        if output.return_code == 0:
            pytest.fail(
                f"{pip_package} package exists in the DLC image {image} that has py{python_version} version which is greater than py36 version"
            )
        else:
            LOGGER.info(f"{pip_package} package does not exist in the DLC image {image}")
    else:
        pytest.skip(
            f"Skipping test for DLC image {image} that has py36 version as {pip_package} is not included in the python framework"
        )
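
# The helper get_python_version_from_image_uri used above is defined elsewhere in the repo. The sketch below is
# illustrative only (assumed behavior, hypothetical name): it returns the "py<major><minor>" token embedded in the
# image tag, which test_dataclasses_check then converts to an int such as 37.
def _get_python_version_from_image_uri_sketch(image_uri):
    """Hypothetical sketch: extract the py<major><minor> token (e.g. "py37") from a DLC image URI."""
    import re

    match = re.search(r"py\d+", image_uri)
    if not match:
        raise LookupError(f"Python version token not found in image URI {image_uri}")
    return match.group()  # e.g. "py37"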
def _print_results_of_test(file_path, processor):
    last_100_lines = Context().run(f"tail -100 {file_path}").stdout.split("\n")
    result = ""
    throughput = 0
    if processor == "cpu":
        for line in last_100_lines:
            if "Total img/sec on " in line:
                result = line + "\n"
                throughput = float(
                    re.search(r"(CPU\(s\):[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)", line).group("throughput")
                )
                break
    elif processor == "gpu":
        result_dict = dict()
        for line in last_100_lines:
            if "images/sec: " in line:
                key = line.split("<stdout>")[0]
                result_dict[key] = line.strip("\n")
                if throughput == 0:
                    throughput = float(
                        re.search(r"(images/sec:[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)", line).group("throughput")
                    )
        result = "\n".join(result_dict.values()) + "\n"

    LOGGER.info(result)
    return result, throughput
def _print_results_of_test(file_path, processor):
    result = ""
    throughput = 0
    if processor == "cpu":
        with open(file_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if "Total img/sec on " in line:
                    result = line + "\n"
                    throughput += float(
                        re.search(r"(CPU\(s\):[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)", line).group("throughput")
                    )
    elif processor == "gpu":
        # Calculate average throughput
        result_list, throughput_list = [], []
        with open(file_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if "images/sec: " in line:
                    result_list.append(line.strip("\n"))
                    throughput = float(
                        re.search(r"(images/sec:[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)", line).group("throughput")
                    )
                    throughput_list.append(throughput)
        result = "\n".join(result_list[-100:]) + "\n"
        if len(throughput_list) == 0:
            raise Exception(
                "Cannot find throughput lines. Looks like SageMaker job was not run successfully. Please check"
            )
        # Take average of last 100 throughput lines
        throughput = sum(throughput_list[-100:]) / len(throughput_list[-100:])

    LOGGER.info(result)
    return result, throughput
def test_tensorflow_sagemaker_training_performance(tensorflow_training, num_nodes, region):
    # This sleep has been inserted because all the parametrized training jobs are automatically created
    # by SageMaker with the same name, due to being started around the same time, and with the same image uri.
    time.sleep(random.Random(x=f"{tensorflow_training}{num_nodes}").random() * 60)

    framework_version = re.search(r"[1,2](\.\d+){2}", tensorflow_training).group()
    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
    )

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"> {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

        ctx.run(f"aws s3 cp {log_file} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
def _print_results_of_test(file_path, processor):
    last_100_lines = Context().run(f"tail -100 {file_path}").stdout.split("\n")
    result = ""
    if processor == "cpu":
        for line in last_100_lines:
            if "Total img/sec on " in line:
                result = line + "\n"
    elif processor == "gpu":
        result_dict = dict()
        for line in last_100_lines:
            if "images/sec: " in line:
                key = line.split("<stdout>")[0]
                result_dict[key] = line.strip("\n")
        result = "\n".join(result_dict.values()) + "\n"

    LOGGER.info(result)
    return result
def test_dlc_major_version_label(image, region):
    """
    Test to ensure that all DLC images have the LABEL "dlc_major_version"

    :param image: <str> Image URI
    :param region: <str> region where ECR repository holding the image resides
    :return:
    """
    ecr_client = boto3.client("ecr", region_name=region)

    image_repository, image_tag = get_repository_and_tag_from_image_uri(image)
    # Using "acceptedMediaTypes" on the batch_get_image request allows the returned image information to
    # provide the ECR Image Manifest in the specific format that we need, so that the image LABELS can be found
    # on the manifest. The default format does not return the image LABELs.
    response = ecr_client.batch_get_image(
        repositoryName=image_repository,
        imageIds=[{"imageTag": image_tag}],
        acceptedMediaTypes=["application/vnd.docker.distribution.manifest.v1+json"],
    )
    if not response.get("images"):
        raise KeyError(
            f"Failed to get images through ecr_client.batch_get_image response for image {image_repository}:{image_tag}"
        )
    elif not response["images"][0].get("imageManifest"):
        raise KeyError(f"imageManifest not found in ecr_client.batch_get_image response:\n{response['images']}")

    manifest_str = response["images"][0]["imageManifest"]
    # manifest_str is a json-format string
    manifest = json.loads(manifest_str)
    image_metadata = json.loads(manifest["history"][0]["v1Compatibility"])
    major_version = image_metadata["config"]["Labels"].get("dlc_major_version", None)

    assert major_version, f"{image} has no LABEL named 'dlc_major_version'. Please insert label."

    LOGGER.info(f"{image} has 'dlc_major_version' = {major_version}")
def test_canary_images_pullable(region):
    """
    Sanity test to verify canary specific functions
    """
    ctx = Context()
    frameworks = ("tensorflow", "mxnet", "pytorch")

    # Have a default framework to test on
    framework = "pytorch"
    for fw in frameworks:
        if fw in os.getenv("CODEBUILD_INITIATOR"):
            framework = fw
            break

    images = parse_canary_images(framework, region)
    login_to_ecr_registry(ctx, PUBLIC_DLC_REGISTRY, region)
    if not images:
        return

    for image in images.split(" "):
        ctx.run(f"docker pull -q {image}")
        LOGGER.info(f"Canary image {image} is available")
def _print_results_of_test(file_path):
    last_n_lines = Context().run(f"tail -500 {file_path}").stdout.split("\n")
    result_dict = dict()
    accuracy = 0
    time_cost = 0
    accuracy_key = "Train-accuracy"
    time_cost_key = "Time cost"
    reversed_log = reversed(last_n_lines)

    for line in reversed_log:
        if all(key in result_dict for key in ("Train-accuracy", "Time cost")):
            break
        if accuracy_key in line:
            if accuracy_key in result_dict:
                continue
            accuracy_str = line.split("=")[1]
            result_dict[accuracy_key] = accuracy_str
            accuracy = float(accuracy_str)
        if time_cost_key in line:
            if time_cost_key in result_dict:
                continue
            time_str = line.split("=")[1]
            result_dict[time_cost_key] = time_str
            time_cost = float(time_str)

    result = "\n".join(result_dict.values()) + "\n"
    LOGGER.info(f'Result is {result}')
    LOGGER.info(f'{accuracy_key} is {accuracy}')
    LOGGER.info(f'{time_cost_key} is {time_cost}')
    return result, time_cost, accuracy
def run_sm_profiler_tests(image, profiler_tests_dir, test_file, processor):
    """
    Testrunner to execute SM profiler tests from DLC repo
    """
    ctx = Context()

    # Install profiler requirements only once - pytest-rerunfailures has a known issue
    # with the latest pytest https://github.com/pytest-dev/pytest-rerunfailures/issues/128
    try:
        ctx.run(
            "pip install -r "
            "https://raw.githubusercontent.com/awslabs/sagemaker-debugger/master/config/profiler/requirements.txt && "
            "pip install smdebug && "
            "pip uninstall -y pytest-rerunfailures",
            hide=True,
        )
    except UnexpectedExit:
        # Wait a minute and a half if we get an invoke failure - since smprofiler test requirements can be flaky
        time.sleep(90)

    framework, version = get_framework_and_version_from_tag(image)

    # Conditionally set sm data parallel tests, based on config file rules from link below:
    # https://github.com/awslabs/sagemaker-debugger/tree/master/config/profiler
    enable_sm_data_parallel_tests = "true"
    if framework == "pytorch" and Version(version) < Version("1.6"):
        enable_sm_data_parallel_tests = "false"
    if framework == "tensorflow" and Version(version) < Version("2.3"):
        enable_sm_data_parallel_tests = "false"

    # Set SMProfiler specific environment variables
    smprof_configs = {
        "use_current_branch": "false",
        "enable_smdataparallel_tests": enable_sm_data_parallel_tests,
        "force_run_tests": "false",
        "framework": framework,
        "build_type": "release",
    }

    # Command to set all necessary environment variables
    export_cmd = " && ".join(f"export {key}={val}" for key, val in smprof_configs.items())
    export_cmd = (
        f"{export_cmd} && export ENV_CPU_TRAIN_IMAGE=test && export ENV_GPU_TRAIN_IMAGE=test && "
        f"export ENV_{processor.upper()}_TRAIN_IMAGE={image}"
    )

    test_results_outfile = os.path.join(os.getcwd(), f"{get_container_name('smprof', image)}.txt")
    with ctx.prefix(f"cd {profiler_tests_dir}"):
        with ctx.prefix(f"cd sagemaker-tests && {export_cmd}"):
            try:
                ctx.run(
                    f"pytest --json-report --json-report-file={test_results_outfile} -n=auto "
                    f"-v -s -W=ignore tests/{test_file}::test_{processor}_jobs",
                    hide=True,
                )
                with open(test_results_outfile) as outfile:
                    result_data = json.load(outfile)
                    LOGGER.info(f"Tests passed on {image}; Results:\n{json.dumps(result_data, indent=4)}")
            except Exception as e:
                if os.path.exists(test_results_outfile):
                    with open(test_results_outfile) as outfile:
                        result_data = json.load(outfile)
                    raise SMProfilerRCTestFailure(
                        f"Failed SM Profiler tests. Results:\n{json.dumps(result_data, indent=4)}"
                    ) from e
                raise
def test_tensorflow_sagemaker_training_performance(tensorflow_training, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param tensorflow_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    framework_version = re.search(r"[1,2](\.\d+){2}", tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "  # trailing space added so the job name does not run into the redirection below
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    _print_results_of_test(os.path.join(test_dir, log_file), processor)

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
def test_oss_compliance(image):
    """
    Run OSS compliance check on a container to check if license attribution files exist,
    and upload source of third party packages to an S3 bucket.
    """
    THIRD_PARTY_SOURCE_CODE_BUCKET = "aws-dlinfra-licenses"
    THIRD_PARTY_SOURCE_CODE_BUCKET_PATH = "third_party_source_code"
    file = "THIRD_PARTY_SOURCE_CODE_URLS"
    container_name = get_container_name("oss_compliance", image)
    context = Context()
    local_repo_path = get_repository_local_path()
    start_container(container_name, image, context)

    # run compliance test to make sure license attribution files exist. testOSSCompliance is copied as part of Dockerfile
    run_cmd_on_container(container_name, context, "/usr/local/bin/testOSSCompliance /root")

    try:
        context.run(f"docker cp {container_name}:/root/{file} {os.path.join(local_repo_path, file)}")
    finally:
        context.run(f"docker rm -f {container_name}", hide=True)

    s3_resource = boto3.resource("s3")

    with open(os.path.join(local_repo_path, file)) as source_code_file:
        for line in source_code_file:
            name, version, url = line.split(" ")
            file_name = f"{name}_v{version}_source_code"
            s3_object_path = f"{THIRD_PARTY_SOURCE_CODE_BUCKET_PATH}/{file_name}.tar.gz"
            local_file_path = os.path.join(local_repo_path, file_name)

            # Retry the clone/archive step up to 3 times before giving up
            for i in range(3):
                try:
                    if not os.path.isdir(local_file_path):
                        context.run(f"git clone {url.rstrip()} {local_file_path}")
                    context.run(f"tar -czvf {local_file_path}.tar.gz {local_file_path}")
                except Exception as e:
                    time.sleep(1)
                    if i == 2:
                        LOGGER.error(f"Unable to clone git repo. Error: {e}")
                        raise
                    continue

            try:
                if os.path.exists(f"{local_file_path}.tar.gz"):
                    LOGGER.info(f"Uploading package to s3 bucket: {line}")
                    s3_resource.Object(THIRD_PARTY_SOURCE_CODE_BUCKET, s3_object_path).load()
            except botocore.exceptions.ClientError as e:
                if e.response["Error"]["Code"] == "404":
                    try:
                        # using aws cli as using boto3 expects to upload folder by iterating through each file instead of entire folder.
                        context.run(
                            f"aws s3 cp {local_file_path}.tar.gz s3://{THIRD_PARTY_SOURCE_CODE_BUCKET}/{s3_object_path}"
                        )
                        object = s3_resource.Bucket(THIRD_PARTY_SOURCE_CODE_BUCKET).Object(s3_object_path)
                        object.Acl().put(ACL="public-read")
                    except ClientError as e:
                        LOGGER.error(
                            f"Unable to upload source code to bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                        )
                        raise
                else:
                    LOGGER.error(
                        f"Unable to check if source code is present on bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                    )
                    raise
def test_ecr_scan(image, ecr_client, sts_client, region):
    """
    Run ECR Scan Tool on an image being tested, and raise Error if vulnerabilities found
    1. Start Scan.
    2. For 5 minutes (Run DescribeImages):
       (We run this for 5 minutes because the Scan is expected to complete in about 2 minutes, though no analysis
       has been performed on exactly how long the Scan takes for a DLC image. Therefore we also have a 3 minute
       buffer beyond the expected amount of time taken.)
    3.1. If imageScanStatus == COMPLETE: exit loop
    3.2. If imageScanStatus == IN_PROGRESS or AttributeNotFound(imageScanStatus): continue loop
    3.3. If imageScanStatus == FAILED: raise RuntimeError
    4. If DescribeImages.imageScanStatus != COMPLETE: raise TimeOutError
    5. assert imageScanFindingsSummary.findingSeverityCounts.HIGH/CRITICAL == 0

    :param image: str Image URI for image to be tested
    :param ecr_client: boto3 Client for ECR
    :param sts_client: boto3 Client for STS
    :param region: str Name of region where test is executed
    """
    test_account_id = sts_client.get_caller_identity().get("Account")
    image_account_id = get_account_id_from_image_uri(image)
    if image_account_id != test_account_id:
        image_repo_uri, image_tag = image.split(":")
        _, image_repo_name = image_repo_uri.split("/")
        target_image_repo_name = f"beta-{image_repo_name}"
        image = ecr_utils.reupload_image_to_test_ecr(image, target_image_repo_name, region)

    minimum_sev_threshold = get_minimum_sev_threshold_level(image)
    LOGGER.info(f"Severity threshold level is {minimum_sev_threshold}")

    run_scan(ecr_client, image)
    scan_results = ecr_utils.get_ecr_image_scan_results(ecr_client, image, minimum_vulnerability=minimum_sev_threshold)
    scan_results = ecr_utils.populate_ecr_scan_with_web_scraper_results(image, scan_results)

    ecr_image_vulnerability_list = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
    ecr_image_vulnerability_list.construct_allowlist_from_ecr_scan_result(scan_results)

    remaining_vulnerabilities = ecr_image_vulnerability_list

    # TODO: Once this feature is enabled, remove "if" condition and second assertion statement
    # TODO: Ensure this works on the canary tags before removing feature flag
    if is_image_covered_by_allowlist_feature(image):
        upgraded_image_vulnerability_list, image_scan_allowlist = fetch_other_vulnerability_lists(
            image, ecr_client, minimum_sev_threshold
        )
        s3_bucket_name = ECR_SCAN_HELPER_BUCKET

        ## In case new vulnerabilities are found conduct failure routine
        newly_found_vulnerabilities = ecr_image_vulnerability_list - image_scan_allowlist
        if newly_found_vulnerabilities:
            failure_routine_summary = conduct_failure_routine(
                image,
                image_scan_allowlist,
                ecr_image_vulnerability_list,
                upgraded_image_vulnerability_list,
                s3_bucket_name,
            )
            (
                s3_filename_for_fixable_list,
                s3_filename_for_non_fixable_list,
            ) = process_failure_routine_summary_and_store_data_in_s3(failure_routine_summary, s3_bucket_name)
        assert not newly_found_vulnerabilities, (
            f"""Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilities """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non fixable vulnerabilities. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )

        ## In case there is no new vulnerability but the allowlist is outdated conduct failure routine
        vulnerabilities_that_can_be_fixed = image_scan_allowlist - upgraded_image_vulnerability_list
        if vulnerabilities_that_can_be_fixed:
            failure_routine_summary = conduct_failure_routine(
                image,
                image_scan_allowlist,
                ecr_image_vulnerability_list,
                upgraded_image_vulnerability_list,
                s3_bucket_name,
            )
            (
                s3_filename_for_fixable_list,
                s3_filename_for_non_fixable_list,
            ) = process_failure_routine_summary_and_store_data_in_s3(failure_routine_summary, s3_bucket_name)
        assert not vulnerabilities_that_can_be_fixed, (
            f"""Allowlist is Outdated!! Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilities """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non fixable vulnerabilities. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )
        return

    common_ecr_scan_allowlist = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
    common_ecr_scan_allowlist_path = os.path.join(
        os.sep, get_repository_local_path(), "data", "common-ecr-scan-allowlist.json"
    )
    if os.path.exists(common_ecr_scan_allowlist_path):
        common_ecr_scan_allowlist.construct_allowlist_from_file(common_ecr_scan_allowlist_path)

    remaining_vulnerabilities = remaining_vulnerabilities - common_ecr_scan_allowlist

    if remaining_vulnerabilities:
        assert not remaining_vulnerabilities.vulnerability_list, (
            f"The following vulnerabilities need to be fixed on {image}:\n"
            f"{json.dumps(remaining_vulnerabilities.vulnerability_list, indent=4)}"
        )
def run_sm_perf_test(image_uri, xla, num_nodes, region, threshold=None):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param xla: [ True | False ] Enable XLA acceleration
    :param num_nodes: Number of nodes to run on
    :param region: AWS region

    This function was inspired by deep-learning-containers/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    processor = "xla" if xla else "gpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}"

    # TODO: Switch to p3.16xlarge when EC2 availability issues are resolved
    ec2_instance_type = "p3.8xlarge"
    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "xla", "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = (
        f"opt-tf{framework_version[0]}-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-optimized-tf{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"--xla-{'on' if xla else 'off'} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    result_statement, throughput = _print_results_of_test(os.path.join(test_dir, log_file))
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    if threshold:
        assert throughput > threshold, (
            f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} imagenet {num_nodes} nodes "
            f"Regression Benchmark Result {throughput} does not reach the threshold {threshold}"
        )
    return throughput
def test_resnet101_at_fp16(self, instance_type, num_gpus, total_n_gpus, instance_count, distribution_strategy,
                           caching, tensorflow_training, sagemaker_session, capsys, framework_version):
    epochs = int(100 * total_n_gpus)
    batches = np.array([224]) * total_n_gpus
    for batch in np.array(batches, dtype=int):
        train_steps = int(10240 * epochs / batch)
        steps_per_loop = train_steps // 10
        overrides = (
            f"runtime.enable_xla=True,"
            f"runtime.num_gpus={num_gpus},"
            f"runtime.distribution_strategy={distribution_strategy},"
            f"runtime.mixed_precision_dtype=float16,"
            f"task.train_data.global_batch_size={batch},"
            f"task.train_data.input_path=/opt/ml/input/data/training/validation*,"
            f"task.train_data.cache={caching},"
            f"trainer.train_steps={train_steps},"
            f"trainer.steps_per_loop={steps_per_loop},"
            f"trainer.summary_interval={steps_per_loop},"
            f"trainer.checkpoint_interval={train_steps},"
            f"task.model.backbone.type=resnet,"
            f"task.model.backbone.resnet.model_id=101"
        )
        estimator = TensorFlow(
            sagemaker_session=sagemaker_session,
            git_config={
                'repo': 'https://github.com/tensorflow/models.git',
                'branch': 'v2.9.2',
            },
            source_dir='.',
            entry_point='official/vision/train.py',
            model_dir=False,
            instance_type=instance_type,
            instance_count=instance_count,
            image_uri=tensorflow_training,
            hyperparameters={
                TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
                'experiment': 'resnet_imagenet',
                'config_file': 'official/vision/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml',
                'mode': 'train',
                'model_dir': '/opt/ml/model',
                'params_override': overrides,
            },
            debugger_hook_config=None,
            disable_profiler=True,
            max_run=60 * 60 * 1,  # Timeout in 1 hour
            base_job_name=f"tf{framework_version.replace('.','')}-trcomp-bench-resnet101",
            role="SageMakerRole",
        )
        estimator.fit(
            inputs='s3://collection-of-ml-datasets/Imagenet/TFRecords/validation',
            logs=True,
            wait=True,
        )

        captured = capsys.readouterr()
        logs = captured.out + captured.err
        match = re.search('Billable seconds: ([0-9]*)', logs)
        billable = int(match.group(1))

        short_version = '.'.join(framework_version.split('.')[:2])
        threshold = TRCOMP_THRESHOLD['tensorflow'][short_version]['resnet101'][instance_type][instance_count][batch]
        result = (
            f"tensorflow-trcomp {framework_version} resnet101 fp16 XLA "
            f"imagenet {instance_type} {instance_count} {batch} Billable: {billable} secs threshold: {threshold} secs "
            f"{estimator.latest_training_job.name}"
        )
        LOGGER.info(result)
        assert billable >= 1000, 'False Positive ' + result
        assert billable <= threshold, result
def test_tensorflow_sagemaker_training_performance(tensorflow_training, num_nodes, region):
    framework_version = re.search(r"[1,2](\.\d+){2}", tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "  # trailing space added so the job name does not run into the redirection below
            f"2>&1 > {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region, gpu_only, py3_only):
    """
    Run MX sagemaker training performance test

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_mx_train_env' -- this installs
    some prerequisite packages, pulls required script, and creates a virtualenv called sm_benchmark_venv.
    The training script mxnet_imagenet_resnet50.py is invoked via a shell script smtrain-resnet50-imagenet.sh
    The shell script sets num-epochs to 40. This parameter is configurable.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file
    TODO: Change latency [time/epoch] metric to Throughput metric

    :param mxnet_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
    py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
    ec2_instance_type = "p3.16xlarge"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 90m python mx_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {mxnet_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not run_out.ok:
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}",
        warn=True,
        echo=True,
    )

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, time_val, accuracy = _print_results_of_test(os.path.join(test_dir, log_file))

    accuracy_threshold = get_threshold_for_image(framework_version, MXNET_TRAINING_GPU_IMAGENET_ACCURACY_THRESHOLD)
    assert accuracy > accuracy_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {accuracy} does not reach the threshold accuracy {accuracy_threshold}"
    )

    time_threshold = get_threshold_for_image(framework_version, MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD)
    assert time_val < time_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {time_val} does not reach the threshold latency {time_threshold}"
    )
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "  # trailing space added so the job name does not run into the redirection below
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    result_statement, throughput = _print_results_of_test(os.path.join(test_dir, log_file), processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    threshold_table = (
        (TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD if num_nodes == 1 else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD)
        if processor == "cpu"
        else TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD
        if num_nodes == 1
        else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD
    )
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
def test_ecr_scan(image, ecr_client, sts_client, region):
    """
    Run ECR Scan Tool on an image being tested, and raise Error if vulnerabilities found
    1. Start Scan.
    2. For 5 minutes (Run DescribeImages):
       (We run this for 5 minutes because the Scan is expected to complete in about 2 minutes, though no analysis
       has been performed on exactly how long the Scan takes for a DLC image. Therefore we also have a 3 minute
       buffer beyond the expected amount of time taken.)
    3.1. If imageScanStatus == COMPLETE: exit loop
    3.2. If imageScanStatus == IN_PROGRESS or AttributeNotFound(imageScanStatus): continue loop
    3.3. If imageScanStatus == FAILED: raise RuntimeError
    4. If DescribeImages.imageScanStatus != COMPLETE: raise TimeOutError
    5. assert imageScanFindingsSummary.findingSeverityCounts.HIGH/CRITICAL == 0

    :param image: str Image URI for image to be tested
    :param ecr_client: boto3 Client for ECR
    :param sts_client: boto3 Client for STS
    :param region: str Name of region where test is executed
    """
    test_account_id = sts_client.get_caller_identity().get("Account")
    image_account_id = get_account_id_from_image_uri(image)
    image_region = get_region_from_image_uri(image)
    image_repo_name, original_image_tag = get_repository_and_tag_from_image_uri(image)
    additional_image_tags = get_all_the_tags_of_an_image_from_ecr(ecr_client, image)

    if not is_image_available_locally(image):
        LOGGER.info(f"Image {image} not available locally!! Pulling the image...")
        login_to_ecr_registry(Context(), image_account_id, image_region)
        run(f"docker pull {image}")
        if not is_image_available_locally(image):
            raise RuntimeError("Image shown as not available even after pulling")

    for additional_tag in additional_image_tags:
        image_uri_with_new_tag = image.replace(original_image_tag, additional_tag)
        run(f"docker tag {image} {image_uri_with_new_tag}", hide=True)

    if image_account_id != test_account_id:
        original_image = image
        target_image_repo_name = f"beta-{image_repo_name}"
        for additional_tag in additional_image_tags:
            image_uri_with_new_tag = original_image.replace(original_image_tag, additional_tag)
            new_image_uri = ecr_utils.reupload_image_to_test_ecr(image_uri_with_new_tag, target_image_repo_name, region)
            if image_uri_with_new_tag == original_image:
                image = new_image_uri

    minimum_sev_threshold = get_minimum_sev_threshold_level(image)
    LOGGER.info(f"Severity threshold level is {minimum_sev_threshold}")

    run_scan(ecr_client, image)
    scan_results = ecr_utils.get_ecr_image_scan_results(ecr_client, image, minimum_vulnerability=minimum_sev_threshold)
    scan_results = ecr_utils.populate_ecr_scan_with_web_scraper_results(image, scan_results)

    ecr_image_vulnerability_list = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
    ecr_image_vulnerability_list.construct_allowlist_from_ecr_scan_result(scan_results)

    remaining_vulnerabilities = ecr_image_vulnerability_list

    if not is_image_covered_by_allowlist_feature(image):
        if is_canary_context():
            pytest.skip("Skipping the test on the canary.")

        common_ecr_scan_allowlist = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
        common_ecr_scan_allowlist_path = os.path.join(
            os.sep, get_repository_local_path(), "data", "common-ecr-scan-allowlist.json"
        )
        if os.path.exists(common_ecr_scan_allowlist_path):
            common_ecr_scan_allowlist.construct_allowlist_from_file(common_ecr_scan_allowlist_path)

        remaining_vulnerabilities = remaining_vulnerabilities - common_ecr_scan_allowlist

        if remaining_vulnerabilities:
            assert not remaining_vulnerabilities.vulnerability_list, (
                f"The following vulnerabilities need to be fixed on {image}:\n"
                f"{json.dumps(remaining_vulnerabilities.vulnerability_list, indent=4)}"
            )
        return

    upgraded_image_vulnerability_list, image_scan_allowlist = fetch_other_vulnerability_lists(
        image, ecr_client, minimum_sev_threshold
    )
    s3_bucket_name = ECR_SCAN_HELPER_BUCKET

    ## In case new vulnerabilities (fixable or non-fixable) are found, then conduct failure routine
    newly_found_vulnerabilities = ecr_image_vulnerability_list - image_scan_allowlist
    # In case there is no new vulnerability but the allowlist is outdated
    vulnerabilities_that_can_be_fixed = image_scan_allowlist - upgraded_image_vulnerability_list

    if newly_found_vulnerabilities or vulnerabilities_that_can_be_fixed:
        failure_routine_summary = conduct_failure_routine(
            image,
            image_scan_allowlist,
            ecr_image_vulnerability_list,
            upgraded_image_vulnerability_list,
            s3_bucket_name,
        )
        (
            s3_filename_for_fixable_list,
            s3_filename_for_non_fixable_list,
        ) = process_failure_routine_summary_and_store_data_in_s3(failure_routine_summary, s3_bucket_name)
        prepend_message = "Found new vulnerabilities in image." if newly_found_vulnerabilities else "Allowlist is outdated."
        display_message = prepend_message + " " + (
            f"""Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilities """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non fixable vulnerabilities. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )
        if is_canary_context():
            LOGGER.error(display_message)
            pytest.skip("Skipping the test failure on the canary.")
        else:
            raise RuntimeError(display_message)