def run_smclarify_bias_metrics(
    image_uri,
    ec2_connection,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smclarify",
    test_script=SMCLARIFY_SCRIPT,
):
    """
    Run the SMClarify bias metrics test script inside a container started from the given image.

    The host directory ``$HOME/container_tests`` is mounted at ``/test`` in the container and
    ``test_script`` is executed there with ``python``.

    :param image_uri: str Image URI of the image under test
    :param ec2_connection: connection object whose ``run`` method executes shell commands remotely
    :param ec2_instance_type: str instance type, only used in the failure message
    :param docker_executable: str executable used for ``docker run``
    :param container_name: str name given to the test container
    :param test_script: str path of the test script as seen from inside the container
    :raises SMClarifyTestFailure: if the run fails and the container logs lack the success marker
    """
    host_test_dir = os.path.join("$HOME", "container_tests")

    login_to_ecr_registry(
        ec2_connection,
        get_account_id_from_image_uri(image_uri),
        get_region_from_image_uri(image_uri),
    )
    ec2_connection.run(f"docker pull -q {image_uri}")

    try:
        test_command = (
            f"{docker_executable} run --name {container_name} -v "
            f"{host_test_dir}:{os.path.join(os.sep, 'test')} {image_uri} "
            f"python {test_script}"
        )
        ec2_connection.run(test_command, hide=True, timeout=300)
    except Exception as e:
        # The remote command can error out even though the script finished; the container
        # logs are treated as the source of truth for the test outcome.
        container_logs = ec2_connection.run(f"docker logs {container_name}").stdout
        if "Test SMClarify Bias Metrics succeeded!" not in container_logs:
            raise SMClarifyTestFailure(
                f"SMClarify test failed on {image_uri} on {ec2_instance_type}. Full output:\n{container_logs}"
            ) from e
        LOGGER.warning(
            f"SMClarify test succeeded, but there is an issue with fabric. "
            f"Error:\n{e}\nTest output:\n{container_logs}"
        )
def _run_instance_role_disabled(image_uri, ec2_client, ec2_instance, ec2_connection):
    """
    Check that the image keeps working when the EC2 instance metadata endpoint is unreachable,
    and that the telemetry tag is not applied to the instance in that situation.

    :param image_uri: str Image URI of the image under test
    :param ec2_client: EC2 client used to read and delete instance tags
        (expected to be a boto3 EC2 client)
    :param ec2_instance: two-element sequence whose first element is the EC2 instance ID
    :param ec2_connection: connection object whose ``run`` method executes shell commands remotely
    :raises AssertionError: if the framework import fails (non TensorFlow-inference images) or if
        the telemetry tag is present on the instance afterwards
    """
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, _ = test_utils.get_repository_and_tag_from_image_uri(image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_bad_instance_role-ec2"
    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    # Make sure the tag is absent before the container starts, so that finding it
    # afterwards can only be caused by this run.
    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id, ec2_client=ec2_client)
    if expected_tag_key in preexisting_ec2_instance_tags:
        # The boto3 EC2 client exposes delete_tags; it has no remove_tags method.
        ec2_client.delete_tags(Resources=[ec2_instance_id], Tags=[{"Key": expected_tag_key}])

    # Disable access to EC2 instance metadata
    ec2_connection.run("sudo route add -host 169.254.169.254 reject")

    if "tensorflow" in framework and job_type == "inference":
        model_name = "saved_model_half_plus_two"
        model_base_path = test_utils.get_tensorflow_model_base_path(image_uri)
        env_vars_list = test_utils.get_tensorflow_inference_environment_variables(model_name, model_base_path)
        env_vars = " ".join(f"-e {entry['name']}={entry['value']}" for entry in env_vars_list)
        inference_command = get_tensorflow_inference_command_tf27_above(image_uri, model_name)
        ec2_connection.run(f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri} {inference_command}")
        # Give the container a few seconds to start before the tags are inspected.
        time.sleep(5)
    else:
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = "torch" if framework_to_import == "pytorch" else framework_to_import
        ec2_connection.run(f"{docker_cmd} run --name {container_name} -id {image_uri} bash")
        # warn=True keeps a failing import from raising here, so that the assertion
        # below reports it with a descriptive message instead.
        output = ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True,
        )
        assert output.ok, f"'import {framework_to_import}' fails when credentials not configured"

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id, ec2_client=ec2_client)
    assert expected_tag_key not in ec2_instance_tags, (
        f"{expected_tag_key} was applied as an instance tag. "
        "EC2 create_tags went through even though it should not have"
    )
def _run_tag_success(image_uri, ec2_client, ec2_instance, ec2_connection):
    """
    Check that running the image on an EC2 instance results in the telemetry tag being
    applied to that instance.

    :param image_uri: str Image URI of the image under test
    :param ec2_client: EC2 client used to read and delete instance tags
        (expected to be a boto3 EC2 client)
    :param ec2_instance: two-element sequence whose first element is the EC2 instance ID
    :param ec2_connection: connection object whose ``run`` method executes shell commands remotely
    :raises AssertionError: if the telemetry tag is not present on the instance afterwards
    """
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, _ = test_utils.get_repository_and_tag_from_image_uri(image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_tag_instance_success-ec2"
    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    # Remove a tag left over from an earlier run, so that finding it afterwards
    # proves that this run applied it.
    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id, ec2_client=ec2_client)
    if expected_tag_key in preexisting_ec2_instance_tags:
        # The boto3 EC2 client exposes delete_tags; it has no remove_tags method.
        ec2_client.delete_tags(Resources=[ec2_instance_id], Tags=[{"Key": expected_tag_key}])

    if framework == "tensorflow" and job_type == "inference":
        env_vars_list = ecs_utils.get_ecs_tensorflow_environment_variables(processor, "saved_model_half_plus_two")
        env_vars = " ".join(f"-e {entry['name']}={entry['value']}" for entry in env_vars_list)
        ec2_connection.run(f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri}")
        # Give the container a few seconds to start before the tags are inspected.
        time.sleep(5)
    else:
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = "torch" if framework_to_import == "pytorch" else framework_to_import
        ec2_connection.run(f"{docker_cmd} run --name {container_name} -id {image_uri} bash")
        # warn=True: a failing import does not raise here; the tag assertion below
        # decides the outcome of the test.
        ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True,
        )

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id, ec2_client=ec2_client)
    assert expected_tag_key in ec2_instance_tags, f"{expected_tag_key} was not applied as an instance tag"
def ec2_connection(request, ec2_instance, ec2_key_name, ec2_instance_type, region):
    """
    Fixture to establish connection with EC2 instance if necessary

    Besides opening the connection, the test artifacts are uploaded to S3, copied from there
    into ``$HOME/container_tests`` on the instance, and the S3 copy is removed on teardown.

    :param request: pytest test request
    :param ec2_instance: ec2_instance pytest fixture
    :param ec2_key_name: unique key name
    :param ec2_instance_type: ec2_instance_type pytest fixture
    :param region: Region where ec2 instance is launched
    :return: Fabric connection object
    """
    instance_id, instance_pem_file = ec2_instance
    if ec2_instance_type == "p3dn.24xlarge":
        # p3dn.24xlarge instances use a dedicated region constant
        region = P3DN_REGION

    ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    LOGGER.info(f"Instance ip_address: {ip_address}")
    user = ec2_utils.get_instance_user(instance_id, region=region)
    LOGGER.info(f"Connecting to {user}@{ip_address}")
    conn = Connection(
        user=user,
        host=ip_address,
        connect_kwargs={"key_filename": [instance_pem_file]},
        connect_timeout=18000,
    )

    # Seed with the current timestamp to get a per-invocation artifact folder name
    random.seed(datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"))
    artifact_folder = f"{ec2_key_name}-{random.randint(1, 100000)}-folder"
    s3_test_artifact_location = test_utils.upload_tests_to_s3(artifact_folder)

    # Remove the uploaded S3 copy once the requesting test is done
    request.addfinalizer(lambda: test_utils.delete_uploaded_tests_from_s3(s3_test_artifact_location))

    remote_test_dir = "$HOME/container_tests"
    conn.run(f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} {remote_test_dir}")
    conn.run(f"mkdir -p {remote_test_dir}/logs && chmod -R +x {remote_test_dir}/*")

    # Log into ECR if we are in canary context
    if test_utils.is_canary_context():
        test_utils.login_to_ecr_registry(conn, test_utils.PUBLIC_DLC_REGISTRY, region)

    return conn
def test_canary_images_pullable(region):
    """
    Sanity test to verify canary specific functions

    The framework is picked from the CODEBUILD_INITIATOR environment variable; when the variable
    is unset or names none of the known frameworks, "pytorch" is used.

    :param region: str region used to look up the canary images and to log in to the registry
    """
    ctx = Context()
    frameworks = ("tensorflow", "mxnet", "pytorch")

    # The variable may be unset; default to "" so that the membership test below
    # does not raise TypeError on None.
    initiator = os.getenv("CODEBUILD_INITIATOR", "")

    # Have a default framework to test on
    framework = next((fw for fw in frameworks if fw in initiator), "pytorch")

    images = parse_canary_images(framework, region)
    login_to_ecr_registry(ctx, PUBLIC_DLC_REGISTRY, region)
    for image in images.split(" "):
        ctx.run(f"docker pull {image}", hide=True)
def _reupload_image_to_test_ecr(source_image_uri, test_ecr_client, test_region, test_account_id):
    """
    Helper function to reupload an image owned by a different account to an ECR repo in this account, so that
    this account can freely run ECR Scan without permission issues.

    The target repository is named ``beta-<source repo name>`` and must already exist.

    :param source_image_uri: str Image URI for image to be tested, of the form <registry>/<repo>:<tag>
    :param test_ecr_client: boto3.Client ECR client for account where test is being run
    :param test_region: str Region where test is being run
    :param test_account_id: str Account ID for account where test is being run
    :return: str New image URI for re-uploaded image
    :raises ecr_utils.ECRRepoDoesNotExist: if the target repository does not exist
    :raises ValueError: if the URI does not contain exactly one ":" and one "/"
    """
    ctx = Context()
    image_account_id = get_account_id_from_image_uri(source_image_uri)
    image_region = get_region_from_image_uri(source_image_uri)
    login_to_ecr_registry(ctx, image_account_id, image_region)
    ctx.run(f"docker pull {source_image_uri}")

    image_repo_uri, image_tag = source_image_uri.split(":")
    image_registry, image_repo_name = image_repo_uri.split("/")
    test_image_repo_name = f"beta-{image_repo_name}"
    if not ecr_utils.ecr_repo_exists(test_ecr_client, test_image_repo_name):
        raise ecr_utils.ECRRepoDoesNotExist(
            f"Repo named {test_image_repo_name} does not exist in {test_region} on the account {test_account_id}"
        )

    # Rewrite only the registry host and rebuild the URI from its parts. Replacing substrings
    # over the whole URI would also alter the tag (or the registry) whenever the repo name,
    # region or account ID happens to occur there as well.
    test_registry = image_registry.replace(image_region, test_region).replace(image_account_id, test_account_id)
    test_image_uri = f"{test_registry}/{test_image_repo_name}:{image_tag}"

    ctx.run(f"docker tag {source_image_uri} {test_image_uri}")
    login_to_ecr_registry(ctx, test_account_id, test_region)
    ctx.run(f"docker push {test_image_uri}")

    return test_image_uri
def test_ecr_scan(image, ecr_client, sts_client, region):
    """
    Run ECR Scan Tool on an image being tested, and raise Error if vulnerabilities found

    Steps performed by this function:
    1. Make sure the image is available locally (pull it otherwise) and tag it locally with every
       additional tag returned by get_all_the_tags_of_an_image_from_ecr.
    2. If the image belongs to a different account than the one running the test, re-upload every
       tagged URI to the "beta-<repo name>" repository and continue with the re-uploaded URI.
    3. Start the scan (run_scan) and collect the findings at or above the minimum severity threshold.
    4. If the image is not covered by the allowlist feature: skip on canary, otherwise subtract the
       common allowlist (if the file exists) and assert that no vulnerabilities remain.
    5. Otherwise compare the findings with the image allowlist and the upgraded image findings; on
       new findings or an outdated allowlist, run the failure routine, store its summary in S3 and
       raise RuntimeError (on canary: log the message and skip instead).

    NOTE(review): an earlier version of this docstring described polling DescribeImages for up to
    5 minutes until imageScanStatus is COMPLETE; no such loop is visible here, presumably it lives
    in run_scan / ecr_utils -- confirm.

    :param image: str Image URI for image to be tested
    :param ecr_client: boto3 Client for ECR
    :param sts_client: boto3 Client for STS
    :param region: str Name of region where test is executed
    :raises RuntimeError: if the image is still unavailable after pulling, or if new vulnerabilities
        are found / the allowlist is outdated outside of canary context
    :raises AssertionError: if vulnerabilities remain after applying the common allowlist
    """
    test_account_id = sts_client.get_caller_identity().get("Account")
    image_account_id = get_account_id_from_image_uri(image)
    image_region = get_region_from_image_uri(image)
    image_repo_name, original_image_tag = get_repository_and_tag_from_image_uri(image)
    additional_image_tags = get_all_the_tags_of_an_image_from_ecr(ecr_client, image)

    if not is_image_available_locally(image):
        LOGGER.info(f"Image {image} not available locally!! Pulling the image...")
        login_to_ecr_registry(Context(), image_account_id, image_region)
        run(f"docker pull {image}")
        if not is_image_available_locally(image):
            raise RuntimeError("Image shown as not available even after pulling")

    # Tag the local image with every additional tag so each tagged URI exists locally
    for additional_tag in additional_image_tags:
        image_uri_with_new_tag = image.replace(original_image_tag, additional_tag)
        run(f"docker tag {image} {image_uri_with_new_tag}", hide=True)

    if image_account_id != test_account_id:
        # Keep the original URI: `image` is re-pointed inside the loop below
        original_image = image
        target_image_repo_name = f"beta-{image_repo_name}"
        for additional_tag in additional_image_tags:
            image_uri_with_new_tag = original_image.replace(original_image_tag, additional_tag)
            new_image_uri = ecr_utils.reupload_image_to_test_ecr(image_uri_with_new_tag, target_image_repo_name, region)
            # Only the re-upload that corresponds to the original tag becomes the image to scan
            if image_uri_with_new_tag == original_image:
                image = new_image_uri

    minimum_sev_threshold = get_minimum_sev_threshold_level(image)
    LOGGER.info(f"Severity threshold level is {minimum_sev_threshold}")
    run_scan(ecr_client, image)
    scan_results = ecr_utils.get_ecr_image_scan_results(ecr_client, image, minimum_vulnerability=minimum_sev_threshold)
    scan_results = ecr_utils.populate_ecr_scan_with_web_scraper_results(image, scan_results)
    ecr_image_vulnerability_list = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
    ecr_image_vulnerability_list.construct_allowlist_from_ecr_scan_result(scan_results)

    remaining_vulnerabilities = ecr_image_vulnerability_list

    # Images outside the allowlist feature are only checked against the common allowlist
    if not is_image_covered_by_allowlist_feature(image):
        if is_canary_context():
            pytest.skip("Skipping the test on the canary.")

        common_ecr_scan_allowlist = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
        common_ecr_scan_allowlist_path = os.path.join(
            os.sep, get_repository_local_path(), "data", "common-ecr-scan-allowlist.json"
        )
        if os.path.exists(common_ecr_scan_allowlist_path):
            common_ecr_scan_allowlist.construct_allowlist_from_file(common_ecr_scan_allowlist_path)

        remaining_vulnerabilities = remaining_vulnerabilities - common_ecr_scan_allowlist

        if remaining_vulnerabilities:
            assert not remaining_vulnerabilities.vulnerability_list, (
                f"The following vulnerabilities need to be fixed on {image}:\n"
                f"{json.dumps(remaining_vulnerabilities.vulnerability_list, indent=4)}"
            )
        return

    upgraded_image_vulnerability_list, image_scan_allowlist = fetch_other_vulnerability_lists(
        image, ecr_client, minimum_sev_threshold
    )
    s3_bucket_name = ECR_SCAN_HELPER_BUCKET

    ## In case new vulnerabilities (fixable or non-fixable) are found, then conduct failure routine
    newly_found_vulnerabilities = ecr_image_vulnerability_list - image_scan_allowlist
    # In case there is no new vulnerability but the allowlist is outdated
    vulnerabilities_that_can_be_fixed = image_scan_allowlist - upgraded_image_vulnerability_list

    if newly_found_vulnerabilities or vulnerabilities_that_can_be_fixed:
        failure_routine_summary = conduct_failure_routine(
            image,
            image_scan_allowlist,
            ecr_image_vulnerability_list,
            upgraded_image_vulnerability_list,
            s3_bucket_name,
        )
        (
            s3_filename_for_fixable_list,
            s3_filename_for_non_fixable_list,
        ) = process_failure_routine_summary_and_store_data_in_s3(failure_routine_summary, s3_bucket_name)
        prepend_message = "Found new vulnerabilities in image." if newly_found_vulnerabilities else "Allowlist is outdated."
        display_message = prepend_message + " " + (
            f"""Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilites """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non fixable vulnerabilites. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )
        # On canary the failure is only logged and the test is skipped instead of failing
        if is_canary_context():
            LOGGER.error(display_message)
            pytest.skip("Skipping the test failure on the canary.")
        else:
            raise RuntimeError(display_message)