def run_smclarify_bias_metrics(
    image_uri,
    ec2_connection,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smclarify",
    test_script=SMCLARIFY_SCRIPT,
):
    """
    Run the SMClarify bias-metrics test script inside the given image on a remote EC2 host.

    :param image_uri: ECR image URI of the container under test
    :param ec2_connection: fabric Connection object to the EC2 instance
    :param ec2_instance_type: EC2 instance type name (used only for error reporting)
    :param docker_executable: container runtime binary to invoke (e.g. "docker", "nvidia-docker")
    :param container_name: name to assign to the launched container
    :param test_script: path of the test script to execute inside the container
    :raises SMClarifyTestFailure: if the test script fails inside the container
    """
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    account_id = get_account_id_from_image_uri(image_uri)
    region = get_region_from_image_uri(image_uri)

    login_to_ecr_registry(ec2_connection, account_id, region)
    # BUGFIX: honor the docker_executable parameter instead of hard-coding "docker"
    ec2_connection.run(f"{docker_executable} pull -q {image_uri}")

    try:
        ec2_connection.run(
            f"{docker_executable} run --name {container_name} -v "
            f"{container_test_local_dir}:{os.path.join(os.sep, 'test')} {image_uri} "
            f"python {test_script}",
            hide=True,
            timeout=300,
        )
    except Exception as e:
        # The remote run can fail due to fabric flakiness even when the in-container
        # test passed; inspect the container logs to distinguish the two cases.
        debug_output = ec2_connection.run(f"{docker_executable} logs {container_name}")
        debug_stdout = debug_output.stdout
        if "Test SMClarify Bias Metrics succeeded!" in debug_stdout:
            LOGGER.warning(
                f"SMClarify test succeeded, but there is an issue with fabric. "
                f"Error:\n{e}\nTest output:\n{debug_stdout}"
            )
            return
        raise SMClarifyTestFailure(
            f"SMClarify test failed on {image_uri} on {ec2_instance_type}. Full output:\n{debug_stdout}"
        ) from e
def start_ecr_image_scan(ecr_client, image_uri):
    """
    Kick off an ECR vulnerability scan for an image, warning instead of failing
    when a scan has already run within the throttling window.

    :param ecr_client: boto3 client for ECR
    :param image_uri: image URI for image to be checked
    :raises ECRScanFailedError: when ECR reports the scan status as FAILED
    """
    repository, tag = get_repository_and_tag_from_image_uri(image_uri)
    try:
        scan_info = ecr_client.start_image_scan(repositoryName=repository, imageId={"imageTag": tag})
    except ecr_client.exceptions.LimitExceededException:
        # ECR only allows one scan per image per 24 hours; treat throttling as benign.
        LOGGER.warning("Scan has already been run on this image in the last 24 hours.")
        return
    scan_status = scan_info["imageScanStatus"]["status"]
    if scan_status == "FAILED":
        raise ECRScanFailedError(f"ECR Scan failed and returned:\n{json.dumps(scan_info, indent=4)}")
def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection, cpu_only, tf2_only):
    """Run the Horovod CPU training test inside the TF training image on a remote EC2 host."""
    container_name = "tf_hvd_cpu_test"
    # Select the Horovod command matching the image's TF major version
    if is_tf_version("1", tensorflow_training):
        test_script = TF1_HVD_CMD
    else:
        test_script = TF2_HVD_CMD
    try:
        execute_ec2_training_test(
            ec2_connection, tensorflow_training, test_script, container_name=container_name, timeout=1800
        )
    except Exception as e:
        # Check the container logs: the failure may be fabric flakiness rather than the test itself
        logs_result = ec2_connection.run(f"docker logs {container_name}")
        debug_stdout = logs_result.stdout
        if "TF HVD tests passed!" not in debug_stdout:
            raise TFTrainingTestFailure(f"TF HVD test failed. Full output:\n{debug_stdout}") from e
        LOGGER.warning(
            f"TF HVD tests succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{debug_stdout}"
        )
def run_smdebug_test(
    image_uri,
    ec2_connection,
    region,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smdebug",
    test_script=SMDEBUG_SCRIPT,
    timeout=2400,
):
    """
    Run the SMDebug test suite inside the given image on a remote EC2 host.

    :param image_uri: ECR image URI of the container under test
    :param ec2_connection: fabric Connection object to the EC2 instance
    :param region: AWS region hosting the ECR registry
    :param ec2_instance_type: EC2 instance type name (used for shm sizing and error reporting)
    :param docker_executable: container runtime binary to invoke (e.g. "docker", "nvidia-docker")
    :param container_name: name to assign to the launched container
    :param test_script: path of the test script to execute inside the container
    :param timeout: seconds to allow the in-container test run before failing
    :raises SMDebugTestFailure: if the test script fails inside the container
    """
    # These instance types need a larger /dev/shm for the test workload
    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
    shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
    framework = get_framework_from_image_uri(image_uri)
    container_test_local_dir = os.path.join("$HOME", "container_tests")

    # Log in via the shared helper (consistent with run_smclarify_bias_metrics) rather than
    # the deprecated `aws ecr get-login` command, which was removed in AWS CLI v2.
    account_id = get_account_id_from_image_uri(image_uri)
    login_to_ecr_registry(ec2_connection, account_id, region)
    # BUGFIX: honor the docker_executable parameter instead of hard-coding "docker"
    ec2_connection.run(f"{docker_executable} pull -q {image_uri}")

    try:
        ec2_connection.run(
            f"{docker_executable} run --name {container_name} -v "
            f"{container_test_local_dir}:{os.path.join(os.sep, 'test')}{shm_setting}{image_uri} "
            f"./{test_script} {framework}",
            hide=True,
            timeout=timeout,
        )
    except Exception as e:
        # Distinguish a genuine test failure from fabric connection flakiness
        debug_output = ec2_connection.run(f"{docker_executable} logs {container_name}")
        debug_stdout = debug_output.stdout
        if "All SMDebug tests succeeded!" in debug_stdout:
            LOGGER.warning(
                f"SMDebug tests succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{debug_stdout}"
            )
            return
        raise SMDebugTestFailure(
            f"SMDebug test failed on {image_uri} on {ec2_instance_type}. Full output:\n{debug_stdout}"
        ) from e
def generate_coverage_doc(self, framework=None, job_type=None):
    """
    Generate the test coverage docs based on pytest item objects

    :param framework: str, ML framework
    :param job_type: str, training or inference
    :raises TestReportGenerationFailure: if failure conditions were recorded with at least one issue
    """
    test_cov = {}
    for item in self.items:
        # Define additional csv options
        function_name = item.name.split("[")[0]
        function_key = f"{item.fspath}::{function_name}"
        str_fspath = str(item.fspath)
        str_keywords = str(item.keywords)

        # Construct Category and Github_Link fields based on the filepath
        category = str_fspath.split("/dlc_tests/")[-1].split("/")[0]
        if self.is_sagemaker:
            category = "sagemaker_local" if "local" in str_fspath else "sagemaker"
        repo_url = os.getenv(
            "CODEBUILD_SOURCE_REPO_URL", "https://github.com/aws/deep-learning-containers.git"
        )
        # BUGFIX: str.rstrip(".git") strips any trailing run of the characters
        # '.', 'g', 'i', 't' (e.g. "my-repo-git.git" -> "my-repo-"), not the literal
        # suffix. Remove exactly the ".git" suffix instead.
        if repo_url.endswith(".git"):
            repo_url = repo_url[: -len(".git")]
        github_link = f"{repo_url}/blob/master/test/{str_fspath.split('/test/')[-1]}"

        # Only create a new test coverage item if we have not seen the function before. This is a
        # necessary step, as parametrization can make it appear as if the same test function is a
        # unique test function
        if function_key in test_cov:
            continue

        # Based on keywords and filepaths, assign values
        framework_scope = (
            framework
            if framework
            else _infer_field_value("all", ("mxnet", "tensorflow", "pytorch"), str_fspath)
        )
        job_type_scope = (
            job_type
            if job_type
            else _infer_field_value("both", ("training", "inference"), str_fspath, str_keywords)
        )
        integration_scope = _infer_field_value(
            "general integration",
            ("_dgl_", "smdebug", "gluonnlp", "smexperiments", "_mme_", "pipemode", "tensorboard", "_s3_", "nccl"),
            str_keywords,
        )
        processor_scope = _infer_field_value("all", ("cpu", "gpu", "eia"), str_keywords)
        if processor_scope == "gpu":
            processor_scope = self.handle_single_gpu_instances_test_report(function_key, str_keywords)

        test_cov[function_key] = {
            "Category": category,
            "Name": function_name,
            "Scope": framework_scope,
            "Job_Type": job_type_scope,
            "Num_Instances": self.get_marker_arg_value(item, function_key, "multinode", 1),
            "Processor": self.get_marker_arg_value(item, function_key, "processor", processor_scope),
            "Integration": self.get_marker_arg_value(item, function_key, "integration", integration_scope),
            "Model": self.get_marker_arg_value(item, function_key, "model"),
            "GitHub_Link": github_link,
        }

    self.write_test_coverage_file(test_cov)

    if self.failure_conditions:
        message, total_issues, error_file = self.assemble_report_failure_message()
        if total_issues == 0:
            LOGGER.warning(f"Found failure message, but no issues. Message:\n{message}")
        else:
            raise TestReportGenerationFailure(f"{message}\nFollow {error_file} if message is truncated")