Example #1
def run_smclarify_bias_metrics(
    image_uri,
    ec2_connection,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smclarify",
    test_script=SMCLARIFY_SCRIPT,
):
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    account_id = get_account_id_from_image_uri(image_uri)
    region = get_region_from_image_uri(image_uri)

    login_to_ecr_registry(ec2_connection, account_id, region)
    ec2_connection.run(f"docker pull -q {image_uri}")

    try:
        ec2_connection.run(
            f"{docker_executable} run --name {container_name} -v "
            f"{container_test_local_dir}:{os.path.join(os.sep, 'test')} {image_uri} "
            f"python {test_script}",
            hide=True,
            timeout=300,
        )
    except Exception as e:
        debug_output = ec2_connection.run(f"{docker_executable} logs {container_name}")
        debug_stdout = debug_output.stdout
        if "Test SMClarify Bias Metrics succeeded!" in debug_stdout:
            LOGGER.warning(
                f"SMClarify test succeeded, but there is an issue with fabric. "
                f"Error:\n{e}\nTest output:\n{debug_stdout}"
            )
            return
        raise SMClarifyTestFailure(
            f"SMClarify test failed on {image_uri} on {ec2_instance_type}. Full output:\n{debug_stdout}"
        ) from e
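A minimal usage sketch (not from the source): driving this helper over a fabric Connection. The host, key path, image URI, and instance type below are placeholders.

from fabric import Connection

ec2_connection = Connection(
    host="ubuntu@<instance-public-dns>",  # placeholder host
    connect_kwargs={"key_filename": "<path/to/key.pem>"},  # placeholder key
)
run_smclarify_bias_metrics(
    image_uri="<account-id>.dkr.ecr.us-west-2.amazonaws.com/<repo>:<tag>",
    ec2_connection=ec2_connection,
    ec2_instance_type="c5.4xlarge",
)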
Example #2
def start_ecr_image_scan(ecr_client, image_uri):
    """
    Start an ECR scan for an image, and warn if the scan cannot be started
    :param ecr_client: boto3 client for ECR
    :param image_uri: image URI for the image to be checked
    """
    repository, tag = get_repository_and_tag_from_image_uri(image_uri)
    try:
        scan_info = ecr_client.start_image_scan(repositoryName=repository, imageId={"imageTag": tag})
    except ecr_client.exceptions.LimitExceededException:
        LOGGER.warning("Scan has already been run on this image in the last 24 hours.")
        return
    if scan_info["imageScanStatus"]["status"] == "FAILED":
        raise ECRScanFailedError(f"ECR Scan failed and returned:\n{json.dumps(scan_info, indent=4)}")
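A usage sketch (an assumption, not from the source): starting a scan with a boto3 ECR client. The region and image URI are placeholders.

import boto3

ecr_client = boto3.client("ecr", region_name="us-west-2")
start_ecr_image_scan(ecr_client, "<account-id>.dkr.ecr.us-west-2.amazonaws.com/<repo>:<tag>")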
Example #3
def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection, cpu_only, tf2_only):
    container_name = "tf_hvd_cpu_test"
    test_script = TF1_HVD_CMD if is_tf_version("1", tensorflow_training) else TF2_HVD_CMD
    try:
        execute_ec2_training_test(
            ec2_connection, tensorflow_training, test_script, container_name=container_name, timeout=1800
        )
    except Exception as e:
        debug_output = ec2_connection.run(f"docker logs {container_name}")
        debug_stdout = debug_output.stdout
        if "TF HVD tests passed!" in debug_stdout:
            LOGGER.warning(
                f"TF HVD tests succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{debug_stdout}"
            )
            return
        raise TFTrainingTestFailure(f"TF HVD test failed. Full output:\n{debug_stdout}") from e
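Here tensorflow_training, ec2_connection, cpu_only, and tf2_only are pytest fixtures supplied by the suite's conftest. A sketch of how such gating fixtures are commonly written, assuming the tensorflow_training fixture yields the image URI string (the bodies below are our assumptions, not the repository's actual definitions):

import pytest

@pytest.fixture
def cpu_only(tensorflow_training):
    # Assumed gate: skip unless the image under test is a CPU image
    if "-cpu" not in tensorflow_training:
        pytest.skip("Test requires a CPU image")

@pytest.fixture
def tf2_only(tensorflow_training):
    # Assumed gate: skip unless the image is TF 2.x (is_tf_version is used in the example above)
    if not is_tf_version("2", tensorflow_training):
        pytest.skip("Test requires TF2")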
Example #4
def run_smdebug_test(
    image_uri,
    ec2_connection,
    region,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smdebug",
    test_script=SMDEBUG_SCRIPT,
    timeout=2400,
):
    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
    shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
    framework = get_framework_from_image_uri(image_uri)
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    # Log in to ECR; `aws ecr get-login` is AWS CLI v1 syntax (v2 replaced it with get-login-password)
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    ec2_connection.run(f"{docker_executable} pull -q {image_uri}")

    try:
        ec2_connection.run(
            f"{docker_executable} run --name {container_name} -v "
            f"{container_test_local_dir}:{os.path.join(os.sep, 'test')}{shm_setting}{image_uri} "
            f"./{test_script} {framework}",
            hide=True,
            timeout=timeout,
        )
    except Exception as e:
        debug_output = ec2_connection.run(f"{docker_executable} logs {container_name}")
        debug_stdout = debug_output.stdout
        if "All SMDebug tests succeeded!" in debug_stdout:
            LOGGER.warning(
                f"SMDebug tests succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{debug_stdout}"
            )
            return
        raise SMDebugTestFailure(
            f"SMDebug test failed on {image_uri} on {ec2_instance_type}. Full output:\n{debug_stdout}"
        ) from e
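Examples #1, #3, and #4 all repeat the same fallback: if fabric raises while the container runs, inspect the docker logs and downgrade to a warning when the success marker is present. A hedged refactoring sketch of that shared pattern; the helper name and signature are ours, not the repository's, and LOGGER and the exception classes are those used in the examples above.

def run_container_test_with_log_fallback(ec2_connection, container_name, command, success_marker, failure_exc, timeout=1800):
    """Run a container test command; on error, trust the container logs over fabric."""
    try:
        ec2_connection.run(command, hide=True, timeout=timeout)
    except Exception as e:
        # fabric can fail (e.g. on a dropped connection) even when the test itself passed,
        # so check the container logs for the success marker before failing
        debug_stdout = ec2_connection.run(f"docker logs {container_name}").stdout
        if success_marker in debug_stdout:
            LOGGER.warning(f"Test succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{debug_stdout}")
            return
        raise failure_exc(f"Test failed. Full output:\n{debug_stdout}") from e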
Example #5
    def generate_coverage_doc(self, framework=None, job_type=None):
        """
        Generate the test coverage docs based on pytest item objects

        :param framework: str, ML framework
        :param job_type: str, training or inference
        """
        test_cov = {}
        for item in self.items:
            # Derive the base function name (strip the "[...]" parametrization suffix) and a unique key
            function_name = item.name.split("[")[0]
            function_key = f"{item.fspath}::{function_name}"
            str_fspath = str(item.fspath)
            str_keywords = str(item.keywords)

            # Construct Category and Github_Link fields based on the filepath
            category = str_fspath.split("/dlc_tests/")[-1].split("/")[0]
            if self.is_sagemaker:
                category = "sagemaker_local" if "local" in str_fspath else "sagemaker"
            # str.rstrip strips a set of characters, not a suffix, so remove ".git" explicitly
            repo_url = os.getenv("CODEBUILD_SOURCE_REPO_URL", "https://github.com/aws/deep-learning-containers.git")
            if repo_url.endswith(".git"):
                repo_url = repo_url[: -len(".git")]
            github_link = f"{repo_url}/blob/master/test/{str_fspath.split('/test/')[-1]}"

            # Skip functions we have already seen: parametrization can make the same
            # test function appear to be multiple unique test functions
            if function_key in test_cov:
                continue

            # Based on keywords and filepaths, assign values
            framework_scope = framework or _infer_field_value("all", ("mxnet", "tensorflow", "pytorch"), str_fspath)
            job_type_scope = job_type or _infer_field_value("both", ("training", "inference"), str_fspath, str_keywords)
            integration_scope = _infer_field_value(
                "general integration",
                ("_dgl_", "smdebug", "gluonnlp", "smexperiments", "_mme_", "pipemode", "tensorboard", "_s3_", "nccl"),
                str_keywords,
            )
            processor_scope = _infer_field_value("all", ("cpu", "gpu", "eia"), str_keywords)
            if processor_scope == "gpu":
                processor_scope = self.handle_single_gpu_instances_test_report(function_key, str_keywords)

            test_cov[function_key] = {
                "Category": category,
                "Name": function_name,
                "Scope": framework_scope,
                "Job_Type": job_type_scope,
                "Num_Instances": self.get_marker_arg_value(item, function_key, "multinode", 1),
                "Processor": self.get_marker_arg_value(item, function_key, "processor", processor_scope),
                "Integration": self.get_marker_arg_value(item, function_key, "integration", integration_scope),
                "Model": self.get_marker_arg_value(item, function_key, "model"),
                "GitHub_Link": github_link,
            }
        self.write_test_coverage_file(test_cov)

        if self.failure_conditions:
            message, total_issues, error_file = self.assemble_report_failure_message()
            if total_issues == 0:
                LOGGER.warning(f"Found failure message, but no issues. Message:\n{message}")
            else:
                raise TestReportGenerationFailure(f"{message}\nSee {error_file} if the message is truncated")