Example #1
def run_sagemaker_local_tests(images):
    """
    Function to run the SageMaker Local tests
    :param images: <list> List of all images to be used in SageMaker tests
    """
    if not images:
        return
    # Run sagemaker Local tests
    framework, _ = get_framework_and_version_from_tag(images[0])
    sm_tests_path = os.path.join("test", "sagemaker_tests", framework)
    sm_tests_tar_name = "sagemaker_tests.tar.gz"
    run(f"tar -cz --exclude='*.pytest_cache' --exclude='__pycache__' -f {sm_tests_tar_name} {sm_tests_path}")
    ec2_client = boto3.client("ec2", config=Config(retries={"max_attempts": 10}), region_name=DEFAULT_REGION)
    for image in images:
        sm_utils.execute_local_tests(image, ec2_client)
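A minimal usage sketch for the variant above (the image URI is a hypothetical placeholder; in practice the list comes from the DLC_IMAGE environment variable or the build metadata):

images = [
    "123456789012.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04",
]
run_sagemaker_local_tests(images)  # tars the test package, then runs local tests serially, one image at a time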
Example #2
def run_sagemaker_local_tests(images):
    """
    Function to run the SageMaker Local tests
    :param images: <list> List of all images to be used in SageMaker tests
    """
    if not images:
        return
    # Run sagemaker Local tests
    framework, _ = get_framework_and_version_from_tag(images[0])
    sm_tests_path = os.path.join("test", "sagemaker_tests", framework)
    sm_tests_tar_name = "sagemaker_tests.tar.gz"
    run(f"tar -cz --exclude='*.pytest_cache' --exclude='__pycache__' -f {sm_tests_tar_name} {sm_tests_path}")

    pool_number = len(images)
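    # One worker per image so each image's SageMaker local tests run in parallel;
    # each worker launches its own EC2 test instance inside sm_utils.execute_local_tests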
    with Pool(pool_number) as p:
        p.map(sm_utils.execute_local_tests, images)
Example #3
def run_sagemaker_local_tests(images, pytest_cache_params):
    """
    Function to run the SageMaker Local tests
    :param images: <list> List of all images to be used in SageMaker tests
    :param pytest_cache_params: <dict> dictionary with data required for pytest cache handler
    """
    if not images:
        return
    # Run sagemaker Local tests
    framework, _ = get_framework_and_version_from_tag(images[0])
    sm_tests_path = (
        os.path.join("test", "sagemaker_tests", framework)
        if "huggingface" not in framework
        else os.path.join("test", "sagemaker_tests", "huggingface*")
    )
    sm_tests_tar_name = "sagemaker_tests.tar.gz"
    # Package the test directory (excluding pytest caches) so it can be copied to the test instance
    run(f"tar -cz --exclude='*.pytest_cache' --exclude='__pycache__' -f {sm_tests_tar_name} {sm_tests_path}")

    pool_number = len(images)
    with Pool(pool_number) as p:
        p.starmap(sm_utils.execute_local_tests,
                  [[image, pytest_cache_params] for image in images])
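A usage sketch for this variant, with pytest_cache_params shaped the way main() builds it further below (all values here are hypothetical placeholders):

pytest_cache_params = {
    "commit_id": "unrecognised_commit_id",
    "framework": "pytorch",
    "version": "1.12.0",
    "build_context": "PR",
    "test_type": "sagemaker-local",
}
run_sagemaker_local_tests(images, pytest_cache_params)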
Example #4
def generate_sagemaker_pytest_cmd(image, sagemaker_test_type):
    """
    Parses the image ECR url and returns appropriate pytest command
    :param image: ECR url of image
    :param sagemaker_test_type: local or remote test type
    :return: <tuple> pytest command to be run, path where it should be executed, image tag
    """
    region = os.getenv("AWS_REGION", DEFAULT_REGION)
    account_id = os.getenv("ACCOUNT_ID", image.split(".")[0])
    print("image name {}".format(image))
    sm_remote_docker_base_name, tag = image.split("/")[1].split(":")
    sm_local_docker_repo_uri = image.split(":")[0]

    # Assign instance type
    instance_type = assign_sagemaker_remote_job_instance_type(image)

    # Get path to test directory
    find_path = sm_remote_docker_base_name.split("-")

    # NOTE: We are relying on the fact that repos are defined as <context>-<framework>-<job_type> in our infrastructure
    framework, framework_version = get_framework_and_version_from_tag(image)
    framework_major_version = framework_version.split(".")[0]
    job_type = get_job_type_from_image(image)
    path = os.path.join("test", "sagemaker_tests", framework, job_type)
    aws_id_arg = "--aws-id"
    docker_base_arg = "--docker-base-name"
    instance_type_arg = "--instance-type"
    accelerator_type_arg = "--accelerator-type"
    framework_version_arg = "--framework-version"
    eia_arg = "ml.eia1.large"
    processor = ("neuron" if "neuron" in image else "gpu"
                 if "gpu" in image else "eia" if "eia" in image else "cpu")
    py_version = re.search(r"py\d+", tag).group()
    sm_local_py_version = "37" if py_version == "py37" else "38" if py_version == "py38" else "2" if py_version == "py27" else "3"
    if framework == "tensorflow" and job_type == "inference":
        # TF inference tests have an additional sub-directory of tests
        integration_path = os.path.join("test", "integration",
                                        sagemaker_test_type)
    else:
        integration_path = os.path.join("integration", sagemaker_test_type)

    # Conditions for modifying tensorflow SageMaker pytest commands
    if framework == "tensorflow" and sagemaker_test_type == SAGEMAKER_REMOTE_TEST_TYPE:
        if job_type == "inference":
            aws_id_arg = "--registry"
            docker_base_arg = "--repo"
            instance_type_arg = "--instance-types"
            framework_version_arg = "--versions"
            integration_path = os.path.join(
                integration_path,
                "test_tfs.py") if processor != "eia" else os.path.join(
                    integration_path, "test_ei.py")

    if framework == "tensorflow" and job_type == "training":
        aws_id_arg = "--account-id"

    test_report = os.path.join(os.getcwd(), "test", f"{job_type}_{tag}.xml")
    local_test_report = os.path.join(UBUNTU_HOME_DIR, "test",
                                     f"{job_type}_{tag}_sm_local.xml")

    # Explanation of why we need the if-condition below:
    # We have separate Pipeline Actions that run EFA tests; they have the env variable "EFA_DEDICATED=True" configured
    # so that those Actions run only the EFA tests.
    # However, there is no CB job dedicated to EFA tests in the PR context. This means that in the PR context,
    # setting "DISABLE_EFA_TESTS" to True should skip EFA tests, while setting it to False should enable all tests,
    # EFA tests included.
    if is_pr_context():
        efa_tests_disabled = os.getenv("DISABLE_EFA_TESTS",
                                       "False").lower() == "true"
        efa_flag = "-m \"not efa\"" if efa_tests_disabled else ""
    else:
        efa_dedicated = os.getenv("EFA_DEDICATED", "False").lower() == "true"
        efa_flag = '--efa' if efa_dedicated else '-m \"not efa\"'

    region_list = ",".join(SAGEMAKER_EXECUTION_REGIONS)

    sagemaker_regions_list = f"--sagemaker-regions {region_list}"

    remote_pytest_cmd = (
        f"pytest -rA {integration_path} --region {region} --processor {processor} {docker_base_arg} "
        f"{sm_remote_docker_base_name} --tag {tag} {framework_version_arg} {framework_version} "
        f"{aws_id_arg} {account_id} {instance_type_arg} {instance_type} {efa_flag} {sagemaker_regions_list} --junitxml {test_report}"
    )

    if processor == "eia":
        remote_pytest_cmd += f"{accelerator_type_arg} {eia_arg}"

    local_pytest_cmd = (
        f"pytest -s -v {integration_path} {docker_base_arg} "
        f"{sm_local_docker_repo_uri} --tag {tag} --framework-version {framework_version} "
        f"--processor {processor} {aws_id_arg} {account_id} --junitxml {local_test_report}"
    )

    if framework == "tensorflow" and job_type != "inference":
        local_pytest_cmd = f"{local_pytest_cmd} --py-version {sm_local_py_version} --region {region}"
    if framework == "tensorflow" and job_type == "training":
        path = os.path.join(os.path.dirname(path),
                            f"{framework}{framework_major_version}_training")
    if "huggingface" in framework and job_type == "inference":
        path = os.path.join("test", "sagemaker_tests", "huggingface",
                            "inference")

    return (
        remote_pytest_cmd if sagemaker_test_type == SAGEMAKER_REMOTE_TEST_TYPE
        else local_pytest_cmd,
        path,
        tag,
        job_type,
    )
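A sketch of how the returned tuple is typically consumed, mirroring the call made in execute_local_tests below (SAGEMAKER_LOCAL_TEST_TYPE is assumed to be defined alongside SAGEMAKER_REMOTE_TEST_TYPE):

pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
    image, SAGEMAKER_LOCAL_TEST_TYPE)
# `path` is the directory the command must be executed from so the correct
# sagemaker_tests sub-package and conftest are picked up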
Example #5
def execute_local_tests(image, pytest_cache_params):
    """
    Run the sagemaker local tests in ec2 instance for the image
    :param image: ECR url
    :param pytest_cache_params: parameters required for :param pytest_cache_util
    :return: None
    """
    account_id = os.getenv(
        "ACCOUNT_ID",
        boto3.client("sts").get_caller_identity()["Account"])
    pytest_cache_util = PytestCache(boto3.client("s3"), account_id)
    ec2_client = boto3.client("ec2",
                              config=Config(retries={"max_attempts": 10}),
                              region_name=DEFAULT_REGION)
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_LOCAL_TEST_TYPE)
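    # pytest cache flags: with a cached "lastfailed" file, rerun only the tests that failed
    # previously; "--last-failed-no-failures all" falls back to the full suite when the cache
    # records no failures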
    pytest_command += " --last-failed --last-failed-no-failures all "
    print(pytest_command)
    framework, _ = get_framework_and_version_from_tag(image)
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    ec2_key_name = f"{job_type}_{tag}_sagemaker_{random.randint(1, 1000)}"
    region = os.getenv("AWS_REGION", DEFAULT_REGION)
    ec2_ami_id = UBUNTU_18_BASE_DLAMI_US_EAST_1 if region == "us-east-1" else UBUNTU_18_BASE_DLAMI_US_WEST_2
    sm_tests_tar_name = "sagemaker_tests.tar.gz"
    ec2_test_report_path = os.path.join(UBUNTU_HOME_DIR, "test",
                                        f"{job_type}_{tag}_sm_local.xml")
    instance_id = ""
    ec2_conn = None
    try:
        key_file = generate_ssh_keypair(ec2_client, ec2_key_name)
        print(f"Launching new Instance for image: {image}")
        instance_id, ip_address = launch_sagemaker_local_ec2_instance(
            image, ec2_ami_id, ec2_key_name, region)
        ec2_conn = ec2_utils.get_ec2_fabric_connection(instance_id, key_file,
                                                       region)
        ec2_conn.put(sm_tests_tar_name, f"{UBUNTU_HOME_DIR}")
        ec2_conn.run(
            f"$(aws ecr get-login --no-include-email --region {region})")
        try:
            ec2_conn.run(f"docker pull {image}", timeout=600)
        except invoke.exceptions.CommandTimedOut as e:
            # Quadruple braces so the f-string emits the literal Go-template placeholders
            # {{.Repository}}:{{.Tag}} expected by `docker images --format`
            output = ec2_conn.run(
                f"docker images {image} --format '{{{{.Repository}}}}:{{{{.Tag}}}}'"
            ).stdout.strip("\n")
            if output != image:
                raise DLCSageMakerLocalTestFailure(
                    f"Image pull for {image} failed.\ndocker images output = {output}"
                ) from e
        ec2_conn.run(f"tar -xzf {sm_tests_tar_name}")
        kill_background_processes_and_run_apt_get_update(ec2_conn)
        with ec2_conn.cd(path):
            install_sm_local_dependencies(framework, job_type, image, ec2_conn,
                                          ec2_ami_id)
            pytest_cache_util.download_pytest_cache_from_s3_to_ec2(
                ec2_conn, path, **pytest_cache_params)
            # Workaround for mxnet cpu training images, as the distributed test
            # causes an issue with the fabric ec2_connection
            if framework == "mxnet" and job_type == "training" and "cpu" in image:
                try:
                    ec2_conn.run(pytest_command, timeout=1000, warn=True)
                except exceptions.CommandTimedOut as exc:
                    print(f"Ec2 connection timed out for {image}, {exc}")
                finally:
                    print(f"Downloading Test reports for image: {image}")
                    ec2_conn.close()
                    ec2_conn_new = ec2_utils.get_ec2_fabric_connection(
                        instance_id, key_file, region)
                    ec2_conn_new.get(
                        ec2_test_report_path,
                        os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
                    output = subprocess.check_output(
                        f"cat test/{job_type}_{tag}_sm_local.xml",
                        shell=True,
                        executable="/bin/bash")
                    pytest_cache_util.upload_pytest_cache_from_ec2_to_s3(
                        ec2_conn_new, path, **pytest_cache_params)
                    if 'failures="0"' not in str(output):
                        raise ValueError(
                            f"Sagemaker Local tests failed for {image}")
            else:
                ec2_conn.run(pytest_command)
                print(f"Downloading Test reports for image: {image}")
                ec2_conn.get(
                    ec2_test_report_path,
                    os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
    finally:
        # Guard against failures that occur before the SSH connection is established
        if ec2_conn:
            with ec2_conn.cd(path):
                pytest_cache_util.upload_pytest_cache_from_ec2_to_s3(
                    ec2_conn, path, **pytest_cache_params)
        print(f"Terminating Instances for image: {image}")
        ec2_utils.terminate_instance(instance_id, region)
        print(f"Destroying ssh Key_pair for image: {image}")
        destroy_ssh_keypair(ec2_client, ec2_key_name)
        # return None here to prevent errors from multiprocessing.map(). Without this it returns some object by default
        # which is causing "cannot pickle '_thread.lock' object" error
        return None
Example #6
def generate_sagemaker_pytest_cmd(image, sagemaker_test_type):
    """
    Parses the image ECR url and returns appropriate pytest command
    :param image: ECR url of image
    :param sagemaker_test_type: local or remote test type
    :return: <tuple> pytest command to be run, path where it should be executed, image tag
    """
    reruns = 4
    region = os.getenv("AWS_REGION", DEFAULT_REGION)
    account_id = os.getenv("ACCOUNT_ID", image.split(".")[0])
    print("image name {}".format(image))
    sm_remote_docker_base_name, tag = image.split("/")[1].split(":")
    sm_local_docker_repo_uri = image.split(":")[0]

    # Assign instance type
    instance_type = assign_sagemaker_remote_job_instance_type(image)

    # Get path to test directory
    find_path = sm_remote_docker_base_name.split("-")

    # NOTE: We are relying on the fact that repos are defined as <context>-<framework>-<job_type> in our infrastructure
    framework, framework_version = get_framework_and_version_from_tag(image)
    job_type = get_job_type_from_image(image)
    path = os.path.join("test", "sagemaker_tests", framework, job_type)
    aws_id_arg = "--aws-id"
    docker_base_arg = "--docker-base-name"
    instance_type_arg = "--instance-type"
    accelerator_type_arg = "--accelerator-type"
    eia_arg = "ml.eia1.large"
    framework_version = re.search(r"\d+(\.\d+){2}", tag).group()
    framework_major_version = framework_version.split(".")[0]
    processor = "gpu" if "gpu" in image else "eia" if "eia" in image else "cpu"
    py_version = re.search(r"py\d+", tag).group()
    sm_local_py_version = "37" if py_version == "py37" else "2" if py_version == "py27" else "3"
    if framework == "tensorflow" and job_type == "inference":
        # TF inference tests have an additional sub-directory of tests
        integration_path = os.path.join("test", "integration",
                                        sagemaker_test_type)
    else:
        integration_path = os.path.join("integration", sagemaker_test_type)

    # Conditions for modifying tensorflow SageMaker pytest commands
    if framework == "tensorflow" and sagemaker_test_type == SAGEMAKER_REMOTE_TEST_TYPE:
        if job_type == "inference":
            aws_id_arg = "--registry"
            docker_base_arg = "--repo"
            instance_type_arg = "--instance-types"
            integration_path = os.path.join(
                integration_path,
                "test_tfs.py") if processor != "eia" else os.path.join(
                    integration_path, "test_ei.py")

    if framework == "tensorflow" and job_type == "training":
        aws_id_arg = "--account-id"

    test_report = os.path.join(os.getcwd(), "test", f"{job_type}_{tag}.xml")
    local_test_report = os.path.join(UBUNTU_HOME_DIR, "test",
                                     f"{job_type}_{tag}_sm_local.xml")
    is_py3 = " python3 -m "

    remote_pytest_cmd = (
        f"pytest {integration_path} --region {region} {docker_base_arg} "
        f"{sm_remote_docker_base_name} --tag {tag} {aws_id_arg} {account_id} "
        f"{instance_type_arg} {instance_type} --junitxml {test_report}")

    if processor == "eia":
        remote_pytest_cmd += (f" {accelerator_type_arg} {eia_arg}")

    local_pytest_cmd = (
        f"{is_py3} pytest -v {integration_path} {docker_base_arg} "
        f"{sm_local_docker_repo_uri} --tag {tag} --framework-version {framework_version} "
        f"--processor {processor} {aws_id_arg} {account_id} --junitxml {local_test_report}"
    )

    if framework == "tensorflow" and job_type != "inference":
        local_pytest_cmd = f"{local_pytest_cmd} --py-version {sm_local_py_version} --region {region}"
    if framework == "tensorflow" and job_type == "training":
        path = os.path.join(os.path.dirname(path),
                            f"{framework}{framework_major_version}_training")

    return (
        remote_pytest_cmd if sagemaker_test_type == SAGEMAKER_REMOTE_TEST_TYPE
        else local_pytest_cmd,
        path,
        tag,
        job_type,
    )
Example #7
def execute_local_tests(image, ec2_client):
    """
    Run the sagemaker local tests in ec2 instance for the image
    :param image: ECR url
    :param ec2_client: boto3_obj
    :return: None
    """
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_LOCAL_TEST_TYPE)
    print(pytest_command)
    framework, _ = get_framework_and_version_from_tag(image)
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    ec2_key_name = f"{job_type}_{tag}_sagemaker_{random.randint(1, 1000)}"
    region = os.getenv("AWS_REGION", DEFAULT_REGION)
    sm_tests_tar_name = "sagemaker_tests.tar.gz"
    ec2_test_report_path = os.path.join(UBUNTU_HOME_DIR, "test",
                                        f"{job_type}_{tag}_sm_local.xml")
    try:
        key_file = generate_ssh_keypair(ec2_client, ec2_key_name)
        print(f"Launching new Instance for image: {image}")
        instance_id, ip_address = launch_sagemaker_local_ec2_instance(
            image, UBUNTU_16_BASE_DLAMI_US_EAST_1 if region == "us-east-1" else
            UBUNTU_16_BASE_DLAMI_US_WEST_2, ec2_key_name, region)
        ec2_conn = ec2_utils.get_ec2_fabric_connection(instance_id, key_file,
                                                       region)
        ec2_conn.put(sm_tests_tar_name, f"{UBUNTU_HOME_DIR}")
        ec2_conn.run(
            f"$(aws ecr get-login --no-include-email --region {region})")
        ec2_conn.run(f"docker pull {image}")
        ec2_conn.run(f"tar -xzf {sm_tests_tar_name}")
        with ec2_conn.cd(path):
            install_sm_local_dependencies(framework, job_type, image, ec2_conn)
            # Workaround for mxnet cpu training images, as the distributed test
            # causes an issue with the fabric ec2_connection
            if framework == "mxnet" and job_type == "training" and "cpu" in image:
                try:
                    ec2_conn.run(pytest_command, timeout=1000, warn=True)
                except exceptions.CommandTimedOut as exc:
                    print(f"Ec2 connection timed out for {image}, {exc}")
                finally:
                    print(f"Downloading Test reports for image: {image}")
                    ec2_conn.close()
                    ec2_conn_new = ec2_utils.get_ec2_fabric_connection(
                        instance_id, key_file, region)
                    ec2_conn_new.get(
                        ec2_test_report_path,
                        os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
                    output = subprocess.check_output(
                        f"cat test/{job_type}_{tag}_sm_local.xml",
                        shell=True,
                        executable="/bin/bash")
                    if 'failures="0"' not in str(output):
                        raise ValueError(
                            f"Sagemaker Local tests failed for {image}")
            else:
                ec2_conn.run(pytest_command)
                print(f"Downloading Test reports for image: {image}")
                ec2_conn.get(
                    ec2_test_report_path,
                    os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
    finally:
        print(f"Terminating Instances for image: {image}")
        ec2_utils.terminate_instance(instance_id, region)
        print(f"Destroying ssh Key_pair for image: {image}")
        destroy_ssh_keypair(ec2_client, ec2_key_name)
Example #8
def main():
    # Define constants
    start_time = datetime.now()
    test_type = os.getenv("TEST_TYPE")

    efa_dedicated = os.getenv("EFA_DEDICATED", "False").lower() == "true"
    executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
    dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images()
    # When executing locally, one can provide commit_id or omit it. Assign a default value for local executions:
    commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION",
                          default="unrecognised_commit_id")
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    # Do not create an EKS cluster when only EIA images are present
    is_all_images_list_eia = all("eia" in image_uri
                                 for image_uri in all_image_list)
    eks_cluster_name = None
    benchmark_mode = "benchmark" in test_type or is_benchmark_dev_context()
    specific_test_type = re.sub(
        "benchmark-", "", test_type) if "benchmark" in test_type else test_type
    build_context = get_build_context()

    # quick_checks jobs don't have any images associated with them; use placeholder framework/version values for such jobs
    try:
        framework, version = get_framework_and_version_from_tag(
            all_image_list[0])
    except:
        framework, version = "general_test", "none"

    pytest_cache_params = {
        "commit_id": commit_id,
        "framework": framework,
        "version": version,
        "build_context": build_context,
        "test_type": test_type,
    }

    # In the PR context, allow SageMaker tests to be switched to release-candidate (RC) tests.
    # Do not allow both to be enabled at the same time due to capacity issues.
    if specific_test_type == "sagemaker" and is_rc_test_context(
    ) and is_pr_context():
        specific_test_type = "release_candidate_integration"

    test_path = os.path.join(
        "benchmark",
        specific_test_type) if benchmark_mode else specific_test_type

    # For HuggingFace/AutoGluon images, skip non-SageMaker test types so that only SageMaker tests are executed
    is_hf_image_present = any("huggingface" in image_uri
                              for image_uri in all_image_list)
    is_ag_image_present = any("autogluon" in image_uri
                              for image_uri in all_image_list)
    if (is_hf_image_present
            or is_ag_image_present) and specific_test_type in ("ecs", "ec2",
                                                               "eks", "bai"):
        # Create an empty report file because the CodeBuild job fails without it
        LOGGER.info(
            f"NOTE: {specific_test_type} tests not supported on HF or AG. Skipping..."
        )
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        sm_utils.generate_empty_report(report, test_type, "huggingface")
        return

    if specific_test_type in (
            "sanity",
            "ecs",
            "ec2",
            "eks",
            "canary",
            "bai",
            "quick_checks",
            "release_candidate_integration",
    ):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_infer.xml")
        report_multinode_train = os.path.join(os.getcwd(), "test",
                                              f"eks_multinode_train.xml")

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "bai":
            build_bai_docker_container()
        if specific_test_type == "eks" and not is_all_images_list_eia:
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow")
                if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks.")
            framework = frameworks_in_images[0]
            eks_cluster_name = f"{framework}-{build_context}"
            eks_utils.eks_setup()
            if eks_utils.is_eks_cluster_active(eks_cluster_name):
                eks_utils.eks_write_kubeconfig(eks_cluster_name)
            else:
                raise Exception(
                    f"EKS cluster {eks_cluster_name} is not in active state")

        # Execute dlc_tests pytest command
        pytest_cmd = [
            "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
        ]

        is_habana_image = any("habana" in image_uri
                              for image_uri in all_image_list)
        if specific_test_type == "ec2":
            if is_habana_image:
                context = Context()
                context.run(
                    "git clone https://github.com/HabanaAI/gaudi-test-suite.git"
                )
                context.run(
                    "tar -c -f gaudi-test-suite.tar.gz gaudi-test-suite")
            else:
                pytest_cmd += ["--reruns=1", "--reruns-delay=10"]

        if is_pr_context():
            if specific_test_type == "eks":
                pytest_cmd.append("--timeout=2340")
            else:
                if is_habana_image:
                    pytest_cmd.append("--timeout=18000")
                else:
                    pytest_cmd.append("--timeout=4860")

        pytest_cmds = [pytest_cmd]
        # Execute a separate cmd for canary and quick_checks tests
        if specific_test_type in ("canary", "quick_checks"):
            pytest_cmds = [[
                "-s", "-rA", f"--junitxml={report}", "-n=auto",
                f"--{specific_test_type}", "--ignore=container_tests/"
            ]]

        pytest_cmds = [
            pytest_cmd + ["--last-failed", "--last-failed-no-failures", "all"]
            for pytest_cmd in pytest_cmds
        ]
        pytest_cache_util.download_pytest_cache_from_s3_to_local(
            os.getcwd(), **pytest_cache_params)
        try:
            # Note:- Running multiple pytest_cmds in a sequence will result in the execution log having two
            #        separate pytest reports, both of which must be examined in case of a manual review of results.
            cmd_exit_statuses = [
                pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds
            ]
            if all([status == 0 for status in cmd_exit_statuses]):
                sys.exit(0)
            else:
                raise RuntimeError(pytest_cmds)
        finally:
            pytest_cache_util.upload_pytest_cache_from_local_to_s3(
                os.getcwd(), **pytest_cache_params)
            # Delete dangling EC2 KeyPairs
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)
    elif specific_test_type == "sagemaker":
        if "habana" in dlc_images:
            LOGGER.info(f"Skipping SM tests for Habana. Images: {dlc_images}")
            # Create an empty report file because the CodeBuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "habana")
            return
        if benchmark_mode:
            if "neuron" in dlc_images:
                LOGGER.info(
                    f"Skipping benchmark sm tests for Neuron. Images: {dlc_images}"
                )
                # Create an empty report file because the CodeBuild job fails without it
                report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
                sm_utils.generate_empty_report(report, test_type, "neuron")
                return
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))

            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto",
                "-o", "norecursedirs=resources"
            ]
            if not is_pr_context():
                pytest_cmd += ["--efa"] if efa_dedicated else ["-m", "not efa"]
            sys.exit(pytest.main(pytest_cmd))

        else:
            sm_remote_images = [
                image for image in standard_images_list
                if not (("tensorflow-inference" in image and "py2" in image)
                        or is_e3_image(image))
            ]
            run_sagemaker_remote_tests(sm_remote_images, pytest_cache_params)
            if standard_images_list and not sm_remote_images:
                report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
                sm_utils.generate_empty_report(report, test_type,
                                               "sm_remote_unsupported")
        metrics_utils.send_test_duration_metrics(start_time)

    elif specific_test_type == "sagemaker-local":
        if "neuron" in dlc_images:
            LOGGER.info(
                f"Skipping sagemaker tests because Neuron is not yet supported on SM. Images: {dlc_images}"
            )
            # Create an empty report file because the CodeBuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "neuron")
            return
        if "habana" in dlc_images:
            LOGGER.info(
                f"Skipping sagemaker tests because Habana is not yet supported on SM. Images: {dlc_images}"
            )
            # Create an empty report file because the CodeBuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "habana")
            return
        testing_image_list = [
            image for image in standard_images_list
            if not (("tensorflow-inference" in image and "py2" in image) or
                    ("eia" in image) or (is_e3_image(image)))
        ]
        run_sagemaker_local_tests(testing_image_list, pytest_cache_params)
        # If every image was filtered out (e.g. an EIA-only image list), generate an empty report
        if len(testing_image_list) == 0:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "eia")
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. Only support ec2, ecs, eks, sagemaker and sanity currently"
        )