Example #1
def main():
    # Define constants
    start_time = datetime.now()
    test_type = os.getenv("TEST_TYPE")
    efa_dedicated = os.getenv("EFA_DEDICATED", "False").lower() == "true"
    executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
    dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    standard_images_list = [image_uri for image_uri in all_image_list if "example" not in image_uri]
    # Do not create an EKS cluster when only EIA images are present
    is_all_images_list_eia = all("eia" in image_uri for image_uri in all_image_list)
    eks_cluster_name = None
    benchmark_mode = "benchmark" in test_type or is_benchmark_dev_context()
    specific_test_type = re.sub("benchmark-", "", test_type) if "benchmark" in test_type else test_type
    test_path = os.path.join("benchmark", specific_test_type) if benchmark_mode else specific_test_type

    # Skip ECS/EC2/EKS/BAI tests for HuggingFace images; only sagemaker tests run for them
    if any("huggingface" in image_uri for image_uri in all_image_list) and \
            specific_test_type in ("ecs", "ec2", "eks", "bai"):
        # Create an empty report file because the CodeBuild job fails without it
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        sm_utils.generate_empty_report(report, test_type, "huggingface")
        return

    if specific_test_type in ("sanity", "ecs", "ec2", "eks", "canary", "bai"):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test", f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test", f"{test_type}_infer.xml")
        report_multinode_train = os.path.join(os.getcwd(), "test", "eks_multinode_train.xml")

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "bai":
            build_bai_docker_container()
        if specific_test_type == "eks" and not is_all_images_list_eia:
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow") if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks."
                )
            framework = frameworks_in_images[0]
            is_neuron = "neuron" in dlc_images
            eks_cluster_name = setup_eks_cluster(framework, is_neuron)
            setup_ssm_eks_cluster(eks_cluster_name)

            if not is_neuron:
                # setup kubeflow
                eks_utils.setup_kubeflow(eks_cluster_name)

            # Change 1: Split training and inference and run them one after the other to prevent scheduling issues.
            # Set -n=4 instead of -n=auto, because initiating too many pods simultaneously has resulted in
            # pods timing out while in the Pending state. Scheduling 4 tests (and hence 4 pods) at once
            # seems to be an optimal configuration.
            # Change 2: Run multi-node EKS tests separately from single-node tests to prevent resource contention.
            if not is_neuron:
                pytest_cmds = [
                    [
                        "-s",
                        "-rA",
                        os.path.join(test_path, framework, "training"),
                        f"--junitxml={report_train}",
                        "-n=4",
                        "-m",
                        "not multinode",
                    ],
                    [
                        "-s",
                        "-rA",
                        os.path.join(test_path, framework, "inference"),
                        f"--junitxml={report_infer}",
                        "-n=4",
                        "-m",
                        "not multinode",
                    ],
                    ["-s", "-rA", test_path, f"--junitxml={report_multinode_train}", "--multinode"],
                ]
            else:
                pytest_cmds = [
                    [
                        "-s",
                        "-rA",
                        os.path.join(test_path, framework, "inference"),
                        f"--junitxml={report_infer}",
                        "-n=4",
                        "-m",
                        "not multinode",
                    ],
                    ["-s", "-rA", test_path, f"--junitxml={report_multinode_train}", "--multinode"],
                ]

            if is_pr_context():
                for cmd in pytest_cmds:
                    cmd.append("--timeout=2340")
        else:
            # Execute dlc_tests pytest command
            pytest_cmd = ["-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"]
            if specific_test_type == "ec2":
                pytest_cmd += ["--reruns=1", "--reruns-delay=10"]
            if is_pr_context():
                pytest_cmd.append("--timeout=4860")

            pytest_cmds = [pytest_cmd]
        # Execute separate cmd for canaries
        if specific_test_type == "canary":
            pytest_cmds = [["-s", "-rA", f"--junitxml={report}", "-n=auto", "--canary", "--ignore=container_tests/"]]
        try:
            # Note: Running multiple pytest_cmds in sequence will produce multiple separate pytest
            # reports in the execution log, all of which must be examined during a manual review of results.
            cmd_exit_statuses = [pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds]
            if all(status == 0 for status in cmd_exit_statuses):
                sys.exit(0)
            else:
                raise RuntimeError(pytest_cmds)
        finally:
            if specific_test_type == "eks" and eks_cluster_name:
                delete_eks_cluster(eks_cluster_name)

            # Delete dangling EC2 KeyPairs
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)

    elif specific_test_type == "sagemaker":
        if "neuron" in dlc_images:
            LOGGER.info(f"Skipping sagemaker tests because Neuron is not yet supported on SM. Images: {dlc_images}")
            # Create an empty report file because the CodeBuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "neuron")
            return
        if benchmark_mode:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))

            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = ["-s", "-rA", test_path, f"--junitxml={report}", "-n=auto", "-o", "norecursedirs=resources"]
            if not is_pr_context():
                pytest_cmd += ["--efa"] if efa_dedicated else ["-m", "not efa"]
            sys.exit(pytest.main(pytest_cmd))

        else:
            run_sagemaker_remote_tests(
                [image for image in standard_images_list if not ("tensorflow-inference" in image and "py2" in image)]
            )
        metrics_utils.send_test_duration_metrics(start_time)

    elif specific_test_type == "sagemaker-local":
        if "neuron" in dlc_images:
            LOGGER.info(f"Skipping sagemaker tests because Neuron is not yet supported on SM. Images: {dlc_images}")
            # Create an empty report file because the CodeBuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "neuron")
            return
        testing_image_list = [
            image
            for image in standard_images_list
            if not (("tensorflow-inference" in image and "py2" in image) or ("eia" in image))
        ]
        run_sagemaker_local_tests(testing_image_list)
        # If every image was filtered out above (e.g. EIA-only), generate an empty report
        if len(testing_image_list) == 0:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "eia")
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. Only ec2, ecs, eks, sagemaker and sanity are currently supported."
        )
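
For reference, a minimal sketch of how this entry point could be driven locally. The environment variables are the ones read at the top of the example; the image URI, the chosen test type, and the __main__ guard are illustrative assumptions rather than part of the original runner.

# Hypothetical local invocation of the runner defined above (sketch only).
# EXECUTOR_MODE=true makes main() read the image list directly from DLC_IMAGE
# instead of calling get_dlc_images().
import os

os.environ["TEST_TYPE"] = "sanity"    # selects the "sanity" branch inside main()
os.environ["EXECUTOR_MODE"] = "true"
os.environ["DLC_IMAGE"] = "123456789012.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:example"  # placeholder URI

if __name__ == "__main__":
    main()  # assumes main() and its imports from Example #1 are in scope
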
Example #2
def main():
    # Define constants
    start_time = datetime.now()
    test_type = os.getenv("TEST_TYPE")

    efa_dedicated = os.getenv("EFA_DEDICATED", "False").lower() == "true"
    executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
    dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images()
    # When executing locally, one can provide a commit_id or omit it. Assign a default value for local executions:
    commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION",
                          default="unrecognised_commit_id")
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    # Do not create an EKS cluster when only EIA images are present
    is_all_images_list_eia = all("eia" in image_uri
                                 for image_uri in all_image_list)
    eks_cluster_name = None
    benchmark_mode = "benchmark" in test_type or is_benchmark_dev_context()
    specific_test_type = re.sub(
        "benchmark-", "", test_type) if "benchmark" in test_type else test_type
    build_context = get_build_context()

    # quick_checks tests don't have any images associated with them. Use a placeholder for such jobs.
    try:
        framework, version = get_framework_and_version_from_tag(
            all_image_list[0])
    except Exception:
        framework, version = "general_test", "none"

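    # Parameters used by pytest_cache_util below to sync this job's pytest cache with S3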
    pytest_cache_params = {
        "commit_id": commit_id,
        "framework": framework,
        "version": version,
        "build_context": build_context,
        "test_type": test_type,
    }

    # In the PR context, allow sagemaker tests to be switched to RC tests.
    # Do not allow both to be enabled at once due to capacity issues.
    if specific_test_type == "sagemaker" and is_rc_test_context(
    ) and is_pr_context():
        specific_test_type = "release_candidate_integration"

    test_path = os.path.join(
        "benchmark",
        specific_test_type) if benchmark_mode else specific_test_type

    # Skip ECS/EC2/EKS/BAI tests for HuggingFace/AutoGluon images; only sagemaker tests run for them
    is_hf_image_present = any("huggingface" in image_uri
                              for image_uri in all_image_list)
    is_ag_image_present = any("autogluon" in image_uri
                              for image_uri in all_image_list)
    if (is_hf_image_present
            or is_ag_image_present) and specific_test_type in ("ecs", "ec2",
                                                               "eks", "bai"):
        # Create an empty report file because the CodeBuild job fails without it
        LOGGER.info(
            f"NOTE: {specific_test_type} tests not supported on HF or AG. Skipping..."
        )
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        sm_utils.generate_empty_report(report, test_type, "huggingface")
        return

    if specific_test_type in (
            "sanity",
            "ecs",
            "ec2",
            "eks",
            "canary",
            "bai",
            "quick_checks",
            "release_candidate_integration",
    ):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_infer.xml")
        report_multinode_train = os.path.join(os.getcwd(), "test",
                                              "eks_multinode_train.xml")

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "bai":
            build_bai_docker_container()
        if specific_test_type == "eks" and not is_all_images_list_eia:
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow")
                if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks.")
            framework = frameworks_in_images[0]
            eks_cluster_name = f"{framework}-{build_context}"
            eks_utils.eks_setup()
            if eks_utils.is_eks_cluster_active(eks_cluster_name):
                eks_utils.eks_write_kubeconfig(eks_cluster_name)
            else:
                raise Exception(
                    f"EKS cluster {eks_cluster_name} is not in active state")

        # Execute dlc_tests pytest command
        pytest_cmd = [
            "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
        ]

        is_habana_image = any("habana" in image_uri
                              for image_uri in all_image_list)
        if specific_test_type == "ec2":
            if is_habana_image:
                context = Context()
                context.run(
                    "git clone https://github.com/HabanaAI/gaudi-test-suite.git"
                )
                context.run(
                    "tar -c -f gaudi-test-suite.tar.gz gaudi-test-suite")
            else:
                pytest_cmd += ["--reruns=1", "--reruns-delay=10"]

        if is_pr_context():
            if specific_test_type == "eks":
                pytest_cmd.append("--timeout=2340")
            else:
                if is_habana_image:
                    pytest_cmd.append("--timeout=18000")
                else:
                    pytest_cmd.append("--timeout=4860")

        pytest_cmds = [pytest_cmd]
        # Execute separate cmds for canaries and quick_checks
        if specific_test_type in ("canary", "quick_checks"):
            pytest_cmds = [[
                "-s", "-rA", f"--junitxml={report}", "-n=auto",
                f"--{specific_test_type}", "--ignore=container_tests/"
            ]]

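        # Reuse the pytest cache: rerun only previously failed tests, and fall back to the
        # full selection when the cache records no failures.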
        pytest_cmds = [
            pytest_cmd + ["--last-failed", "--last-failed-no-failures", "all"]
            for pytest_cmd in pytest_cmds
        ]
        pytest_cache_util.download_pytest_cache_from_s3_to_local(
            os.getcwd(), **pytest_cache_params)
        try:
            # Note: Running multiple pytest_cmds in sequence will produce multiple separate pytest
            # reports in the execution log, all of which must be examined during a manual review of results.
            cmd_exit_statuses = [
                pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds
            ]
            if all(status == 0 for status in cmd_exit_statuses):
                sys.exit(0)
            else:
                raise RuntimeError(pytest_cmds)
        finally:
            pytest_cache_util.upload_pytest_cache_from_local_to_s3(
                os.getcwd(), **pytest_cache_params)
            # Delete dangling EC2 KeyPairs
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)
    elif specific_test_type == "sagemaker":
        if "habana" in dlc_images:
            LOGGER.info(f"Skipping SM tests for Habana. Images: {dlc_images}")
            # Create an empty report file because the CodeBuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "habana")
            return
        if benchmark_mode:
            if "neuron" in dlc_images:
                LOGGER.info(
                    f"Skipping benchmark sm tests for Neuron. Images: {dlc_images}"
                )
                # Create an empty report file because the CodeBuild job fails without it
                report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
                sm_utils.generate_empty_report(report, test_type, "neuron")
                return
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))

            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto",
                "-o", "norecursedirs=resources"
            ]
            if not is_pr_context():
                pytest_cmd += ["--efa"] if efa_dedicated else ["-m", "not efa"]
            sys.exit(pytest.main(pytest_cmd))

        else:
            sm_remote_images = [
                image for image in standard_images_list
                if not (("tensorflow-inference" in image and "py2" in image)
                        or is_e3_image(image))
            ]
            run_sagemaker_remote_tests(sm_remote_images, pytest_cache_params)
            if standard_images_list and not sm_remote_images:
                report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
                sm_utils.generate_empty_report(report, test_type,
                                               "sm_remote_unsupported")
        metrics_utils.send_test_duration_metrics(start_time)

    elif specific_test_type == "sagemaker-local":
        if "neuron" in dlc_images:
            LOGGER.info(
                f"Skipping sagemaker tests because Neuron is not yet supported on SM. Images: {dlc_images}"
            )
            # Create an empty report file because the CodeBuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "neuron")
            return
        if "habana" in dlc_images:
            LOGGER.info(
                f"Skipping sagemaker tests because Habana is not yet supported on SM. Images: {dlc_images}"
            )
            # Create an empty report file because the CodeBuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "habana")
            return
        testing_image_list = [
            image for image in standard_images_list
            if not (("tensorflow-inference" in image and "py2" in image) or
                    ("eia" in image) or (is_e3_image(image)))
        ]
        run_sagemaker_local_tests(testing_image_list, pytest_cache_params)
        # If every image was filtered out above (e.g. EIA-only), generate an empty report
        if len(testing_image_list) == 0:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "eia")
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. Only ec2, ecs, eks, sagemaker and sanity are currently supported."
        )
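
As a brief illustration of the cache-reuse flags appended in Example #2: --last-failed restricts a run to the tests that failed in the previous session, and --last-failed-no-failures all falls back to running the full selection when the cache holds no recorded failures. A self-contained sketch with an illustrative test path, separate from the runner above:

# Sketch of the pytest last-failed behavior used in Example #2 (the test path is illustrative).
import pytest

# First run: execute the whole suite and let pytest record any failures in .pytest_cache.
first_status = pytest.main(["-s", "-rA", "tests/"])

# Second run: only previously failed tests are selected; with
# "--last-failed-no-failures all", the full suite runs when no failures were cached.
second_status = pytest.main(
    ["-s", "-rA", "tests/", "--last-failed", "--last-failed-no-failures", "all"]
)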