def main():
    """
    Entry point for running DLC tests based on the TEST_TYPE environment variable.

    Dispatches to pytest-based suites (sanity/ecs/ec2/eks, plus their benchmark
    variants) or to SageMaker tests, handling EKS cluster teardown and dangling
    EC2 keypair cleanup in all cases.
    """
    # Define constants
    test_type = os.getenv("TEST_TYPE")
    dlc_images = get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    # "example" images are excluded from the standard (non-sanity) test set
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    eks_terminable_clusters = []
    benchmark_mode = "benchmark" in test_type
    specific_test_type = re.sub("benchmark-", "",
                                test_type) if benchmark_mode else test_type
    test_path = os.path.join(
        "benchmark",
        specific_test_type) if benchmark_mode else specific_test_type

    if specific_test_type in ("sanity", "ecs", "ec2", "eks"):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "eks":
            eks_terminable_clusters = setup_eks_clusters(dlc_images)
        # Execute dlc_tests pytest command
        pytest_cmd = [
            "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
        ]
        try:
            sys.exit(pytest.main(pytest_cmd))
        finally:
            if specific_test_type == "eks" and eks_terminable_clusters:
                for cluster in eks_terminable_clusters:
                    eks_utils.delete_eks_cluster(cluster)

            # Delete dangling EC2 KeyPairs; reuse the shared delete_key_pairs
            # helper instead of duplicating its deletion loop inline.
            if specific_test_type == "ec2" and os.path.exists(
                    KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)
    elif specific_test_type == "sagemaker":
        # tensorflow-inference py2 images are excluded from SageMaker testing
        run_sagemaker_tests([
            image for image in standard_images_list
            if not ("tensorflow-inference" in image and "py2" in image)
        ])
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. "
            f"Only support ec2, ecs, eks, sagemaker and sanity currently")
def delete_key_pairs(keyfile):
    """
    Function to delete key pairs from a file in mainline context

    :param keyfile: file with all of the keys to delete; only lines containing
                    ".pem" are treated as deletable key names
    """
    # Create the EC2 client once, outside the loop — it is loop-invariant and
    # client construction is comparatively expensive.
    ec2_client = boto3.client("ec2", config=Config(retries={"max_attempts": 10}))
    with open(keyfile) as key_destroy_file:
        for key_file in key_destroy_file:
            LOGGER.info(key_file)
            if ".pem" in key_file:
                _resp, keyname = destroy_ssh_keypair(ec2_client, key_file)
                LOGGER.info(f"Deleted {keyname}")
def execute_local_tests(image, pytest_cache_params):
    """
    Run the sagemaker local tests in ec2 instance for the image

    :param image: ECR url
    :param pytest_cache_params: parameters required for :param pytest_cache_util
    :return: None
    """
    account_id = os.getenv(
        "ACCOUNT_ID",
        boto3.client("sts").get_caller_identity()["Account"])
    pytest_cache_util = PytestCache(boto3.client("s3"), account_id)
    ec2_client = boto3.client("ec2",
                              config=Config(retries={"max_attempts": 10}),
                              region_name=DEFAULT_REGION)
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_LOCAL_TEST_TYPE)
    # Rerun previously-failed tests first, using the pytest cache pulled from S3
    pytest_command += " --last-failed --last-failed-no-failures all "
    print(pytest_command)
    framework, _ = get_framework_and_version_from_tag(image)
    # Seed with a high-resolution timestamp so concurrent runs pick distinct key names
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    ec2_key_name = f"{job_type}_{tag}_sagemaker_{random.randint(1, 1000)}"
    region = os.getenv("AWS_REGION", DEFAULT_REGION)
    ec2_ami_id = UBUNTU_18_BASE_DLAMI_US_EAST_1 if region == "us-east-1" else UBUNTU_18_BASE_DLAMI_US_WEST_2
    sm_tests_tar_name = "sagemaker_tests.tar.gz"
    ec2_test_report_path = os.path.join(UBUNTU_HOME_DIR, "test",
                                        f"{job_type}_{tag}_sm_local.xml")
    # Initialized before try so the finally block can tell how far setup got
    instance_id = ""
    ec2_conn = None
    try:
        key_file = generate_ssh_keypair(ec2_client, ec2_key_name)
        print(f"Launching new Instance for image: {image}")
        instance_id, ip_address = launch_sagemaker_local_ec2_instance(
            image, ec2_ami_id, ec2_key_name, region)
        ec2_conn = ec2_utils.get_ec2_fabric_connection(instance_id, key_file,
                                                       region)
        ec2_conn.put(sm_tests_tar_name, f"{UBUNTU_HOME_DIR}")
        ec2_conn.run(
            f"$(aws ecr get-login --no-include-email --region {region})")
        try:
            ec2_conn.run(f"docker pull {image}", timeout=600)
        except invoke.exceptions.CommandTimedOut as e:
            # The pull timed out; check whether the image nonetheless arrived.
            # BUGFIX: inside an f-string "{{" renders as a single "{", so the
            # previous doubled braces produced the invalid docker template
            # '{.Repository}:{.Tag}'. Quadruple braces emit the literal
            # '{{.Repository}}:{{.Tag}}' that docker's Go templating expects.
            output = ec2_conn.run(
                f"docker images {image} --format '{{{{.Repository}}}}:{{{{.Tag}}}}'"
            ).stdout.strip("\n")
            if output != image:
                raise DLCSageMakerLocalTestFailure(
                    f"Image pull for {image} failed.\ndocker images output = {output}"
                ) from e
        ec2_conn.run(f"tar -xzf {sm_tests_tar_name}")
        kill_background_processes_and_run_apt_get_update(ec2_conn)
        with ec2_conn.cd(path):
            install_sm_local_dependencies(framework, job_type, image, ec2_conn,
                                          ec2_ami_id)
            pytest_cache_util.download_pytest_cache_from_s3_to_ec2(
                ec2_conn, path, **pytest_cache_params)
            # Workaround for mxnet cpu training images as test distributed
            # causes an issue with fabric ec2_connection
            if framework == "mxnet" and job_type == "training" and "cpu" in image:
                try:
                    ec2_conn.run(pytest_command, timeout=1000, warn=True)
                except exceptions.CommandTimedOut as exc:
                    print(f"Ec2 connection timed out for {image}, {exc}")
                finally:
                    print(f"Downloading Test reports for image: {image}")
                    # The original connection may be broken; open a fresh one
                    ec2_conn.close()
                    ec2_conn_new = ec2_utils.get_ec2_fabric_connection(
                        instance_id, key_file, region)
                    ec2_conn_new.get(
                        ec2_test_report_path,
                        os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
                    output = subprocess.check_output(
                        f"cat test/{job_type}_{tag}_sm_local.xml",
                        shell=True,
                        executable="/bin/bash")
                    pytest_cache_util.upload_pytest_cache_from_ec2_to_s3(
                        ec2_conn_new, path, **pytest_cache_params)
                    # warn=True suppressed pytest's exit code; inspect the
                    # junit XML to decide pass/fail instead
                    if 'failures="0"' not in str(output):
                        raise ValueError(
                            f"Sagemaker Local tests failed for {image}")
            else:
                ec2_conn.run(pytest_command)
                print(f"Downloading Test reports for image: {image}")
                ec2_conn.get(
                    ec2_test_report_path,
                    os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
    finally:
        # BUGFIX: only touch the connection/instance if setup got that far;
        # previously a failed launch made this block raise AttributeError on
        # ec2_conn (None), masking the original exception.
        if ec2_conn is not None:
            with ec2_conn.cd(path):
                pytest_cache_util.upload_pytest_cache_from_ec2_to_s3(
                    ec2_conn, path, **pytest_cache_params)
        if instance_id:
            print(f"Terminating Instances for image: {image}")
            ec2_utils.terminate_instance(instance_id, region)
        print(f"Destroying ssh Key_pair for image: {image}")
        destroy_ssh_keypair(ec2_client, ec2_key_name)
        # return None here to prevent errors from multiprocessing.map(). Without this it returns some object by default
        # which is causing "cannot pickle '_thread.lock' object" error
        return None
def main():
    """
    Entry point for running DLC tests based on the TEST_TYPE environment variable.

    Dispatches to pytest-based suites (sanity/ecs/ec2/eks/canary and their
    benchmark variants), SageMaker remote or benchmark tests, or SageMaker
    local tests, handling EKS cluster setup/teardown and dangling EC2 keypair
    cleanup.
    """
    # Define constants
    test_type = os.getenv("TEST_TYPE")
    dlc_images = get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    # "example" images are excluded from the standard (non-sanity) test set
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    eks_cluster_name = None
    benchmark_mode = "benchmark" in test_type
    specific_test_type = re.sub("benchmark-", "",
                                test_type) if benchmark_mode else test_type
    test_path = os.path.join(
        "benchmark",
        specific_test_type) if benchmark_mode else specific_test_type

    if specific_test_type in ("sanity", "ecs", "ec2", "eks", "canary"):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_infer.xml")

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "eks":
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow")
                if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks.")
            framework = frameworks_in_images[0]
            eks_cluster_name = setup_eks_cluster(framework)

            # Setup kubeflow
            eks_utils.setup_kubeflow(eks_cluster_name)

            # Split training and inference, and run one after the other, to prevent scheduling issues
            # Set -n=4, instead of -n=auto, because initiating too many pods simultaneously has been resulting in
            # pods timing-out while they were in the Pending state. Scheduling 4 tests (and hence, 4 pods) at once
            # seems to be an optimal configuration.
            pytest_cmds = [
                [
                    "-s", "-rA",
                    os.path.join(test_path, framework, "training"),
                    f"--junitxml={report_train}", "-n=4"
                ],
                [
                    "-s", "-rA",
                    os.path.join(test_path, framework, "inference"),
                    f"--junitxml={report_infer}", "-n=4"
                ],
            ]
        else:
            # Execute dlc_tests pytest command
            pytest_cmds = [[
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
            ]]
        # Execute separate cmd for canaries
        if specific_test_type == "canary":
            pytest_cmds = [[
                "-s", "-rA", f"--junitxml={report}", "-n=auto", "--canary",
                "--ignore=container_tests/"
            ]]
        try:
            # Note:- Running multiple pytest_cmds in a sequence will result in the execution log having two
            #        separate pytest reports, both of which must be examined in case of a manual review of results.
            cmd_exit_statuses = [
                pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds
            ]
            # Exit non-zero if any pytest invocation failed
            sys.exit(0 if all(status == 0
                              for status in cmd_exit_statuses) else 1)
        finally:
            if specific_test_type == "eks" and eks_cluster_name:
                eks_utils.delete_eks_cluster(eks_cluster_name)

            # Delete dangling EC2 KeyPairs; reuse the shared delete_key_pairs
            # helper instead of duplicating its deletion loop inline.
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)
    elif specific_test_type == "sagemaker":
        if benchmark_mode:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))

            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto",
                "-o", "norecursedirs=resources"
            ]
            sys.exit(pytest.main(pytest_cmd))
        else:
            # tensorflow-inference py2 images are excluded from SageMaker testing
            run_sagemaker_remote_tests([
                image for image in standard_images_list
                if not ("tensorflow-inference" in image and "py2" in image)
            ])
    elif specific_test_type == "sagemaker-local":
        run_sagemaker_local_tests([
            image for image in standard_images_list
            if not ("tensorflow-inference" in image and "py2" in image)
        ])
    else:
        # BUGFIX: message previously omitted canary and sagemaker-local even
        # though both are handled above.
        raise NotImplementedError(
            f"{test_type} test is not supported. "
            f"Only support ec2, ecs, eks, canary, sagemaker, sagemaker-local "
            f"and sanity currently")
# Example #5
def execute_local_tests(image, ec2_client):
    """
    Run the sagemaker local tests in ec2 instance for the image

    :param image: ECR url
    :param ec2_client: boto3_obj
    :return: None
    """
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_LOCAL_TEST_TYPE)
    print(pytest_command)
    framework, _ = get_framework_and_version_from_tag(image)
    # Seed with a high-resolution timestamp so concurrent runs pick distinct key names
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    ec2_key_name = f"{job_type}_{tag}_sagemaker_{random.randint(1, 1000)}"
    region = os.getenv("AWS_REGION", DEFAULT_REGION)
    sm_tests_tar_name = "sagemaker_tests.tar.gz"
    ec2_test_report_path = os.path.join(UBUNTU_HOME_DIR, "test",
                                        f"{job_type}_{tag}_sm_local.xml")
    # BUGFIX: initialize before the try so the finally block cannot raise
    # NameError (masking the original exception) when keypair generation or
    # the instance launch fails before instance_id is assigned.
    instance_id = ""
    try:
        key_file = generate_ssh_keypair(ec2_client, ec2_key_name)
        print(f"Launching new Instance for image: {image}")
        instance_id, ip_address = launch_sagemaker_local_ec2_instance(
            image, UBUNTU_16_BASE_DLAMI_US_EAST_1 if region == "us-east-1" else
            UBUNTU_16_BASE_DLAMI_US_WEST_2, ec2_key_name, region)
        ec2_conn = ec2_utils.get_ec2_fabric_connection(instance_id, key_file,
                                                       region)
        ec2_conn.put(sm_tests_tar_name, f"{UBUNTU_HOME_DIR}")
        ec2_conn.run(
            f"$(aws ecr get-login --no-include-email --region {region})")
        ec2_conn.run(f"docker pull {image}")
        ec2_conn.run(f"tar -xzf {sm_tests_tar_name}")
        with ec2_conn.cd(path):
            install_sm_local_dependencies(framework, job_type, image, ec2_conn)
            # Workaround for mxnet cpu training images as test distributed
            # causes an issue with fabric ec2_connection
            if framework == "mxnet" and job_type == "training" and "cpu" in image:
                try:
                    ec2_conn.run(pytest_command, timeout=1000, warn=True)
                except exceptions.CommandTimedOut as exc:
                    print(f"Ec2 connection timed out for {image}, {exc}")
                finally:
                    print(f"Downloading Test reports for image: {image}")
                    # The original connection may be broken; open a fresh one
                    ec2_conn.close()
                    ec2_conn_new = ec2_utils.get_ec2_fabric_connection(
                        instance_id, key_file, region)
                    ec2_conn_new.get(
                        ec2_test_report_path,
                        os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
                    output = subprocess.check_output(
                        f"cat test/{job_type}_{tag}_sm_local.xml",
                        shell=True,
                        executable="/bin/bash")
                    # warn=True suppressed pytest's exit code; inspect the
                    # junit XML to decide pass/fail instead
                    if 'failures="0"' not in str(output):
                        raise ValueError(
                            f"Sagemaker Local tests failed for {image}")
            else:
                ec2_conn.run(pytest_command)
                print(f"Downloading Test reports for image: {image}")
                ec2_conn.get(
                    ec2_test_report_path,
                    os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
    finally:
        # Only terminate if an instance was actually launched
        if instance_id:
            print(f"Terminating Instances for image: {image}")
            ec2_utils.terminate_instance(instance_id, region)
        print(f"Destroying ssh Key_pair for image: {image}")
        destroy_ssh_keypair(ec2_client, ec2_key_name)