def main():
    """
    Entry point: run the DLC test suite selected by the TEST_TYPE env var.

    Reads TEST_TYPE and the image list, then dispatches to pytest-based
    suites (sanity/ecs/ec2/eks, optionally in benchmark mode) or to the
    SageMaker test runner.

    :raises ValueError: if TEST_TYPE is unset/empty.
    :raises NotImplementedError: if TEST_TYPE names an unsupported suite.
    """
    # Define constants
    test_type = os.getenv("TEST_TYPE")
    if not test_type:
        # Fail fast with a clear message instead of a TypeError below when
        # TEST_TYPE is missing from the environment.
        raise ValueError("TEST_TYPE environment variable must be set")
    dlc_images = get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    eks_terminable_clusters = []
    benchmark_mode = "benchmark" in test_type
    specific_test_type = re.sub("benchmark-", "", test_type) if benchmark_mode else test_type
    # Benchmark suites live under a "benchmark/" subdirectory of the test tree.
    test_path = os.path.join(
        "benchmark", specific_test_type) if benchmark_mode else specific_test_type
    if specific_test_type in ("sanity", "ecs", "ec2", "eks"):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))
        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "eks":
            eks_terminable_clusters = setup_eks_clusters(dlc_images)
        # Execute dlc_tests pytest command
        pytest_cmd = [
            "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
        ]
        try:
            sys.exit(pytest.main(pytest_cmd))
        finally:
            # Cleanup must run whether pytest passed, failed, or raised.
            if specific_test_type == "eks" and eks_terminable_clusters:
                for cluster in eks_terminable_clusters:
                    eks_utils.delete_eks_cluster(cluster)
            # Delete dangling EC2 KeyPairs via the shared helper instead of
            # duplicating its loop here (the inline version also re-created a
            # boto3 client once per key).
            if specific_test_type == "ec2" and os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)
    elif specific_test_type == "sagemaker":
        # tensorflow-inference py2 images are excluded from SM tests.
        run_sagemaker_tests([
            image for image in standard_images_list
            if not ("tensorflow-inference" in image and "py2" in image)
        ])
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. "
            f"Only support ec2, ecs, eks, sagemaker and sanity currently")
def delete_key_pairs(keyfile):
    """
    Function to delete key pairs from a file in mainline context

    :param keyfile: file with all of the keys to delete, one name per line
    """
    # Create the EC2 client once; the previous version rebuilt it on every
    # loop iteration, which is wasteful and behaves identically.
    ec2_client = boto3.client("ec2", config=Config(retries={"max_attempts": 10}))
    with open(keyfile) as key_destroy_file:
        for key_file in key_destroy_file:
            LOGGER.info(key_file)
            # Only lines naming a .pem key correspond to destroyable key pairs.
            if ".pem" in key_file:
                _resp, keyname = destroy_ssh_keypair(ec2_client, key_file)
                LOGGER.info(f"Deleted {keyname}")
def execute_local_tests(image, pytest_cache_params):
    """
    Run the sagemaker local tests in ec2 instance for the image

    :param image: ECR url
    :param pytest_cache_params: parameters passed through to the pytest cache
        util when syncing the .pytest_cache with S3
    :return: None
    """
    account_id = os.getenv(
        "ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"])
    pytest_cache_util = PytestCache(boto3.client("s3"), account_id)
    ec2_client = boto3.client("ec2",
                              config=Config(retries={"max_attempts": 10}),
                              region_name=DEFAULT_REGION)
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_LOCAL_TEST_TYPE)
    # Re-run only previously failed tests when a cache exists; run all otherwise.
    pytest_command += " --last-failed --last-failed-no-failures all "
    print(pytest_command)
    framework, _ = get_framework_and_version_from_tag(image)
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    ec2_key_name = f"{job_type}_{tag}_sagemaker_{random.randint(1, 1000)}"
    region = os.getenv("AWS_REGION", DEFAULT_REGION)
    ec2_ami_id = UBUNTU_18_BASE_DLAMI_US_EAST_1 if region == "us-east-1" else UBUNTU_18_BASE_DLAMI_US_WEST_2
    sm_tests_tar_name = "sagemaker_tests.tar.gz"
    ec2_test_report_path = os.path.join(UBUNTU_HOME_DIR, "test",
                                        f"{job_type}_{tag}_sm_local.xml")
    instance_id = ""
    ec2_conn = None
    try:
        key_file = generate_ssh_keypair(ec2_client, ec2_key_name)
        print(f"Launching new Instance for image: {image}")
        instance_id, ip_address = launch_sagemaker_local_ec2_instance(
            image, ec2_ami_id, ec2_key_name, region)
        ec2_conn = ec2_utils.get_ec2_fabric_connection(instance_id, key_file, region)
        ec2_conn.put(sm_tests_tar_name, f"{UBUNTU_HOME_DIR}")
        ec2_conn.run(
            f"$(aws ecr get-login --no-include-email --region {region})")
        try:
            ec2_conn.run(f"docker pull {image}", timeout=600)
        except invoke.exceptions.CommandTimedOut as e:
            # The pull may have completed despite the timeout; only fail if
            # the image is genuinely absent on the instance.
            output = ec2_conn.run(
                f"docker images {image} --format '{{.Repository}}:{{.Tag}}'"
            ).stdout.strip("\n")
            if output != image:
                raise DLCSageMakerLocalTestFailure(
                    f"Image pull for {image} failed.\ndocker images output = {output}"
                ) from e
        ec2_conn.run(f"tar -xzf {sm_tests_tar_name}")
        kill_background_processes_and_run_apt_get_update(ec2_conn)
        with ec2_conn.cd(path):
            install_sm_local_dependencies(framework, job_type, image, ec2_conn, ec2_ami_id)
            pytest_cache_util.download_pytest_cache_from_s3_to_ec2(
                ec2_conn, path, **pytest_cache_params)
            # Workaround for mxnet cpu training images as test distributed
            # causes an issue with fabric ec2_connection
            if framework == "mxnet" and job_type == "training" and "cpu" in image:
                try:
                    ec2_conn.run(pytest_command, timeout=1000, warn=True)
                except exceptions.CommandTimedOut as exc:
                    print(f"Ec2 connection timed out for {image}, {exc}")
                finally:
                    print(f"Downloading Test reports for image: {image}")
                    # Reconnect on a fresh connection; the old one may be wedged.
                    ec2_conn.close()
                    ec2_conn_new = ec2_utils.get_ec2_fabric_connection(
                        instance_id, key_file, region)
                    ec2_conn_new.get(
                        ec2_test_report_path,
                        os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
                    output = subprocess.check_output(
                        f"cat test/{job_type}_{tag}_sm_local.xml",
                        shell=True,
                        executable="/bin/bash")
                    pytest_cache_util.upload_pytest_cache_from_ec2_to_s3(
                        ec2_conn_new, path, **pytest_cache_params)
                    # Parse the junit XML for failures since warn=True
                    # suppresses the non-zero exit status.
                    if 'failures="0"' not in str(output):
                        raise ValueError(
                            f"Sagemaker Local tests failed for {image}")
            else:
                ec2_conn.run(pytest_command)
                print(f"Downloading Test reports for image: {image}")
                ec2_conn.get(
                    ec2_test_report_path,
                    os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
    finally:
        # Guard the cleanup: if key generation or instance launch failed,
        # ec2_conn is still None and instance_id is "" — the unguarded
        # version raised here and masked the original exception.
        if ec2_conn is not None:
            with ec2_conn.cd(path):
                pytest_cache_util.upload_pytest_cache_from_ec2_to_s3(
                    ec2_conn, path, **pytest_cache_params)
        if instance_id:
            print(f"Terminating Instances for image: {image}")
            ec2_utils.terminate_instance(instance_id, region)
        print(f"Destroying ssh Key_pair for image: {image}")
        destroy_ssh_keypair(ec2_client, ec2_key_name)
    # return None here to prevent errors from multiprocessing.map(). Without this it returns some object by default
    # which is causing "cannot pickle '_thread.lock' object" error
    return None
def main():
    """
    Entry point: run the DLC test suite selected by the TEST_TYPE env var.

    Supports sanity/ecs/ec2/eks/canary (pytest suites, optionally in
    benchmark mode), sagemaker (remote or benchmark), and sagemaker-local.

    :raises ValueError: if TEST_TYPE is unset/empty, or if EKS images mix
        frameworks.
    :raises NotImplementedError: if TEST_TYPE names an unsupported suite.
    """
    # Define constants
    test_type = os.getenv("TEST_TYPE")
    if not test_type:
        # Fail fast with a clear message instead of a TypeError below when
        # TEST_TYPE is missing from the environment.
        raise ValueError("TEST_TYPE environment variable must be set")
    dlc_images = get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    eks_cluster_name = None
    benchmark_mode = "benchmark" in test_type
    specific_test_type = re.sub("benchmark-", "", test_type) if benchmark_mode else test_type
    # Benchmark suites live under a "benchmark/" subdirectory of the test tree.
    test_path = os.path.join(
        "benchmark", specific_test_type) if benchmark_mode else specific_test_type
    if specific_test_type in ("sanity", "ecs", "ec2", "eks", "canary"):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test", f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test", f"{test_type}_infer.xml")
        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))
        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "eks":
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow")
                if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks.")
            framework = frameworks_in_images[0]
            eks_cluster_name = setup_eks_cluster(framework)
            # setup kubeflow
            eks_utils.setup_kubeflow(eks_cluster_name)
            # Split training and inference, and run one after the other, to prevent scheduling issues
            # Set -n=4, instead of -n=auto, because initiating too many pods simultaneously has been resulting in
            # pods timing-out while they were in the Pending state. Scheduling 4 tests (and hence, 4 pods) at once
            # seems to be an optimal configuration.
            pytest_cmds = [
                [
                    "-s", "-rA",
                    os.path.join(test_path, framework, "training"),
                    f"--junitxml={report_train}", "-n=4"
                ],
                [
                    "-s", "-rA",
                    os.path.join(test_path, framework, "inference"),
                    f"--junitxml={report_infer}", "-n=4"
                ],
            ]
        else:
            # Execute dlc_tests pytest command
            pytest_cmds = [[
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
            ]]
        # Execute separate cmd for canaries
        if specific_test_type == "canary":
            pytest_cmds = [[
                "-s", "-rA", f"--junitxml={report}", "-n=auto", "--canary",
                "--ignore=container_tests/"
            ]]
        try:
            # Note:- Running multiple pytest_cmds in a sequence will result in the execution log having two
            # separate pytest reports, both of which must be examined in case of a manual review of results.
            cmd_exit_statuses = [
                pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds
            ]
            # Exit non-zero if any sub-run failed. (Plain statement instead of
            # the previous conditional expression used only for side effects.)
            sys.exit(0 if all(status == 0 for status in cmd_exit_statuses) else 1)
        finally:
            # Cleanup must run whether pytest passed, failed, or raised.
            if specific_test_type == "eks" and eks_cluster_name:
                eks_utils.delete_eks_cluster(eks_cluster_name)
            # Delete dangling EC2 KeyPairs via the shared helper instead of
            # duplicating its loop here (the inline version also re-created a
            # boto3 client once per key).
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)
    elif specific_test_type == "sagemaker":
        if benchmark_mode:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))
            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto",
                "-o", "norecursedirs=resources"
            ]
            sys.exit(pytest.main(pytest_cmd))
        else:
            # tensorflow-inference py2 images are excluded from SM tests.
            run_sagemaker_remote_tests([
                image for image in standard_images_list
                if not ("tensorflow-inference" in image and "py2" in image)
            ])
    elif specific_test_type == "sagemaker-local":
        run_sagemaker_local_tests([
            image for image in standard_images_list
            if not ("tensorflow-inference" in image and "py2" in image)
        ])
    else:
        # Keep the message in sync with the branches above.
        raise NotImplementedError(
            f"{test_type} test is not supported. "
            f"Only support ec2, ecs, eks, canary, sagemaker, sagemaker-local and sanity currently")
def execute_local_tests(image, ec2_client):
    """
    Run the sagemaker local tests in ec2 instance for the image

    :param image: ECR url
    :param ec2_client: boto3_obj
    :return: None
    """
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_LOCAL_TEST_TYPE)
    print(pytest_command)
    framework, _ = get_framework_and_version_from_tag(image)
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    ec2_key_name = f"{job_type}_{tag}_sagemaker_{random.randint(1, 1000)}"
    region = os.getenv("AWS_REGION", DEFAULT_REGION)
    sm_tests_tar_name = "sagemaker_tests.tar.gz"
    ec2_test_report_path = os.path.join(UBUNTU_HOME_DIR, "test",
                                        f"{job_type}_{tag}_sm_local.xml")
    # Initialize before the try so the finally block never hits an unbound
    # name when key generation or the instance launch fails.
    instance_id = ""
    try:
        key_file = generate_ssh_keypair(ec2_client, ec2_key_name)
        print(f"Launching new Instance for image: {image}")
        instance_id, ip_address = launch_sagemaker_local_ec2_instance(
            image,
            UBUNTU_16_BASE_DLAMI_US_EAST_1 if region == "us-east-1" else UBUNTU_16_BASE_DLAMI_US_WEST_2,
            ec2_key_name, region)
        ec2_conn = ec2_utils.get_ec2_fabric_connection(instance_id, key_file, region)
        ec2_conn.put(sm_tests_tar_name, f"{UBUNTU_HOME_DIR}")
        ec2_conn.run(
            f"$(aws ecr get-login --no-include-email --region {region})")
        ec2_conn.run(f"docker pull {image}")
        ec2_conn.run(f"tar -xzf {sm_tests_tar_name}")
        with ec2_conn.cd(path):
            install_sm_local_dependencies(framework, job_type, image, ec2_conn)
            # Workaround for mxnet cpu training images as test distributed
            # causes an issue with fabric ec2_connection
            if framework == "mxnet" and job_type == "training" and "cpu" in image:
                try:
                    ec2_conn.run(pytest_command, timeout=1000, warn=True)
                except exceptions.CommandTimedOut as exc:
                    print(f"Ec2 connection timed out for {image}, {exc}")
                finally:
                    print(f"Downloading Test reports for image: {image}")
                    # Reconnect on a fresh connection; the old one may be wedged.
                    ec2_conn.close()
                    ec2_conn_new = ec2_utils.get_ec2_fabric_connection(
                        instance_id, key_file, region)
                    ec2_conn_new.get(
                        ec2_test_report_path,
                        os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
                    output = subprocess.check_output(
                        f"cat test/{job_type}_{tag}_sm_local.xml",
                        shell=True,
                        executable="/bin/bash")
                    # Parse the junit XML for failures since warn=True
                    # suppresses the non-zero exit status.
                    if 'failures="0"' not in str(output):
                        raise ValueError(
                            f"Sagemaker Local tests failed for {image}")
            else:
                ec2_conn.run(pytest_command)
                print(f"Downloading Test reports for image: {image}")
                ec2_conn.get(
                    ec2_test_report_path,
                    os.path.join("test", f"{job_type}_{tag}_sm_local.xml"))
    finally:
        # Only terminate when an instance was actually launched; the
        # unguarded version raised NameError here on early failure and
        # masked the original exception.
        if instance_id:
            print(f"Terminating Instances for image: {image}")
            ec2_utils.terminate_instance(instance_id, region)
        print(f"Destroying ssh Key_pair for image: {image}")
        destroy_ssh_keypair(ec2_client, ec2_key_name)