def main():
    """Entry point for the DLC test runner (legacy revision).

    Reads the run configuration from environment variables (TEST_TYPE,
    EFA_DEDICATED, EXECUTOR_MODE, DLC_IMAGE) and dispatches the requested
    suite — sanity/ecs/ec2/eks/canary/bai via pytest in test/dlc_tests,
    or the sagemaker / sagemaker-local remote-test helpers — exiting the
    process with the resulting status.

    Raises:
        ValueError: EKS tests requested for images spanning more than one framework.
        RuntimeError: any pytest invocation returned a non-zero exit status.
        NotImplementedError: TEST_TYPE is not a supported test type.
    """
    # Define constants
    start_time = datetime.now()
    test_type = os.getenv("TEST_TYPE")
    efa_dedicated = os.getenv("EFA_DEDICATED", "False").lower() == "true"
    executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
    # In executor mode the image under test is supplied directly via DLC_IMAGE.
    dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    standard_images_list = [image_uri for image_uri in all_image_list if "example" not in image_uri]
    # Do not create EKS cluster for when EIA Only Images are present
    is_all_images_list_eia = all("eia" in image_uri for image_uri in all_image_list)
    eks_cluster_name = None
    benchmark_mode = "benchmark" in test_type or is_benchmark_dev_context()
    specific_test_type = re.sub("benchmark-", "", test_type) if "benchmark" in test_type else test_type
    test_path = os.path.join("benchmark", specific_test_type) if benchmark_mode else specific_test_type

    # Skipping non HuggingFace specific tests to execute only sagemaker tests
    if any("huggingface" in image_uri for image_uri in all_image_list) and specific_test_type in (
        "ecs",
        "ec2",
        "eks",
        "bai",
    ):
        # Creating an empty report file because the codebuild job fails without it
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        sm_utils.generate_empty_report(report, test_type, "huggingface")
        return

    if specific_test_type in ("sanity", "ecs", "ec2", "eks", "canary", "bai"):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test", f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test", f"{test_type}_infer.xml")
        report_multinode_train = os.path.join(os.getcwd(), "test", f"eks_multinode_train.xml")

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "bai":
            build_bai_docker_container()
        if specific_test_type == "eks" and not is_all_images_list_eia:
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow") if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks."
                )
            framework = frameworks_in_images[0]
            is_neuron = "neuron" in dlc_images
            eks_cluster_name = setup_eks_cluster(framework, is_neuron)
            setup_ssm_eks_cluster(eks_cluster_name)
            if not is_neuron:
                # setup kubeflow
                eks_utils.setup_kubeflow(eks_cluster_name)
            # Change 1: Split training and inference, and run one after the other, to prevent scheduling issues
            # Set -n=4, instead of -n=auto, because initiating too many pods simultaneously has been resulting in
            # pods timing-out while they were in the Pending state. Scheduling 4 tests (and hence, 4 pods) at once
            # seems to be an optimal configuration.
            # Change 2: Separate multi-node EKS tests from single-node tests in execution to prevent resource contention
            if not is_neuron:
                pytest_cmds = [
                    [
                        "-s",
                        "-rA",
                        os.path.join(test_path, framework, "training"),
                        f"--junitxml={report_train}",
                        "-n=4",
                        "-m",
                        "not multinode",
                    ],
                    [
                        "-s",
                        "-rA",
                        os.path.join(test_path, framework, "inference"),
                        f"--junitxml={report_infer}",
                        "-n=4",
                        "-m",
                        "not multinode",
                    ],
                    ["-s", "-rA", test_path, f"--junitxml={report_multinode_train}", "--multinode"],
                ]
            else:
                # Neuron images run only inference and multinode suites on EKS.
                pytest_cmds = [
                    [
                        "-s",
                        "-rA",
                        os.path.join(test_path, framework, "inference"),
                        f"--junitxml={report_infer}",
                        "-n=4",
                        "-m",
                        "not multinode",
                    ],
                    ["-s", "-rA", test_path, f"--junitxml={report_multinode_train}", "--multinode"],
                ]
            if is_pr_context():
                for cmd in pytest_cmds:
                    cmd.append("--timeout=2340")
        else:
            # Execute dlc_tests pytest command
            pytest_cmd = ["-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"]
            if specific_test_type == "ec2":
                pytest_cmd += ["--reruns=1", "--reruns-delay=10"]
            if is_pr_context():
                pytest_cmd.append("--timeout=4860")
            pytest_cmds = [pytest_cmd]
        # Execute separate cmd for canaries
        if specific_test_type == "canary":
            pytest_cmds = [["-s", "-rA", f"--junitxml={report}", "-n=auto", "--canary", "--ignore=container_tests/"]]
        try:
            # Note:- Running multiple pytest_cmds in a sequence will result in the execution log having two
            # separate pytest reports, both of which must be examined in case of a manual review of results.
            cmd_exit_statuses = [pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds]
            if all(status == 0 for status in cmd_exit_statuses):
                sys.exit(0)
            else:
                raise RuntimeError(pytest_cmds)
        finally:
            # Tear down the EKS cluster and clean up dangling key pairs even on failure.
            if specific_test_type == "eks" and eks_cluster_name:
                delete_eks_cluster(eks_cluster_name)
            # Delete dangling EC2 KeyPairs
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)
    elif specific_test_type == "sagemaker":
        if "neuron" in dlc_images:
            LOGGER.info(f"Skipping sagemaker tests because Neuron is not yet supported on SM. Images: {dlc_images}")
            # Creating an empty report file because the codebuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "neuron")
            return
        if benchmark_mode:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))
            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = ["-s", "-rA", test_path, f"--junitxml={report}", "-n=auto", "-o", "norecursedirs=resources"]
            if not is_pr_context():
                # EFA tests only run outside PR context, and only on dedicated EFA hardware.
                pytest_cmd += ["--efa"] if efa_dedicated else ["-m", "not efa"]
            sys.exit(pytest.main(pytest_cmd))
        else:
            # TF inference py2 images are excluded from SM remote tests.
            run_sagemaker_remote_tests(
                [image for image in standard_images_list if not ("tensorflow-inference" in image and "py2" in image)]
            )
        metrics_utils.send_test_duration_metrics(start_time)
    elif specific_test_type == "sagemaker-local":
        if "neuron" in dlc_images:
            LOGGER.info(f"Skipping sagemaker tests because Neuron is not yet supported on SM. Images: {dlc_images}")
            # Creating an empty report file because the codebuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "neuron")
            return
        testing_image_list = [
            image
            for image in standard_images_list
            if not (("tensorflow-inference" in image and "py2" in image) or ("eia" in image))
        ]
        run_sagemaker_local_tests(testing_image_list)
        # for EIA Images
        if len(testing_image_list) == 0:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "eia")
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. Only support ec2, ecs, eks, sagemaker and sanity currently"
        )
def main():
    """Entry point for the DLC test runner (pytest-cache-aware revision).

    Reads the run configuration from environment variables (TEST_TYPE,
    EFA_DEDICATED, EXECUTOR_MODE, DLC_IMAGE, CODEBUILD_RESOLVED_SOURCE_VERSION)
    and dispatches the requested suite — sanity/ecs/ec2/eks/canary/bai/
    quick_checks/release_candidate_integration via pytest in test/dlc_tests
    (with the pytest last-failed cache synced to/from S3), or the sagemaker /
    sagemaker-local remote-test helpers — exiting the process with the
    resulting status.

    Raises:
        ValueError: EKS tests requested for images spanning more than one framework.
        Exception: the expected EKS cluster is not in an active state.
        RuntimeError: any pytest invocation returned a non-zero exit status.
        NotImplementedError: TEST_TYPE is not a supported test type.
    """
    # Define constants
    start_time = datetime.now()
    test_type = os.getenv("TEST_TYPE")
    efa_dedicated = os.getenv("EFA_DEDICATED", "False").lower() == "true"
    executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
    # In executor mode the image under test is supplied directly via DLC_IMAGE.
    dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images()
    # Executing locally one can provide commit_id or may omit it. Assigning default value for local executions:
    commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", default="unrecognised_commit_id")
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    standard_images_list = [image_uri for image_uri in all_image_list if "example" not in image_uri]
    # Do not create EKS cluster for when EIA Only Images are present
    is_all_images_list_eia = all("eia" in image_uri for image_uri in all_image_list)
    eks_cluster_name = None
    benchmark_mode = "benchmark" in test_type or is_benchmark_dev_context()
    specific_test_type = re.sub("benchmark-", "", test_type) if "benchmark" in test_type else test_type
    build_context = get_build_context()

    # quick_checks tests don't have images in it. Using a placeholder here for jobs like that
    try:
        framework, version = get_framework_and_version_from_tag(all_image_list[0])
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate.
        framework, version = "general_test", "none"

    # Key used to locate this run's pytest last-failed cache in S3.
    pytest_cache_params = {
        "commit_id": commit_id,
        "framework": framework,
        "version": version,
        "build_context": build_context,
        "test_type": test_type,
    }

    # In PR context, allow us to switch sagemaker tests to RC tests.
    # Do not allow them to be both enabled due to capacity issues.
    if specific_test_type == "sagemaker" and is_rc_test_context() and is_pr_context():
        specific_test_type = "release_candidate_integration"

    test_path = os.path.join("benchmark", specific_test_type) if benchmark_mode else specific_test_type

    # Skipping non HuggingFace/AG specific tests to execute only sagemaker tests
    is_hf_image_present = any("huggingface" in image_uri for image_uri in all_image_list)
    is_ag_image_present = any("autogluon" in image_uri for image_uri in all_image_list)
    if (is_hf_image_present or is_ag_image_present) and specific_test_type in ("ecs", "ec2", "eks", "bai"):
        # Creating an empty report file because the codebuild job fails without it
        LOGGER.info(f"NOTE: {specific_test_type} tests not supported on HF or AG. Skipping...")
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        sm_utils.generate_empty_report(report, test_type, "huggingface")
        return

    if specific_test_type in (
        "sanity",
        "ecs",
        "ec2",
        "eks",
        "canary",
        "bai",
        "quick_checks",
        "release_candidate_integration",
    ):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # NOTE: the per-suite eks train/infer/multinode report paths used by an earlier
        # revision are no longer needed here; all suites write to a single junitxml report.

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "bai":
            build_bai_docker_container()
        if specific_test_type == "eks" and not is_all_images_list_eia:
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow") if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks."
                )
            framework = frameworks_in_images[0]
            # EKS clusters are long-lived and named per framework/build context; verify one is active.
            eks_cluster_name = f"{framework}-{build_context}"
            eks_utils.eks_setup()
            if eks_utils.is_eks_cluster_active(eks_cluster_name):
                eks_utils.eks_write_kubeconfig(eks_cluster_name)
            else:
                raise Exception(f"EKS cluster {eks_cluster_name} is not in active state")

        # Execute dlc_tests pytest command
        pytest_cmd = ["-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"]
        is_habana_image = any("habana" in image_uri for image_uri in all_image_list)
        if specific_test_type == "ec2":
            if is_habana_image:
                # Habana EC2 tests need the Gaudi test suite fetched and archived beforehand.
                context = Context()
                context.run("git clone https://github.com/HabanaAI/gaudi-test-suite.git")
                context.run("tar -c -f gaudi-test-suite.tar.gz gaudi-test-suite")
            else:
                pytest_cmd += ["--reruns=1", "--reruns-delay=10"]
        if is_pr_context():
            # Habana runs are much slower, hence the larger PR timeout.
            if specific_test_type == "eks":
                pytest_cmd.append("--timeout=2340")
            else:
                if is_habana_image:
                    pytest_cmd.append("--timeout=18000")
                else:
                    pytest_cmd.append("--timeout=4860")
        pytest_cmds = [pytest_cmd]
        # Execute separate cmd for canaries
        if specific_test_type in ("canary", "quick_checks"):
            pytest_cmds = [
                [
                    "-s",
                    "-rA",
                    f"--junitxml={report}",
                    "-n=auto",
                    f"--{specific_test_type}",
                    "--ignore=container_tests/",
                ]
            ]
        # Re-run only previously failed tests when a cache exists; "all" runs everything otherwise.
        pytest_cmds = [cmd + ["--last-failed", "--last-failed-no-failures", "all"] for cmd in pytest_cmds]
        pytest_cache_util.download_pytest_cache_from_s3_to_local(os.getcwd(), **pytest_cache_params)
        try:
            # Note:- Running multiple pytest_cmds in a sequence will result in the execution log having two
            # separate pytest reports, both of which must be examined in case of a manual review of results.
            cmd_exit_statuses = [pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds]
            if all(status == 0 for status in cmd_exit_statuses):
                sys.exit(0)
            else:
                raise RuntimeError(pytest_cmds)
        finally:
            # Always persist the pytest cache and clean up key pairs, even on failure.
            pytest_cache_util.upload_pytest_cache_from_local_to_s3(os.getcwd(), **pytest_cache_params)
            # Delete dangling EC2 KeyPairs
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)
    elif specific_test_type == "sagemaker":
        if "habana" in dlc_images:
            LOGGER.info(f"Skipping SM tests for Habana. Images: {dlc_images}")
            # Creating an empty report file because the codebuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "habana")
            return
        if benchmark_mode:
            if "neuron" in dlc_images:
                LOGGER.info(f"Skipping benchmark sm tests for Neuron. Images: {dlc_images}")
                # Creating an empty report file because the codebuild job fails without it
                report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
                sm_utils.generate_empty_report(report, test_type, "neuron")
                return
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))
            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = ["-s", "-rA", test_path, f"--junitxml={report}", "-n=auto", "-o", "norecursedirs=resources"]
            if not is_pr_context():
                # EFA tests only run outside PR context, and only on dedicated EFA hardware.
                pytest_cmd += ["--efa"] if efa_dedicated else ["-m", "not efa"]
            sys.exit(pytest.main(pytest_cmd))
        else:
            # TF inference py2 and E3 images are excluded from SM remote tests.
            sm_remote_images = [
                image
                for image in standard_images_list
                if not (("tensorflow-inference" in image and "py2" in image) or is_e3_image(image))
            ]
            run_sagemaker_remote_tests(sm_remote_images, pytest_cache_params)
            if standard_images_list and not sm_remote_images:
                # All candidate images were filtered out; emit an empty report so codebuild succeeds.
                report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
                sm_utils.generate_empty_report(report, test_type, "sm_remote_unsupported")
        metrics_utils.send_test_duration_metrics(start_time)
    elif specific_test_type == "sagemaker-local":
        if "neuron" in dlc_images:
            LOGGER.info(f"Skipping sagemaker tests because Neuron is not yet supported on SM. Images: {dlc_images}")
            # Creating an empty report file because the codebuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "neuron")
            return
        if "habana" in dlc_images:
            LOGGER.info(f"Skipping sagemaker tests because Habana is not yet supported on SM. Images: {dlc_images}")
            # Creating an empty report file because the codebuild job fails without it
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "habana")
            return
        testing_image_list = [
            image
            for image in standard_images_list
            if not (("tensorflow-inference" in image and "py2" in image) or ("eia" in image) or (is_e3_image(image)))
        ]
        run_sagemaker_local_tests(testing_image_list, pytest_cache_params)
        # for EIA Images
        if len(testing_image_list) == 0:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "eia")
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. Only support ec2, ecs, eks, sagemaker and sanity currently"
        )