def test_ubuntu_version(image):
    """
    Verify that the ubuntu version encoded in the image tag matches the OS
    release reported inside a running container.

    :param image: ECR image URI
    """
    ctx = Context()
    container_name = get_container_name("ubuntu-version", image)

    # Collect every "ubuntuX.Y" component of the hyphen-separated tag and keep
    # the last one (mirrors the original last-match-wins loop); empty if absent.
    matches = [part.split("ubuntu")[-1] for part in image.split("-") if part.startswith("ubuntu")]
    expected_version = matches[-1] if matches else ""

    start_container(container_name, image, ctx)
    os_release = run_cmd_on_container(container_name, ctx, "cat /etc/os-release").stdout

    assert "Ubuntu" in os_release
    assert expected_version in os_release
def test_torchvision_nms_training(pytorch_training):
    """
    Confirm the internally built torchvision binary is used by resolving the
    torch.ops.torchvision.nms operator inside the container.

    :param pytorch_training: framework fixture for pytorch training
    """
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)

    # The nms op lookup is only meaningful for older torch/torchvision pairs.
    if Version(framework_version) >= Version("1.10.0"):
        pytest.skip(
            "Skipping this test for PT 1.10.0 and onward, since torch.ops.torchvision.nms api is outdated."
        )
    is_pt151 = Version(framework_version) == Version("1.5.1")
    if is_pt151 and get_processor_from_image_uri(pytorch_training) == "gpu":
        pytest.skip("Skipping this test for PT 1.5.1 GPU Training DLC images")

    ctx = Context()
    container_name = get_container_name("torchvision-nms", pytorch_training)
    start_container(container_name, pytorch_training, ctx)

    # Resolving the op inside the container raises if the custom torchvision build is absent.
    nms_check = "import torch; import torchvision; print(torch.ops.torchvision.nms)"
    run_cmd_on_container(container_name, ctx, nms_check, executable="python")
def test_framework_version_cpu(image):
    """
    Check that the framework version in the image tag is the same as the one on a
    running container. This function tests CPU, EIA, and Neuron images.

    :param image: ECR image URI
    """
    if "gpu" in image:
        pytest.skip(
            "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu"
        )
    image_repo_name, _ = get_repository_and_tag_from_image_uri(image)
    if re.fullmatch(r"(pr-|beta-|nightly-)?tensorflow-inference(-eia)?", image_repo_name):
        # pytest.skip takes the reason positionally; `msg=` is not part of its API.
        pytest.skip(
            "TF inference for CPU/GPU/EIA does not have core tensorflow installed"
        )
    tested_framework, tag_framework_version = get_framework_and_version_from_tag(image)
    # Framework name may include a huggingface prefix. Strip it by slicing, not
    # str.lstrip: lstrip("huggingface_") removes a *character set* and can eat
    # leading characters of the framework name itself. This also matches the
    # other framework-version tests in this file.
    if tested_framework.startswith("huggingface_"):
        tested_framework = tested_framework[len("huggingface_"):]
    # Module name is torch
    if tested_framework == "pytorch":
        tested_framework = "torch"
    ctx = Context()
    container_name = get_container_name("framework-version", image)
    start_container(container_name, image, ctx)
    output = run_cmd_on_container(
        container_name,
        ctx,
        f"import {tested_framework}; print({tested_framework}.__version__)",
        executable="python")
    if is_canary_context():
        # Canary runs tolerate build-suffixes; only require the tag version as a substring.
        assert tag_framework_version in output.stdout.strip()
    else:
        assert tag_framework_version == output.stdout.strip()
def test_tf_serving_version_cpu(tensorflow_inference):
    """
    For non-huggingface non-GPU TF inference images, check that the tag version matches
    the version of TF serving in the container.

    Huggingface includes MMS and core TF, hence the versioning scheme is based off of the
    underlying tensorflow framework version, rather than the TF serving version.

    GPU inference images will be tested along side `test_framework_and_cuda_version_gpu`
    in order to be judicious about GPU resources. This test can run directly on the host,
    and thus does not require additional resources to be spun up.

    @param tensorflow_inference: ECR image URI
    """
    # Set local variable to clarify contents of fixture
    image = tensorflow_inference

    if "gpu" in image:
        pytest.skip(
            "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu"
        )
    if "neuron" in image:
        pytest.skip(
            "Neuron images will have their framework version tested in test_framework_and_neuron_sdk_version"
        )

    _, tag_framework_version = get_framework_and_version_from_tag(image)

    ctx = Context()
    container_name = get_container_name("tf-serving-version", image)
    start_container(container_name, image, ctx)

    version_output = run_cmd_on_container(
        container_name, ctx, "tensorflow_model_server --version", executable="bash"
    )
    # Allow a non-digit suffix (e.g. "-rc0") after the tag version.
    server_version_pattern = rf"TensorFlow ModelServer: {tag_framework_version}(\D+)?"
    assert re.match(server_version_pattern, version_output.stdout), (
        f"Cannot find model server version {tag_framework_version} in {version_output.stdout}"
    )

    stop_and_remove_container(container_name, ctx)
def test_sagemaker_studio_analytics_extension(training, package_name):
    """
    Verify that the given sagemaker-studio analytics package is installed and importable
    in Training DLC images whose framework version falls inside the supported window.

    :param training: training ECR image URI
    :param package_name: pip package name to check (e.g. "sagemaker-studio-analytics-extension")
    """
    framework, framework_version = test_utils.get_framework_and_version_from_tag(
        training)
    # Inclusive framework-version window in which the analytics extension is expected to ship.
    utility_package_minimum_framework_version = {
        "pytorch": "1.7",
        "tensorflow": "2.4"
    }
    utility_package_maximum_framework_version = {
        "pytorch": "1.8",
        "tensorflow": "2.6"
    }
    # Skip frameworks that never carry the extension, and versions outside [min, max].
    if framework not in utility_package_minimum_framework_version or Version(
            framework_version) < Version(
                utility_package_minimum_framework_version[framework]
            ) or Version(framework_version) > Version(
                utility_package_maximum_framework_version[framework]):
        pytest.skip(
            f"sagemaker_studio_analytics_extension is not installed in {framework} {framework_version} DLCs"
        )
    ctx = Context()
    container_name = test_utils.get_container_name(
        f"sagemaker_studio_analytics_extension-{package_name}", training)
    test_utils.start_container(container_name, training, ctx)

    # Optionally add version validation in the following steps, rather than just printing it.
    test_utils.run_cmd_on_container(container_name, ctx,
                                    f"pip list | grep -i {package_name}")
    import_package = package_name.replace("-", "_")
    # For the two listed packages only a bare import is attempted — presumably
    # because they expose no __version__ attribute; confirm against the packages.
    import_test_cmd = (f"import {import_package}" if package_name in [
        "sagemaker-studio-sparkmagic-lib", "sagemaker-studio-analytics-extension"
    ] else f"import {import_package}; print({import_package}.__version__)")
    test_utils.run_cmd_on_container(container_name,
                                    ctx,
                                    import_test_cmd,
                                    executable="python")
def test_stray_files(image):
    """
    Ensure that unnecessary build artifacts are not present in any easily visible
    or tmp directories of the image.

    :param image: ECR image URI
    """
    ctx = Context()
    container_name = get_container_name("test_tmp_dirs", image)
    start_container(container_name, image, ctx)

    # Artifact patterns that must not appear in any inspected directory
    stray_artifacts = [r"\.py"]
    # Files permitted to remain in /tmp
    allowed_tmp_files = ["hsperfdata_root"]

    def _listing(path):
        # Directory listing (including dotfiles) taken inside the container.
        return run_cmd_on_container(container_name, ctx, f"ls -A {path}")

    # /tmp: artifact-free and nothing outside the whitelist
    tmp = _listing("/tmp")
    _assert_artifact_free(tmp, stray_artifacts)
    for tmp_file in tmp.stdout.split():
        assert (
            tmp_file in allowed_tmp_files
        ), f"Found unexpected file in tmp dir: {tmp_file}. Allowed tmp files: {allowed_tmp_files}"

    # /var/tmp must always be completely empty
    var_tmp = _listing("/var/tmp")
    _assert_artifact_free(var_tmp, stray_artifacts)
    assert var_tmp.stdout.strip() == ""

    # Home and root directories must also be free of stray artifacts
    _assert_artifact_free(_listing("~"), stray_artifacts)
    _assert_artifact_free(_listing("/"), stray_artifacts)
def test_sm_profiler_pt(pytorch_training):
    """
    Stage and run the SageMaker profiler (ZCC) test suite against a PyTorch training image.

    :param pytorch_training: PyTorch training ECR image URI
    """
    processor = get_processor_from_image_uri(pytorch_training)
    if processor not in ("cpu", "gpu"):
        pytest.skip(f"Processor {processor} not supported. Skipping test.")

    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(image_framework_version) in SpecifierSet(">=1.12"):
        pytest.skip("sm profiler ZCC test is not supported in PT 1.12 and above")

    ctx = Context()

    # Stage the profiler tests in a per-image directory inside the CodeBuild workspace.
    # NOTE(review): assumes CODEBUILD_SRC_DIR is set — os.getenv returns None otherwise
    # and os.path.join would raise; confirm the CI environment guarantees it.
    profiler_tests_dir = os.path.join(
        os.getenv("CODEBUILD_SRC_DIR"), get_container_name("smprof", pytorch_training), "smprofiler_tests"
    )
    ctx.run(f"mkdir -p {profiler_tests_dir}", hide=True)

    # Download sagemaker-tests zip
    sm_tests_zip = "sagemaker-tests.zip"
    ctx.run(
        f"aws s3 cp {os.getenv('SMPROFILER_TESTS_BUCKET')}/{sm_tests_zip} {profiler_tests_dir}/{sm_tests_zip}",
        hide=True,
    )

    # PT test setup requirements: unzip the suite, then fetch and unpack the
    # CIFAR-10 and MNIST datasets inside the nested test directory.
    with ctx.prefix(f"cd {profiler_tests_dir}"):
        ctx.run(f"unzip {sm_tests_zip}", hide=True)
        with ctx.prefix("cd sagemaker-tests/tests/scripts/pytorch_scripts"):
            ctx.run("mkdir -p data", hide=True)
            ctx.run(
                "aws s3 cp s3://smdebug-testing/datasets/cifar-10-python.tar.gz data/cifar-10-batches-py.tar.gz",
                hide=True,
            )
            ctx.run("aws s3 cp s3://smdebug-testing/datasets/MNIST_pytorch.tar.gz data/MNIST_pytorch.tar.gz",
                    hide=True)
            with ctx.prefix("cd data"):
                ctx.run("tar -zxf MNIST_pytorch.tar.gz", hide=True)
                ctx.run("tar -zxf cifar-10-batches-py.tar.gz", hide=True)

    run_sm_profiler_tests(pytorch_training, profiler_tests_dir, "test_profiler_pytorch.py", processor)
def test_sm_profiler_tf(tensorflow_training):
    """
    Stage and run the SageMaker profiler test suite against a TensorFlow training image.

    :param tensorflow_training: TensorFlow training ECR image URI
    """
    if is_tf_version("1", tensorflow_training):
        pytest.skip("Skipping test on TF1, since there are no smprofiler config files for TF1")
    processor = get_processor_from_image_uri(tensorflow_training)
    if processor not in ("cpu", "gpu"):
        pytest.skip(f"Processor {processor} not supported. Skipping test.")

    ctx = Context()

    # Stage the profiler tests in a per-image directory inside the CodeBuild workspace.
    workspace = os.getenv("CODEBUILD_SRC_DIR")
    per_image_dir = get_container_name("smprof", tensorflow_training)
    profiler_tests_dir = os.path.join(workspace, per_image_dir, "smprofiler_tests")
    ctx.run(f"mkdir -p {profiler_tests_dir}", hide=True)

    # Download sagemaker-tests zip
    sm_tests_zip = "sagemaker-tests.zip"
    tests_bucket = os.getenv("SMPROFILER_TESTS_BUCKET")
    ctx.run(
        f"aws s3 cp {tests_bucket}/{sm_tests_zip} {profiler_tests_dir}/{sm_tests_zip}",
        hide=True
    )
    ctx.run(f"cd {profiler_tests_dir} && unzip {sm_tests_zip}", hide=True)

    run_sm_profiler_tests(tensorflow_training, profiler_tests_dir, "test_profiler_tensorflow.py", processor)
def test_pandas(image):
    """
    It's possible that in newer python versions, we may have issues with installing pandas
    due to lack of presence of the bz2 module in py3 containers. This is a sanity test to
    ensure that pandas import works properly in all containers.

    :param image: ECR image URI
    """
    ctx = Context()
    container_name = get_container_name("pandas", image)
    start_container(container_name, image, ctx)

    # Make sure we can install pandas, do not fail right away if there are pip check issues
    run_cmd_on_container(container_name, ctx, "pip install pandas", warn=True)

    # A clean import writes nothing to stdout; any output indicates a problem.
    import_result = run_cmd_on_container(container_name, ctx, "import pandas", executable="python")
    assert not import_result.stdout.strip(), (
        f"Expected no output when importing pandas, but got {import_result.stdout}"
    )

    # Simple import test to ensure we do not get a bz2 module import failure
    run_cmd_on_container(
        container_name, ctx, "import pandas; print(pandas.__version__)", executable="python"
    )
def test_python_version(image):
    """
    Check that the python version in the image tag is the same as the one on a
    running container.

    :param image: ECR image URI
    """
    ctx = Context()
    container_name = get_container_name("py-version", image)

    # Derive the expected interpreter version from the "pyX"/"pyXY" tag component;
    # the last matching component wins, matching the original loop semantics.
    py_version = ""
    for part in image.split("-"):
        if not part.startswith("py"):
            continue
        major = part[2]
        py_version = f"Python {major}.{part[3]}" if len(part) > 3 else f"Python {major}"

    start_container(container_name, image, ctx)
    output = run_cmd_on_container(container_name, ctx, "python --version")

    # Due to py2 deprecation, Python2 version gets streamed to stderr. Python installed via
    # Conda also appears to stream to stderr (in some cases).
    container_py_version = output.stdout + output.stderr

    assert py_version in container_py_version, f"Cannot find {py_version} in {container_py_version}"
def test_utility_packages_using_import(training):
    """
    Verify that utility packages are installed in the Training DLC image

    :param training: training ECR image URI
    """
    ctx = Context()
    container_name = test_utils.get_container_name(
        "utility_packages_using_import", training)
    test_utils.start_container(container_name, training, ctx)

    framework, framework_version = test_utils.get_framework_and_version_from_tag(
        training)
    utility_package_minimum_framework_version = {
        "mxnet": "1.8",
        "pytorch": "1.7",
        "tensorflow2": "2.4",
        "tensorflow1": "1.15",
    }
    # Only remap tensorflow to its major-version-specific key. The previous
    # one-line ternary rewrote *every* non-tensorflow framework to "tensorflow2",
    # which made mxnet/pytorch images compare against the wrong minimum version.
    if framework == "tensorflow":
        framework = "tensorflow1" if framework_version.startswith("1.") else "tensorflow2"

    if Version(framework_version) < Version(
            utility_package_minimum_framework_version[framework]):
        pytest.skip("Extra utility packages will be added going forward.")

    for package in UTILITY_PACKAGES_IMPORT:
        version = test_utils.run_cmd_on_container(
            container_name,
            ctx,
            f"import {package}; print({package}.__version__)",
            executable="python").stdout.strip()
        if package == "sagemaker":
            # Fixed NameError: the failure message previously referenced an
            # undefined `sm_version`; `version` is the value under test.
            assert Version(version) > Version(
                "2"
            ), f"Sagemaker version should be > 2.0. Found version {version}"
def test_oss_compliance(image):
    """
    Run oss compliance check on a container to check if license attribution files exist.
    And upload source of third party packages to S3 bucket.

    :param image: ECR image URI
    """
    THIRD_PARTY_SOURCE_CODE_BUCKET = "aws-dlinfra-licenses"
    THIRD_PARTY_SOURCE_CODE_BUCKET_PATH = "third_party_source_code"
    # Manifest file produced inside the container: one "name version url" entry per line.
    file = "THIRD_PARTY_SOURCE_CODE_URLS"
    container_name = get_container_name("oss_compliance", image)
    context = Context()
    local_repo_path = get_repository_local_path()
    start_container(container_name, image, context)

    # run compliance test to make sure license attribution files exists. testOSSCompliance is copied as part of Dockerfile
    run_cmd_on_container(container_name, context,
                         "/usr/local/bin/testOSSCompliance /root")

    # Copy the manifest out of the container; always remove the container afterwards.
    try:
        context.run(
            f"docker cp {container_name}:/root/{file} {os.path.join(local_repo_path, file)}"
        )
    finally:
        context.run(f"docker rm -f {container_name}", hide=True)

    s3_resource = boto3.resource("s3")

    with open(os.path.join(local_repo_path, file)) as source_code_file:
        for line in source_code_file:
            # Each manifest line is expected to split into exactly three fields.
            name, version, url = line.split(" ")
            file_name = f"{name}_v{version}_source_code"
            s3_object_path = f"{THIRD_PARTY_SOURCE_CODE_BUCKET_PATH}/{file_name}.tar.gz"
            local_file_path = os.path.join(local_repo_path, file_name)

            # Clone and tar the package source, retrying up to 3 times on failure.
            # NOTE(review): there is no `break` after a successful attempt, so the tar
            # step re-runs on every iteration even when the first attempt succeeds.
            for i in range(3):
                try:
                    if not os.path.isdir(local_file_path):
                        context.run(
                            f"git clone {url.rstrip()} {local_file_path}")
                    context.run(
                        f"tar -czvf {local_file_path}.tar.gz {local_file_path}"
                    )
                except Exception as e:
                    time.sleep(1)
                    if i == 2:
                        LOGGER.error(f"Unable to clone git repo. Error: {e}")
                        raise
                    continue

            # Upload only when the object is not already in the bucket:
            # .load() raises a ClientError with code 404 when the key is absent.
            try:
                if os.path.exists(f"{local_file_path}.tar.gz"):
                    LOGGER.info(f"Uploading package to s3 bucket: {line}")
                    s3_resource.Object(THIRD_PARTY_SOURCE_CODE_BUCKET,
                                       s3_object_path).load()
            except botocore.exceptions.ClientError as e:
                if e.response["Error"]["Code"] == "404":
                    try:
                        # using aws cli as using boto3 expects to upload folder by iterating through each file instead of entire folder.
                        context.run(
                            f"aws s3 cp {local_file_path}.tar.gz s3://{THIRD_PARTY_SOURCE_CODE_BUCKET}/{s3_object_path}"
                        )
                        object = s3_resource.Bucket(
                            THIRD_PARTY_SOURCE_CODE_BUCKET).Object(
                                s3_object_path)
                        object.Acl().put(ACL="public-read")
                    # NOTE(review): `ClientError` is referenced unqualified here while the
                    # outer handler uses botocore.exceptions.ClientError — presumably it is
                    # imported at the top of the file; verify, else this is a NameError.
                    except ClientError as e:
                        LOGGER.error(
                            f"Unable to upload source code to bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                        )
                        raise
                else:
                    LOGGER.error(
                        f"Unable to check if source code is present on bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                    )
                    raise
def _run_dependency_check_test(image, ec2_connection):
    """
    Run the OWASP dependency-check scan for ``image`` on a remote EC2 instance and fail
    if any CVE outside the allow-list is reported with CRITICAL or HIGH severity.

    :param image: ECR image URI
    :param ec2_connection: connection object used to run commands on the EC2 instance
    """
    # Record any whitelisted medium/low severity CVEs; I.E. allowed_vulnerabilities = {CVE-1000-5555, CVE-9999-9999}
    allowed_vulnerabilities = {
        # Those vulnerabilities are fixed. Current openssl version is 1.1.1g. These are false positive
        "CVE-2016-2109",
        "CVE-2016-2177",
        "CVE-2016-6303",
        "CVE-2016-2182",
        # CVE-2020-13936: vulnerability found in apache velocity package which is a dependency for dependency-check package. Hence, ignoring.
        "CVE-2020-13936",
    }
    processor = get_processor_from_image_uri(image)

    # Whitelist CVE #CVE-2021-3711 for DLCs where openssl is installed using apt-get
    framework, _ = get_framework_and_version_from_tag(image)
    # First "X.Y" found anywhere in the image URI is taken as the framework version.
    short_fw_version = re.search(r"(\d+\.\d+)", image).group(1)

    # Check that these versions have been matched on https://ubuntu.com/security/CVE-2021-3711 before adding
    allow_openssl_cve_fw_versions = {
        "tensorflow": {
            "1.15": ["cpu", "gpu", "neuron"],
            "2.3": ["cpu", "gpu"],
            "2.4": ["cpu", "gpu"],
            "2.5": ["cpu", "gpu", "neuron"],
            "2.6": ["cpu", "gpu"],
            "2.7": ["cpu", "gpu"],
        },
        "mxnet": {
            "1.8": ["neuron"],
            "1.9": ["cpu", "gpu"]
        },
        "pytorch": {
            "1.10": ["cpu"]
        },
        "huggingface_pytorch": {
            "1.8": ["cpu", "gpu"],
            "1.9": ["cpu", "gpu"]
        },
        "huggingface_tensorflow": {
            "2.4": ["cpu", "gpu"],
            "2.5": ["cpu", "gpu"]
        },
        "autogluon": {
            "0.3": ["cpu"]
        },
    }

    if processor in allow_openssl_cve_fw_versions.get(framework, {}).get(
            short_fw_version, []):
        allowed_vulnerabilities.add("CVE-2021-3711")

    container_name = f"dep_check_{processor}"
    report_addon = get_container_name("depcheck-report", image)
    dependency_check_report = f"{report_addon}.html"
    html_file = f"{container_name}:/build/dependency-check-report.html"
    test_script = os.path.join(CONTAINER_TESTS_PREFIX, "testDependencyCheck")

    # Execute test, copy results to s3
    ec2.execute_ec2_training_test(ec2_connection,
                                  image,
                                  test_script,
                                  container_name=container_name,
                                  bin_bash_entrypoint=True)
    ec2_connection.run(f"docker cp {html_file} ~/{dependency_check_report}")
    ec2_connection.run(
        f"aws s3 cp ~/{dependency_check_report} s3://dlc-dependency-check")

    # Check for any vulnerabilities not mentioned in allowed_vulnerabilities
    html_output = ec2_connection.run(f"cat ~/{dependency_check_report}",
                                     hide=True).stdout
    cves = re.findall(r">(CVE-\d+-\d+)</a>", html_output)
    vulnerabilities = set(cves) - allowed_vulnerabilities

    if vulnerabilities:
        vulnerability_severity = {}

        # Check NVD for vulnerability severity to provide this useful info in error message.
        for vulnerability in vulnerabilities:
            try:
                cve_url = f"https://services.nvd.nist.gov/rest/json/cve/1.0/{vulnerability}"

                session = requests.Session()
                session.mount(
                    "https://",
                    requests.adapters.HTTPAdapter(max_retries=Retry(
                        total=5, status_forcelist=[404, 504, 502])),
                )
                response = session.get(cve_url)

                if response.status_code == 200:
                    # Walk the NVD v1.0 JSON response defensively; unknown shape -> "UNKNOWN".
                    severity = (response.json().get("result", {}).get(
                        "CVE_Items", [{}])[0].get("impact", {}).get(
                            "baseMetricV2", {}).get("severity", "UNKNOWN"))
                    if vulnerability_severity.get(severity):
                        vulnerability_severity[severity].append(vulnerability)
                    else:
                        vulnerability_severity[severity] = [vulnerability]
            except ConnectionError:
                LOGGER.exception(
                    f"Failed to load NIST data for CVE {vulnerability}")

        # TODO: Remove this once we have whitelisted appropriate LOW/MEDIUM vulnerabilities
        if not (vulnerability_severity.get("CRITICAL")
                or vulnerability_severity.get("HIGH")):
            return

        raise DependencyCheckFailure(
            f"Unrecognized CVEs have been reported : {vulnerability_severity}. "
            f"Allowed vulnerabilities are {allowed_vulnerabilities or None}. Please see "
            f"{dependency_check_report} for more details.")
def test_framework_version_cpu(image):
    """
    Check that the framework version in the image tag is the same as the one on a running container.
    This function tests CPU, EIA images.

    :param image: ECR image URI
    """
    if "gpu" in image:
        pytest.skip(
            "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu"
        )
    if "neuron" in image:
        pytest.skip(
            "Neuron images will have their framework version tested in test_framework_and_neuron_sdk_version"
        )
    image_repo_name, _ = get_repository_and_tag_from_image_uri(image)
    # Non-gpu TF inference images carry only the serving binary, not core tensorflow.
    if re.fullmatch(
            r"(pr-|beta-|nightly-)?tensorflow-inference(-eia|-graviton)?",
            image_repo_name):
        pytest.skip(
            "Non-gpu tensorflow-inference images will be tested in test_tf_serving_version_cpu."
        )
    tested_framework, tag_framework_version = get_framework_and_version_from_tag(
        image)
    # Framework name may include huggingface
    if tested_framework.startswith('huggingface_'):
        tested_framework = tested_framework[len("huggingface_"):]
    # Module name is torch
    if tested_framework == "pytorch":
        tested_framework = "torch"
    elif tested_framework == "autogluon":
        # autogluon's version-bearing importable module is autogluon.core
        tested_framework = "autogluon.core"
    ctx = Context()
    container_name = get_container_name("framework-version", image)
    start_container(container_name, image, ctx)
    output = run_cmd_on_container(
        container_name,
        ctx,
        f"import {tested_framework}; print({tested_framework}.__version__)",
        executable="python")
    if is_canary_context():
        # Canary runs only require the tag version to appear as a substring.
        assert tag_framework_version in output.stdout.strip()
    else:
        if tested_framework == "autogluon.core":
            # 0.3.2-tagged images still report 0.3.1 from autogluon.core; accept that.
            version_to_check = "0.3.1" if tag_framework_version == "0.3.2" else tag_framework_version
            assert output.stdout.strip().startswith(version_to_check)
        # Habana v1.2 binary does not follow the X.Y.Z+cpu naming convention
        elif "habana" not in image_repo_name:
            if tested_framework == "torch" and Version(
                    tag_framework_version) >= Version("1.10.0"):
                # PT >= 1.10 CPU builds must report "X.Y.Z+cpu" exactly.
                torch_version_pattern = r"{torch_version}(\+cpu)".format(
                    torch_version=tag_framework_version)
                assert re.fullmatch(
                    torch_version_pattern, output.stdout.strip()
                ), (f"torch.__version__ = {output.stdout.strip()} does not match {torch_version_pattern}\n"
                    f"Please specify framework version as X.Y.Z+cpu")
            else:
                if "neuron" in image:
                    assert tag_framework_version in output.stdout.strip()
                if all(_string in image
                       for _string in ["pytorch", "habana", "synapseai1.3.0"]):
                    # Habana Pytorch version looks like 1.10.0a0+gitb488e78 for SynapseAI1.3 PT1.10.1 images
                    # NOTE(review): this branch sits under `"habana" not in image_repo_name`
                    # yet tests for "habana" in the image URI — confirm it is reachable.
                    pt_fw_version_pattern = r"(\d+(\.\d+){1,2}(-rc\d)?)((a0\+git\w{7}))"
                    pt_fw_version_match = re.fullmatch(pt_fw_version_pattern,
                                                       output.stdout.strip())
                    # This is desired for PT1.10.1 images
                    assert pt_fw_version_match.group(1) == "1.10.0"
                else:
                    assert tag_framework_version == output.stdout.strip()
    stop_and_remove_container(container_name, ctx)
def _run_dependency_check_test(image, ec2_connection, processor):
    """
    Run the OWASP dependency-check scan for ``image`` on a remote EC2 instance and fail
    if any CVE outside the allow-list is reported with CRITICAL or HIGH severity.

    :param image: ECR image URI
    :param ec2_connection: connection object used to run commands on the EC2 instance
    :param processor: processor string (e.g. "cpu"/"gpu") used to name the scan container
    :raises DependencyCheckFailure: when unrecognized CRITICAL/HIGH CVEs are found
    """
    # Record any whitelisted medium/low severity CVEs; I.E. allowed_vulnerabilities = {CVE-1000-5555, CVE-9999-9999}
    allowed_vulnerabilities = {
        # Those vulnerabilities are fixed. Current openssl version is 1.1.1g. These are false positive
        "CVE-2016-2109",
        "CVE-2016-2177",
        "CVE-2016-6303",
        "CVE-2016-2182",
        # CVE-2020-13936: vulnerability found in apache velocity package which is a dependency for dependency-check package. Hence, ignoring.
        "CVE-2020-13936",
    }

    container_name = f"dep_check_{processor}"
    report_addon = get_container_name("depcheck-report", image)
    dependency_check_report = f"{report_addon}.html"
    html_file = f"{container_name}:/build/dependency-check-report.html"
    test_script = os.path.join(CONTAINER_TESTS_PREFIX, "testDependencyCheck")

    # Execute test, copy results to s3
    ec2.execute_ec2_training_test(ec2_connection, image, test_script, container_name=container_name)
    ec2_connection.run(f"docker cp {html_file} ~/{dependency_check_report}")
    ec2_connection.run(f"aws s3 cp ~/{dependency_check_report} s3://dlc-dependency-check")

    # Check for any vulnerabilities not mentioned in allowed_vulnerabilities
    html_output = ec2_connection.run(f"cat ~/{dependency_check_report}", hide=True).stdout
    cves = re.findall(r">(CVE-\d+-\d+)</a>", html_output)
    vulnerabilities = set(cves) - allowed_vulnerabilities

    if vulnerabilities:
        vulnerability_severity = {}

        # Check NVD for vulnerability severity to provide this useful info in error message.
        for vulnerability in vulnerabilities:
            try:
                cve_url = f"https://services.nvd.nist.gov/rest/json/cve/1.0/{vulnerability}"

                session = requests.Session()
                session.mount(
                    "https://",
                    requests.adapters.HTTPAdapter(max_retries=Retry(total=5, status_forcelist=[404, 504, 502])),
                )
                response = session.get(cve_url)

                if response.status_code == 200:
                    # Walk the NVD v1.0 JSON response defensively; unknown shape -> "UNKNOWN".
                    severity = (
                        response.json()
                        .get("result", {})
                        .get("CVE_Items", [{}])[0]
                        .get("impact", {})
                        .get("baseMetricV2", {})
                        .get("severity", "UNKNOWN")
                    )
                    # Tally inside the 200-status branch: previously this ran after the
                    # try/except and referenced `severity`, which is unbound whenever the
                    # request failed or returned non-200, raising UnboundLocalError. This
                    # now matches the sibling two-argument variant of this helper.
                    if vulnerability_severity.get(severity):
                        vulnerability_severity[severity].append(vulnerability)
                    else:
                        vulnerability_severity[severity] = [vulnerability]
            except ConnectionError:
                LOGGER.exception(f"Failed to load NIST data for CVE {vulnerability}")

        # TODO: Remove this once we have whitelisted appropriate LOW/MEDIUM vulnerabilities
        if not (vulnerability_severity.get("CRITICAL") or vulnerability_severity.get("HIGH")):
            return

        raise DependencyCheckFailure(
            f"Unrecognized CVEs have been reported : {vulnerability_severity}. "
            f"Allowed vulnerabilities are {allowed_vulnerabilities or None}. Please see "
            f"{dependency_check_report} for more details."
        )
def test_eks_mxnet_dgl_single_node_training(mxnet_training, py3_only):
    """
    Function to create a pod using kubectl and given container image, and run
    DGL training with MXNet backend
    Args:
        :param mxnet_training: the ECR URI
    """
    # TODO: remove/update this when DGL supports MXNet 1.9
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    if Version(framework_version) >= Version('1.9.0'):
        pytest.skip("Skipping DGL tests as DGL does not yet support MXNet 1.9")

    training_result = False
    # Randomize pod/yaml names so concurrent test runs do not collide.
    rand_int = random.randint(4001, 6000)

    yaml_path = os.path.join(
        os.sep, "tmp", f"mxnet_single_node_training_dgl_{rand_int}.yaml")
    pod_name = f"mxnet-single-node-training-dgl-{rand_int}"

    ctx = Context()

    # Run container to determine dgl version
    container_name = get_container_name("dgl-mx", mxnet_training)
    ctx.run(f"docker run --name {container_name} -itd {mxnet_training}")

    dgl_version = ctx.run(
        f"docker exec --user root {container_name} python -c 'import dgl; print(dgl.__version__)'"
    ).stdout.strip()
    # Map e.g. "0.6.1" to the matching "0.6.x" branch of the dgl repository.
    dgl_major_minor = re.search(r'(^\d+.\d+).', dgl_version).group(1)
    dgl_branch = f"{dgl_major_minor}.x"

    args = (
        f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git && "
        f"cd /dgl/examples/mxnet/gcn/ && DGLBACKEND=mxnet python train.py --dataset cora"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    # NOTE(review): true division yields a float, so the rendered limit is "36.0";
    # confirm the yaml template accepts a fractional CPU quantity.
    cpu_limit = str(int(cpu_limit) / 2)

    # --gpu 0 selects device 0; --gpu -1 forces CPU training.
    if "gpu" in mxnet_training:
        args = args + " --gpu 0"
    else:
        args = args + " --gpu -1"

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": mxnet_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path,
        search_replace_dict)

    # Create the pod, wait for training to finish, and always clean the pod up.
    try:
        run("kubectl create -f {}".format(yaml_path))

        if eks_utils.is_eks_training_complete(pod_name):
            dgl_out = run("kubectl logs {}".format(pod_name)).stdout
            # "Test accuracy" in the logs marks a successful training run.
            if "Test accuracy" in dgl_out:
                training_result = True
            else:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(dgl_out)

        assert training_result, f"Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))
def test_framework_version_cpu(image):
    """
    Check that the framework version in the image tag is the same as the one on a running container.
    This function tests CPU, EIA images.

    :param image: ECR image URI
    """
    if "gpu" in image:
        pytest.skip(
            "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu"
        )
    if "neuron" in image:
        pytest.skip(
            "Neuron images will have their framework version tested in test_framework_and_neuron_sdk_version"
        )
    image_repo_name, _ = get_repository_and_tag_from_image_uri(image)
    # TF inference images carry only the serving binary, not core tensorflow.
    # NOTE(review): pytest.skip's parameter is the positional reason; the msg=
    # keyword is deprecated/removed in modern pytest — confirm the pinned version.
    if re.fullmatch(
            r"(pr-|beta-|nightly-)?tensorflow-inference(-eia|-graviton)?",
            image_repo_name):
        pytest.skip(
            msg=
            "TF inference for CPU/GPU/EIA does not have core tensorflow installed"
        )
    tested_framework, tag_framework_version = get_framework_and_version_from_tag(
        image)
    # Framework name may include huggingface
    if tested_framework.startswith('huggingface_'):
        tested_framework = tested_framework[len("huggingface_"):]
    # Module name is torch
    if tested_framework == "pytorch":
        tested_framework = "torch"
    elif tested_framework == "autogluon":
        # autogluon's version-bearing importable module is autogluon.core
        tested_framework = "autogluon.core"
    ctx = Context()
    container_name = get_container_name("framework-version", image)
    start_container(container_name, image, ctx)
    output = run_cmd_on_container(
        container_name,
        ctx,
        f"import {tested_framework}; print({tested_framework}.__version__)",
        executable="python")
    if is_canary_context():
        # Canary runs only require the tag version to appear as a substring.
        assert tag_framework_version in output.stdout.strip()
    else:
        if tested_framework == "autogluon.core":
            assert output.stdout.strip().startswith(tag_framework_version)
        # Habana v1.2 binary does not follow the X.Y.Z+cpu naming convention
        elif "habana" not in image_repo_name:
            if tested_framework == "torch" and Version(
                    tag_framework_version) >= Version("1.10.0"):
                # PT >= 1.10 CPU builds must report "X.Y.Z+cpu" exactly.
                torch_version_pattern = r"{torch_version}(\+cpu)".format(
                    torch_version=tag_framework_version)
                assert re.fullmatch(
                    torch_version_pattern, output.stdout.strip()
                ), (f"torch.__version__ = {output.stdout.strip()} does not match {torch_version_pattern}\n"
                    f"Please specify framework version as X.Y.Z+cpu")
            else:
                # NOTE(review): neuron images were skipped at the top of this test,
                # so this branch looks unreachable — confirm before relying on it.
                if "neuron" in image:
                    assert tag_framework_version in output.stdout.strip()
                else:
                    assert tag_framework_version == output.stdout.strip()
    stop_and_remove_container(container_name, ctx)