Example #1
def test_git_secrets():
    ctx = Context()
    repository_path = os.getenv("CODEBUILD_SRC_DIR")
    if not repository_path:
        repository_path = _recursive_find_repo_path()
    LOGGER.info(f"repository_path = {repository_path}")

    # To make the scan fail for testing, replace the regex pattern below with a string that matches it:
    SOME_FAKE_CREDENTIALS = "ASIA[A-Z0-9]{16}"
    WHITELISTED_CREDENTIALS = "AKIAIOSFODNN7EXAMPLE"
    # End of Test Section

    with ctx.cd(repository_path):
        ctx.run("git clone https://github.com/awslabs/git-secrets.git")
        with ctx.cd("git-secrets"):
            ctx.run("make install")
        ctx.run("git secrets --install")
        ctx.run("git secrets --register-aws")
        output = ctx.run("git secrets --list")
        LOGGER.info(f"\n--COMMAND--\n{output.command}\n"
                    f"--STDOUT--\n{output.stdout}\n"
                    f"--STDERR--\n{output.stderr}\n"
                    f"----------")
        scan_results = ctx.run("git secrets --scan", hide=True, warn=True)
        LOGGER.info(f"\n--COMMAND--\n{scan_results.command}\n"
                    f"--STDOUT--\n{scan_results.stdout}\n"
                    f"--STDERR--\n{scan_results.stderr}"
                    f"----------")
    assert scan_results.ok, scan_results.stderr
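
The assertion above relies on invoke's Result semantics: with warn=True a nonzero exit does not raise, and a Result's truthiness follows its .ok attribute. A minimal standalone sketch of that pattern (assuming only that invoke is installed):

from invoke.context import Context

ctx = Context()
result = ctx.run("false", warn=True, hide=True)  # warn=True: no UnexpectedExit on nonzero status
print(result.ok)           # False
print(result.return_code)  # 1
print(bool(result))        # False -- truthiness mirrors .ok
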
Example #2
def daemon_runner(pytestconfig, data_dir, downloads_dir, working_dir):
    """
    Provide an invoke `Local` object that has started the arduino-cli in daemon mode.
    This makes it simple to start the daemon and to kill it when the test has
    finished, via the kill() function.

    Useful reference:
        http://docs.pyinvoke.org/en/1.4/api/runners.html#invoke.runners.Local
        http://docs.pyinvoke.org/en/1.4/api/runners.html
    """
    cli_full_line = os.path.join(str(pytestconfig.rootdir), "..", "arduino-cli daemon")
    env = {
        "ARDUINO_DATA_DIR": data_dir,
        "ARDUINO_DOWNLOADS_DIR": downloads_dir,
        "ARDUINO_SKETCHBOOK_DIR": data_dir,
    }
    os.makedirs(os.path.join(data_dir, "packages"))
    run_context = Context()
    # Context.cd() returns a context manager and is a no-op when called bare;
    # since Local.run() is invoked directly (bypassing Context.run), apply the
    # working directory as part of the command itself.
    # Local is the concrete implementation of the Runner abstract class
    runner = Local(run_context)
    runner.run(
        f"cd {working_dir} && {cli_full_line}",
        echo=False, hide=True, warn=True, env=env, asynchronous=True,
    )

    # we block here until the test function using this fixture has returned
    yield runner

    # Kill the runner's process as we finished our test (platform dependent)
    os_signal = signal.SIGTERM
    if platform.system() != "Windows":
        os_signal = signal.SIGKILL
    os.kill(runner.process.pid, os_signal)
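
A hypothetical test consuming this fixture could check liveness the same way the fixture's own teardown reaches the process, through runner.process (a subprocess.Popen):

def test_daemon_is_alive(daemon_runner):
    # poll() returns None while the daemon process is still running
    assert daemon_runner.process.poll() is None
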
Example #3
def _run_eks_mxnet_multi_node_training(namespace, app_name, job_name, remote_yaml_file_path, unique_id):
    """Run MXNet distributed training on EKS using MXNet Operator
    Args:
    namespace, app_name, job_name, remote_yaml_file_path, unique_id
    """

    kubeflow_version = "v0.4.1"
    home_dir = run("echo $HOME").stdout.strip("\n")
    path_to_ksonnet_app = os.path.join(home_dir, f"mxnet_multi_node_eks_test-{unique_id}")
    env = f"{namespace}-env"

    training_result = False

    ctx = Context()

    # Namespaces will allow parallel runs on the same cluster. Create the namespace if it doesn't exist.
    does_namespace_exist = ctx.run(f"kubectl get namespace | grep {namespace}", warn=True)
    if not does_namespace_exist:
        ctx.run(f"kubectl create namespace {namespace}")
    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(f"{path_to_ksonnet_app}"):
        ctx.run(f"rm -rf {app_name}")
        github_handler = GitHubHandler("aws", "kubeflow")
        github_token = github_handler.get_auth_token()
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")
            # Check whether the kubeflow registry exists; create it if not. The registry
            # will be available in each pod.
            does_registry_exist = ctx.run("ks registry list | grep kubeflow", warn=True)
            if not does_registry_exist:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{kubeflow_version}/kubeflow",
                    env={"GITHUB_TOKEN": github_token},
                    hide=True,
                )
                ctx.run(
                    f"ks pkg install kubeflow/mxnet-job@{kubeflow_version}",
                    env={"GITHUB_TOKEN": github_token},
                    hide=True,
                )

            ctx.run("ks generate mxnet-operator mxnet-operator", hide=True)

            try:
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                LOGGER.debug(f"ks apply {env} -c mxnet-operator -n {namespace}")
                ctx.run(f"ks apply {env} -c mxnet-operator -n {namespace}")
                # Delete any old job with the same name, if one exists
                ctx.run(f"kubectl delete -f {remote_yaml_file_path}", warn=True)
                ctx.run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}")
                if is_mxnet_eks_multinode_training_complete(job_name, namespace):
                    training_result = True
            finally:
                eks_utils.eks_multinode_cleanup("", job_name, namespace, env)

    return training_result
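
As a side note, the grep pipeline above can be replaced by querying the namespace directly: kubectl exits nonzero when the object is absent, so Result.ok doubles as an existence flag. A sketch (namespace value illustrative):

from invoke.context import Context

ctx = Context()
namespace = "my-test-namespace"  # illustrative
if not ctx.run(f"kubectl get namespace {namespace}", warn=True, hide=True).ok:
    ctx.run(f"kubectl create namespace {namespace}")
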
Example #4
    def generate_sagemaker_reports(self):
        """
        Append SageMaker data to the report
        """
        ctx = Context()
        git_repo_path = get_repository_local_path()

        for repo in self.SM_REPOS:
            framework, job_type = repo.split(os.sep)
            pytest_framework_path = os.path.join(git_repo_path, "test",
                                                 "sagemaker_tests", framework,
                                                 job_type)
            with ctx.cd(pytest_framework_path):
                # We need to install requirements in order to use the SM pytest frameworks
                venv = os.path.join(pytest_framework_path,
                                    f".{repo.replace('/', '-')}")
                ctx.run(f"virtualenv {venv}")
                with ctx.prefix(
                        f"source {os.path.join(venv, 'bin', 'activate')}"):
                    ctx.run("pip install -r requirements.txt", warn=True)

                    # TF inference separates remote/local conftests, and must be handled differently
                    if framework == "tensorflow" and job_type == "inference":
                        with ctx.cd(
                                os.path.join(pytest_framework_path, "test",
                                             "integration")):
                            # Handle local tests
                            ctx.run(
                                f"{self.COVERAGE_DOC_COMMAND} --framework-version 2 local/",
                                hide=True)
                            # Handle remote integration tests
                            ctx.run(f"{self.COVERAGE_DOC_COMMAND} sagemaker/",
                                    hide=True)
                    else:
                        ctx.run(f"{self.COVERAGE_DOC_COMMAND} integration/",
                                hide=True)

        # Handle TF inference remote tests
        tf_inf_path = os.path.join(git_repo_path, "test", "sagemaker_tests",
                                   "tensorflow", "inference")

        with ctx.cd(tf_inf_path):
            # Install TF inference pip requirements
            ctx.run(f"virtualenv .tf_inference")
            with ctx.prefix(
                    f"source {os.path.join(tf_inf_path, '.tf_inference', 'bin', 'activate')}"
            ):
                ctx.run("pip install -r requirements.txt", warn=True)
                with ctx.cd(os.path.join(tf_inf_path, "test", "integration")):
                    # Handle local tests
                    ctx.run(
                        f"{self.COVERAGE_DOC_COMMAND} --framework-version 2 local/"
                    )

                    # Handle remote integration tests
                    ctx.run(f"{self.COVERAGE_DOC_COMMAND} sagemaker/")
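
The ctx.prefix() pattern used throughout this example works by joining the prefix onto every subsequent ctx.run command with &&, which is why the virtualenv activation persists across run() calls. A minimal sketch (assumes a venv at .venv):

from invoke.context import Context

ctx = Context()
with ctx.prefix("source .venv/bin/activate"):
    # effectively executes: source .venv/bin/activate && python -c '...'
    ctx.run("python -c 'import sys; print(sys.prefix)'")
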
Example #5
def build_library(model: Enclave, mode: str):
    model.generate_state()
    model.generate_forward(mode)
    context = Context()
    with context.cd(cfg.get_ennclave_home()):
        if mode == 'sgx':
            model.generate_config()
            context.run('build/backend_sgx_encryptor')

        with context.cd("build"):  # TODO: make more robust
            context.run(f"make backend_{mode}")
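
One way the "make more robust" TODO could be addressed is to create and configure the build directory before invoking make. A sketch reusing the names from build_library above, under the assumption that a CMakeLists.txt lives at the ennclave home:

with context.cd(cfg.get_ennclave_home()):
    context.run("mkdir -p build")
    with context.cd("build"):
        context.run("cmake ..", hide=True)  # idempotent: (re)generates build files
        context.run(f"make backend_{mode}")
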
Example #6
def run_sagemaker_test_in_executor(image, num_of_instances, instance_type):
    """
    Run pytest in a virtual env for a particular image

    Expected to run under multi-threading

    :param num_of_instances: <int> number of instances the image test requires
    :param instance_type: type of sagemaker instance the test needs
    :param image: ECR url
    :return:
    """
    import log_return

    LOGGER.info("Started running SageMaker test.....")
    pytest_command, path, tag, job_type = sm_utils.generate_sagemaker_pytest_cmd(image, "sagemaker")

    # Update the resource pool, then wrap the test run in try/except so the pool can also be updated on failure
    try:
        log_return.update_pool("running", instance_type, num_of_instances, job_type)
        context = Context()
        with context.cd(path):
            context.run(f"python3 -m virtualenv {tag}")
            with context.prefix(f"source {tag}/bin/activate"):
                context.run("pip install -r requirements.txt", warn=True)
                context.run(pytest_command)
    except Exception as e:
        LOGGER.error(e)
        return False

    return True
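
The docstring says this is expected to run under multi-threading; a hypothetical driver (image list and instance parameters are illustrative) might look like:

from concurrent.futures import ThreadPoolExecutor

images = ["123456789012.dkr.ecr.us-west-2.amazonaws.com/repo:tag"]  # illustrative ECR URLs

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [
        executor.submit(run_sagemaker_test_in_executor, image, 1, "ml.p3.2xlarge")
        for image in images
    ]
    results = [future.result() for future in futures]  # each result is True or False
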
Example #7
def _run(cmd_string):
    # cli_path, working_dir and env come from the enclosing fixture scope
    cli_full_line = "{} {}".format(cli_path, cmd_string)
    run_context = Context()
    with run_context.cd(working_dir):
        return run_context.run(
            cli_full_line, echo=False, hide=True, warn=True, env=env
        )
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):

    # This sleep staggers start times: the parametrized training jobs are launched around the same
    # time from the same image URI, so SageMaker would otherwise create them with the same name.
    time.sleep(
        random.Random(x=f"{tensorflow_training}{num_nodes}").random() * 60)

    framework_version = re.search(r"[12](\.\d+){2}",
                                  tensorflow_training).group()
    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"> {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

        ctx.run(
            f"aws s3 cp {log_file} {os.path.join(target_upload_location, log_file)}"
        )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
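
The return_code == 124 branch treats expiry of the 45-minute timeout wrapper differently from a real failure: GNU coreutils timeout exits with status 124 when the time limit is hit. A quick illustration:

from invoke.context import Context

ctx = Context()
res = ctx.run("timeout 1 sleep 5", warn=True, hide=True)
print(res.return_code)  # 124 -- killed by timeout, not by the command's own error
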
Example #9
def build_bai_docker_container():
    """
    Builds a docker container with the necessary script requirements (bash 5.0+, conda)
    """
    # Assuming we are in dlc_tests directory
    docker_dir = os.path.join("benchmark", "bai", "docker")
    ctx = Context()
    with ctx.cd(docker_dir):
        ctx.run("docker build -t bai_env_container -f Dockerfile .")
Example #10
def common(backend: str):
    target_dir = join(cfg.get_ennclave_home(), 'backend', 'generated')

    preamble_backend = backend
    if backend == 'sgx':
        preamble_backend = 'sgx_enclave'

    with open(join(target_dir, f'{backend}_forward.cpp'), 'w+') as forward_file:
        forward_file.write(templates.preamble.render(backend=preamble_backend))
        forward_file.write(
            f"print_out(\"Hello, this is backend {backend}\\n\");")
        forward_file.write(templates.postamble)

    # Create an empty parameters file
    with open(join(target_dir, 'parameters.bin'), 'w'):
        pass

    with open(join(target_dir, 'sgx_config.xml'), 'w') as config_file:
        config_file.write("""     
<EnclaveConfiguration>
  <ProdID>0</ProdID>
  <ISVSVN>0</ISVSVN>
  <StackMaxSize>0x40000</StackMaxSize>
  <HeapInitSize>0x7e00000</HeapInitSize>
  <HeapMaxSize>0x7e00000</HeapMaxSize>
  <TCSNum>10</TCSNum>
  <TCSPolicy>1</TCSPolicy>
  <!-- Recommend changing 'DisableDebug' to 1 to make the sgx undebuggable for sgx release -->
  <DisableDebug>0</DisableDebug>
  <MiscSelect>0</MiscSelect>
  <MiscMask>0xFFFFFFFF</MiscMask>
</EnclaveConfiguration>""")

    context = Context()
    with context.cd(cfg.get_ennclave_home()):
        context.run('mkdir -p build')
        with context.cd('build'):
            # context.run('cmake ..')
            context.run(f'make backend_{backend}')

    if backend == 'native':
        ennclave.native_forward(b'', 0, 0)
    else:
        ennclave.sgx_forward(b'', 0, 0)
Example #11
def setup_sm_benchmark_tf_train_env(resources_location, setup_tf1_env,
                                    setup_tf2_env):
    """
    Create a virtual environment for benchmark tests if it doesn't already exist, and download all necessary scripts

    :param resources_location: <str> directory in which test resources should be placed
    :param setup_tf1_env: <bool> True if tf1 resources need to be set up
    :param setup_tf2_env: <bool> True if tf2 resources need to be set up
    :return: absolute path to the location of the virtual environment
    """
    ctx = Context()

    tf_resource_dir_list = []
    if setup_tf1_env:
        tf_resource_dir_list.append("tensorflow1")
    if setup_tf2_env:
        tf_resource_dir_list.append("tensorflow2")

    for resource_dir in tf_resource_dir_list:
        with ctx.cd(os.path.join(resources_location, resource_dir)):
            if not os.path.isdir(
                    os.path.join(resources_location, resource_dir, "horovod")):
                # v0.19.4 is the last version for which horovod example tests are py2 compatible
                ctx.run(
                    "git clone -b v0.19.4 https://github.com/horovod/horovod.git"
                )
            if not os.path.isdir(
                    os.path.join(resources_location, resource_dir,
                                 "deep-learning-models")):
                # We clone branch tf2 for both 1.x and 2.x tests because tf2 branch contains all necessary files
                ctx.run(
                    "git clone -b tf2 https://github.com/aws-samples/deep-learning-models.git"
                )

    venv_dir = os.path.join(resources_location, "sm_benchmark_venv")
    if not os.path.isdir(venv_dir):
        ctx.run(f"virtualenv {venv_dir}")
        with ctx.prefix(f"source {venv_dir}/bin/activate"):
            ctx.run(
                "pip install 'sagemaker>=2,<3' awscli boto3 botocore six==1.11"
            )

            # The SageMaker TF estimator only accepts framework versions up to 2.1.0 as py2 compatible.
            # Work around this by patching the version list in the installed estimator:
            estimator_location = ctx.run(
                "echo $(pip3 show sagemaker |grep 'Location' |sed s/'Location: '//g)/sagemaker/tensorflow/estimator.py"
            ).stdout.strip("\n")
            system = ctx.run("uname -s").stdout.strip("\n")
            sed_input_arg = "'' " if system == "Darwin" else ""
            ctx.run(
                f"sed -i {sed_input_arg}'s/\\[2, 1, 0\\]/\\[2, 1, 1\\]/g' {estimator_location}"
            )
    return venv_dir
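
The uname -s branch exists because BSD sed on macOS requires an explicit (possibly empty) backup-suffix argument after -i, while GNU sed takes none. A standalone sketch of the same branching, using platform instead of shelling out:

import platform

# macOS (BSD sed): sed -i '' 's/a/b/g' file   |   Linux (GNU sed): sed -i 's/a/b/g' file
sed_input_arg = "'' " if platform.system() == "Darwin" else ""
print(f"sed -i {sed_input_arg}'s/foo/bar/g' somefile.txt")  # the command string this builds
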
Example #12
def execute_sagemaker_remote_tests(image):
    """
    Run pytest in a virtual env for a particular image
    Expected to run via multiprocessing
    :param image: ECR url
    """
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_REMOTE_TEST_TYPE)
    context = Context()
    with context.cd(path):
        context.run(f"virtualenv {tag}")
        with context.prefix(f"source {tag}/bin/activate"):
            context.run("pip install -r requirements.txt", warn=True)
            res = context.run(pytest_command, warn=True)
            metrics_utils.send_test_result_metrics(res.return_code)
def run_sagemaker_pytest_cmd(image):
    """
    Run pytest in a virtual env for a particular image

    Expected to run via multiprocessing

    :param image: ECR url
    """
    pytest_command, path, tag = generate_sagemaker_pytest_cmd(image)

    context = Context()
    with context.cd(path):
        context.run(f"virtualenv {tag}")
        with context.prefix(f"source {tag}/bin/activate"):
            context.run("pip install -r requirements.txt", warn=True)
            context.run(pytest_command)
Example #14
def autoformat(
    context: Context = CONTEXT,
    filepaths: Optional[Iterable[str]] = None,
    staged: bool = False,
):
    """Autoformat Python code."""
    if get_staged_status is None or stash_unstaged_changes is None:
        print('Cannot autoformat; missing required autohooks module.')
        return
    commands = [
        # https://isort.readthedocs.io/en/latest/
        'isort',
        # https://github.com/psf/black
        'black -S -q',
        # https://github.com/myint/autoflake
        'autoflake --imports=apps,django,requests,typing,urllib3 --ignore-init-module-imports -i -r',  # noqa: E501
    ]
    filepaths = list(filepaths or [])
    if staged:
        staged_filepaths = get_staged_status()
        filepaths += staged_filepaths
    filepaths = [
        filepath for filepath in filepaths if filepath.endswith('.py')
    ]
    if filepaths:
        commands.append(
            'unify --in-place')  # does not support recursion (directories)
    if staged:
        if not filepaths:
            return
        with stash_unstaged_changes(staged_filepaths):
            for filepath in filepaths:
                for command in commands:
                    context.run(f'{command} {filepath}', warn=True)
    elif filepaths:
        for filepath in filepaths:
            for command in commands:
                context.run(f'{command} {filepath}', warn=True)
    else:
        with context.cd(settings.BASE_DIR):
            for command in commands:
                context.run(f'{command} .')
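
A hypothetical direct call (paths illustrative; non-.py entries are filtered out by the function itself):

autoformat(filepaths=["apps/models.py", "README.md"], staged=False)
# Only apps/models.py is formatted; README.md is dropped by the .py filter.
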
def execute_sagemaker_remote_tests(process_index, image, global_pytest_cache,
                                   pytest_cache_params):
    """
    Run pytest in a virtual env for a particular image. Creates a custom directory for each thread for pytest cache file.
    Stores pytest cache in a shared dict.  
    Expected to run via multiprocessing
    :param process_index - id for process. Used to create a custom cache dir 
    :param image - ECR url
    :param global_pytest_cache - shared Manager().dict() for cache merging
    :param pytest_cache_params - parameters required for s3 file path building
    """
    account_id = os.getenv(
        "ACCOUNT_ID",
        boto3.client("sts").get_caller_identity()["Account"])
    pytest_cache_util = PytestCache(boto3.client("s3"), account_id)
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(
        image, SAGEMAKER_REMOTE_TEST_TYPE)
    context = Context()
    with context.cd(path):
        context.run(f"virtualenv {tag}")
        with context.prefix(f"source {tag}/bin/activate"):
            context.run("pip install -r requirements.txt", warn=True)
            pytest_cache_util.download_pytest_cache_from_s3_to_local(
                path,
                **pytest_cache_params,
                custom_cache_directory=str(process_index))
            # adding -o cache_dir with a custom directory name
            pytest_command += f" -o cache_dir={os.path.join(str(process_index), '.pytest_cache')}"
            res = context.run(pytest_command, warn=True)
            metrics_utils.send_test_result_metrics(res.return_code)
            cache_json = pytest_cache_util.convert_pytest_cache_file_to_json(
                path, custom_cache_directory=str(process_index))
            global_pytest_cache.update(cache_json)
            if res.failed:
                raise DLCSageMakerRemoteTestFailure(
                    f"{pytest_command} failed with error code: {res.return_code}\n"
                    f"Traceback:\n{res.stdout}")
    return None
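
A sketch of the multiprocessing wiring the docstring implies (image list and cache parameters are illustrative):

from multiprocessing import Manager, Process

images = ["123456789012.dkr.ecr.us-west-2.amazonaws.com/repo:tag"]  # illustrative
pytest_cache_params = {"commit_id": "abc1234", "framework": "tensorflow"}  # illustrative keys

manager = Manager()
global_pytest_cache = manager.dict()  # shared across processes for cache merging
processes = [
    Process(target=execute_sagemaker_remote_tests,
            args=(index, image, global_pytest_cache, pytest_cache_params))
    for index, image in enumerate(images)
]
for process in processes:
    process.start()
for process in processes:
    process.join()
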
Example #16
def setup_sm_benchmark_tf_train_env(resources_location, setup_tf1_env,
                                    setup_tf2_env):
    """
    Create a virtual environment for benchmark tests if it doesn't already exist, and download all necessary scripts
    :param resources_location: <str> directory in which test resources should be placed
    :param setup_tf1_env: <bool> True if tf1 resources need to be set up
    :param setup_tf2_env: <bool> True if tf2 resources need to be set up
    :return: absolute path to the location of the virtual environment
    """
    ctx = Context()

    tf_resource_dir_list = []
    if setup_tf1_env:
        tf_resource_dir_list.append("tensorflow1")
    if setup_tf2_env:
        tf_resource_dir_list.append("tensorflow2")

    for resource_dir in tf_resource_dir_list:
        with ctx.cd(os.path.join(resources_location, resource_dir)):
            if not os.path.isdir(
                    os.path.join(resources_location, resource_dir, "horovod")):
                ctx.run("git clone https://github.com/horovod/horovod.git")
            if not os.path.isdir(
                    os.path.join(resources_location, resource_dir,
                                 "deep-learning-models")):
                # We clone branch tf2 for both 1.x and 2.x tests because tf2 branch contains all necessary files
                ctx.run(
                    "git clone -b tf2 https://github.com/aws-samples/deep-learning-models.git"
                )

    venv_dir = os.path.join(resources_location, "sm_benchmark_venv")
    if not os.path.isdir(venv_dir):
        ctx.run(f"virtualenv {venv_dir}")
        with ctx.prefix(f"source {venv_dir}/bin/activate"):
            ctx.run("pip install -U sagemaker awscli boto3 botocore six==1.11")
    return venv_dir
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training",
                                          device_cuda_str, py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, throughput = _print_results_of_test(
        os.path.join(test_dir, log_file), processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    threshold_table = (
        (TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD if num_nodes == 1
         else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD)
        if processor == "cpu"
        else (TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD if num_nodes == 1
              else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD)
    )
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
def run_sm_perf_test(image_uri, xla, num_nodes, region, threshold=None):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param xla: [ True | False ] Enable XLA acceleration
    :param num_nodes: Number of nodes to run on
    :param region: AWS region

    This function was inspired by deep-learning-containers/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py

    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)

    processor = "xla" if xla else "gpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}"
    # TODO: Switch to p3.16xlarge when EC2 availability issues are resolved
    ec2_instance_type = "p3.8xlarge"
    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "xla", "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = (
        f"opt-tf{framework_version[0]}-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-optimized-tf{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"--xla-{'on' if xla else 'off'} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    result_statement, throughput = _print_results_of_test(os.path.join(test_dir, log_file))
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    if threshold:
        assert throughput > threshold, (
            f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} imagenet {num_nodes} nodes "
            f"Regression Benchmark Result {throughput} does not reach the threshold {threshold}"
        )
    return throughput
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):

    framework_version = re.search(r"[12](\.\d+){2}",
                                  tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}")

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 > {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
Example #20
def _run_eks_tensorflow_multi_node_training_mpijob(
        namespace, app_name, custom_image, job_name, command_to_run,
        args_to_pass, path_to_ksonnet_app, cluster_size, eks_gpus_per_worker):
    """
    Run Tensorflow distributed training on EKS using horovod docker images using MPIJob
    :param namespace:
    :param app_name:
    :param custom_image:
    :param job_name:
    :param command_to_run:
    :param args_to_pass:
    :param path_to_ksonnet_app:
    :param cluster_size:
    :param eks_gpus_per_worker:
    :return: None
    """
    KUBEFLOW_VERSION = "v0.5.1"
    pod_name = None
    env = f"{namespace}-env"
    ctx = Context()
    github_handler = GitHubHandler("aws", "kubeflow")
    github_handler.set_ksonnet_env()

    ctx.run(f"kubectl create namespace {namespace}")

    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(path_to_ksonnet_app):
        ctx.run(f"rm -rf {app_name}")
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")
            # Check if the kubeflow registry exists and create. Registry will be available in each pod.
            registry_not_exist = ctx.run("ks registry list | grep kubeflow",
                                         warn=True)

            if registry_not_exist.return_code:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{KUBEFLOW_VERSION}/kubeflow",
                )
                ctx.run(f"ks pkg install kubeflow/common@{KUBEFLOW_VERSION}")
                ctx.run(f"ks pkg install kubeflow/mpi-job@{KUBEFLOW_VERSION}")

            try:
                ctx.run("ks generate mpi-operator mpi-operator")
                # The latest mpi-operator docker image does not accept the gpus-per-node parameter
                # which is specified by the older spec file from v0.5.1.
                ctx.run(
                    "ks param set mpi-operator image mpioperator/mpi-operator:0.2.0"
                )
                ctx.run(
                    "ks param set mpi-operator kubectlDeliveryImage mpioperator/kubectl-delivery:0.2.0"
                )
                mpi_operator_start = ctx.run(f"ks apply {env} -c mpi-operator",
                                             warn=True)
                if mpi_operator_start.return_code:
                    raise RuntimeError(
                        f"Failed to start mpi-operator:\n{mpi_operator_start.stderr}"
                    )

                eks_utils.LOGGER.info(
                    f"The mpi-operator package must be applied to {env} env before we can use mpiJob. "
                    f"Check status before moving on.")
                ctx.run("kubectl get crd")

                # Use Ksonnet to generate manifest files which are then applied to the default context.
                ctx.run(f"ks generate mpi-job-custom {job_name}")
                ctx.run(f"ks param set {job_name} replicas {cluster_size}")
                ctx.run(
                    f"ks param set {job_name} gpusPerReplica {eks_gpus_per_worker}"
                )
                ctx.run(f"ks param set {job_name} image {custom_image}")
                ctx.run(f"ks param set {job_name} command {command_to_run}")
                ctx.run(f"ks param set {job_name} args {args_to_pass}")

                # use `$ks show default` to see details.
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                eks_utils.LOGGER.info(
                    f"Apply the generated manifest to the {env} env.")
                training_job_start = ctx.run(f"ks apply {env} -c {job_name}",
                                             warn=True)
                if training_job_start.return_code:
                    raise RuntimeError(
                        f"Failed to start {job_name}:\n{training_job_start.stderr}"
                    )

                eks_utils.LOGGER.info("Check pods")
                ctx.run(f"kubectl get pods -n {namespace} -o wide")

                eks_utils.LOGGER.info(
                    "First the mpi-operator and the n-worker pods will be created and then "
                    "the launcher pod is created in the end. Use retries until launcher "
                    "pod's name is available to read logs.")
                complete_pod_name = eks_utils.is_mpijob_launcher_pod_ready(
                    ctx, namespace, job_name)

                _, pod_name = complete_pod_name.split("/")
                eks_utils.LOGGER.info(
                    f"The Pods have been created and the name of the launcher pod is {pod_name}"
                )

                eks_utils.LOGGER.info(
                    f"Wait for the {job_name} job to complete")
                if eks_utils.is_eks_multinode_training_complete(
                        ctx, namespace, env, pod_name, job_name):
                    eks_utils.LOGGER.info(
                        f"Wait for the {pod_name} pod to reach completion")
                    distributed_out = ctx.run(
                        f"kubectl logs -n {namespace} -f {complete_pod_name}"
                    ).stdout
                    eks_utils.LOGGER.info(distributed_out)
            finally:
                eks_utils.eks_multinode_cleanup(ctx, pod_name, job_name,
                                                namespace, env)
Example #21
def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes,
                                              region, gpu_only, py3_only):
    """
    Run MX sagemaker training performance test

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_mx_train_env' -- this installs
    some prerequisite packages, pulls required script, and creates a virtualenv called sm_benchmark_venv.

    The training script mxnet_imagenet_resnet50.py is invoked via a shell script smtrain-resnet50-imagenet.sh
    The shell script sets num-epochs to 40. This parameter is configurable.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file
    TODO: Change latency [time/epoch] metric to Throughput metric

    :param mxnet_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
    py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
    ec2_instance_type = "p3.16xlarge"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET, "mxnet",
                                          framework_version, "sagemaker",
                                          "training", device_cuda_str,
                                          py_version)
    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 90m python mx_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {mxnet_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True)

        if not run_out.ok:
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}",
        warn=True,
        echo=True)

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, time_val, accuracy = _print_results_of_test(
        os.path.join(test_dir, log_file))

    accuracy_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_ACCURACY_THRESHOLD)
    assert accuracy > accuracy_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {accuracy} does not reach the threshold accuracy {accuracy_threshold}"
    )

    time_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD)
    assert time_val < time_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {time_val} exceeds the latency threshold {time_threshold}"
    )
Example #22
class WorkspaceContext:
    def __init__(self, root):
        self.root = root
        self.mrover_build_root = os.path.join(os.path.expanduser('~'),
                                              '.mrover')
        self.jarvis_root = os.path.join(root, 'jarvis_files')
        self.third_party_root = os.path.join(root, '3rdparty')
        self.build_intermediate = os.path.join(self.mrover_build_root,
                                               'scratch')
        self.product_env = os.path.join(self.mrover_build_root, 'build_env')
        self.jarvis_env = os.path.join(self.mrover_build_root, 'jarvis_env')
        self.mbed_env = os.path.join(self.mrover_build_root, 'mbed_env')
        self.hash_store = os.path.join(self.mrover_build_root,
                                       'project_hashes')

        self.templates = Environment(loader=FileSystemLoader(
            os.path.join(self.jarvis_root, 'templates')))

        self.ctx = Context()

    def ensure_dir(self, d):
        """
        Creates a directory if it does not exist. After this function
        returns, the directory is guaranteed to exist.

        Parameters:
        d - the path to the directory to create.

        Raises:
        BuildError if there is a file named `d`.
        """
        if not os.path.exists(d):
            os.makedirs(d)
        else:
            if not os.path.isdir(d):
                raise BuildError("{} already exists and is a file".format(d))

    def ensure_build_dirs(self):
        """
        Ensures the build directory structure exists.
        """
        self.ensure_dir(self.mrover_build_root)
        self.ensure_dir(self.hash_store)

    def ensure_product_env(self, clear=False):
        """
        Ensures the product venv exists. If clear is True, re-creates
        the product venv.
        """
        self.ensure_build_dirs()
        if clear or not os.path.isdir(self.product_env):
            venv.create(self.product_env,
                        clear=clear,
                        symlinks=True,
                        with_pip=True)

    def ensure_mbed_env(self):
        """
        Ensures the mbed venv exists.
        """
        self.ensure_build_dirs()
        if not os.path.isdir(self.mbed_env):
            self.ctx.run('virtualenv --python=python2 {}'.format(
                self.mbed_env))

    @contextmanager
    def inside_product_env(self):
        """
        A context manager for activating the product venv.
        """
        with self.ctx.prefix("source {}/bin/activate".format(
                self.product_env)):
            yield

    @contextmanager
    def inside_mbed_env(self):
        """
        A context manager for activating the mbed venv.
        """
        with self.ctx.prefix("source {}/bin/activate".format(self.mbed_env)):
            yield

    def template(self, name, **kwargs):
        """
        Templates out a file and returns the rendered copy.
        """
        tpl = self.templates.get_template(name)
        return tpl.render(**kwargs)

    @contextmanager
    def cd(self, *args):
        with self.ctx.cd(*args):
            yield

    def run(self, *args, **kwargs):
        return self.ctx.run(*args, **kwargs)

    def get_product_file(self, *args):
        return os.path.join(self.product_env, *args)

    def get_jarvis_file(self, *args):
        return os.path.join(self.jarvis_env, *args)

    def get_mbed_file(self, *args):
        return os.path.join(self.mbed_env, *args)

    @contextmanager
    def intermediate(self, name, cleanup=False):
        """
        Create an intermediate build directory, then change directory to it.
        """
        intermediate = os.path.join(self.build_intermediate, name)
        self.ensure_dir(intermediate)
        if os.listdir(intermediate) and cleanup:
            shutil.rmtree(intermediate)
            self.ensure_dir(intermediate)

        with self.cd(intermediate):
            yield intermediate

        if cleanup:
            shutil.rmtree(intermediate)
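
A hypothetical usage of WorkspaceContext (the workspace path is illustrative):

wctx = WorkspaceContext(os.path.expanduser('~/mrover-workspace'))
wctx.ensure_build_dirs()
wctx.ensure_product_env()
with wctx.intermediate('example_build', cleanup=True):
    wctx.run('echo building in a scratch directory')
with wctx.inside_product_env():
    wctx.run('pip --version')  # runs with the product venv activated
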
def run_eks_pytorch_multi_node_training(namespace, app_name, job_name,
                                        remote_yaml_file_path, unique_id):
    """Run PyTorch distributed training on EKS using PyTorch Operator
    Args:
    namespace, app_name, job_name, remote_yaml_file_path
    """
    KUBEFLOW_VERSION = "v0.6.1"
    home_dir = run("echo $HOME").stdout.strip("\n")
    path_to_ksonnet_app = os.path.join(
        home_dir, f"pytorch_multi_node_eks_test-{unique_id}")
    env = f"{namespace}-env"

    ctx = Context()

    # Namespaces will allow parallel runs on the same cluster. Create the namespace if it doesn't exist.
    does_namespace_exist = run(f"kubectl get namespace | grep {namespace}",
                               warn=True)
    if not does_namespace_exist:
        run(f"kubectl create namespace {namespace}")

    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(path_to_ksonnet_app):
        ctx.run(f"rm -rf {app_name}")
        # Create a new ksonnet app.
        github_handler = GitHubHandler("aws", "kubeflow")
        github_handler.set_ksonnet_env()
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")

            # Check whether the kubeflow registry exists; create it if not. The registry
            # will be available in each pod.
            does_registry_exist = ctx.run("ks registry list | grep kubeflow",
                                          warn=True)
            if not does_registry_exist:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{KUBEFLOW_VERSION}/kubeflow",
                )
                ctx.run(
                    f"ks pkg install kubeflow/pytorch-job@{KUBEFLOW_VERSION}",
                )

            ctx.run("ks generate pytorch-operator pytorch-operator")
            try:
                # use `$ks show default` to see details.
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                LOGGER.debug(
                    f"ks apply {env} -c pytorch-operator -n {namespace}")
                ctx.run(
                    f"ks apply {env} -c pytorch-operator -n {namespace}")
                # Delete any old job with the same name, if one exists
                ctx.run(f"kubectl delete -f {remote_yaml_file_path}",
                        warn=True)
                ctx.run(
                    f"kubectl create -f {remote_yaml_file_path} -n {namespace}"
                )
                training_result = is_pytorch_eks_multinode_training_complete(
                    job_name, namespace)
                if training_result:
                    run_out = run(
                        f"kubectl logs {job_name}-master-0 -n {namespace}",
                        warn=True).stdout
                    if "accuracy" in run_out:
                        training_result = True
                    else:
                        eks_utils.LOGGER.info("**** training output ****")
                        eks_utils.LOGGER.debug(run_out)
                assert training_result, "Training for eks pytorch multinode failed"
            finally:
                eks_utils.eks_multinode_cleanup(ctx, "", job_name,
                                                namespace, env)
Example #24
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param tensorflow_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    framework_version = re.search(r"[12](\.\d+){2}",
                                  tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}")

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    _print_results_of_test(os.path.join(test_dir, log_file), processor)

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )