def test_git_secrets():
    ctx = Context()
    repository_path = os.getenv("CODEBUILD_SRC_DIR")
    if not repository_path:
        repository_path = _recursive_find_repo_path()
    LOGGER.info(f"repository_path = {repository_path}")

    # Replace the regex pattern below with a matching string to run a test that makes the scan fail:
    SOME_FAKE_CREDENTIALS = "ASIA[A-Z0-9]{16}"
    WHITELISTED_CREDENTIALS = "AKIAIOSFODNN7EXAMPLE"
    # End of Test Section

    with ctx.cd(repository_path):
        ctx.run("git clone https://github.com/awslabs/git-secrets.git")
        with ctx.cd("git-secrets"):
            ctx.run("make install")
        ctx.run("git secrets --install")
        ctx.run("git secrets --register-aws")
        output = ctx.run("git secrets --list")
        LOGGER.info(
            f"\n--COMMAND--\n{output.command}\n"
            f"--STDOUT--\n{output.stdout}\n"
            f"--STDERR--\n{output.stderr}\n"
            f"----------"
        )
        scan_results = ctx.run("git secrets --scan", hide=True, warn=True)
        LOGGER.info(
            f"\n--COMMAND--\n{scan_results.command}\n"
            f"--STDOUT--\n{scan_results.stdout}\n"
            f"--STDERR--\n{scan_results.stderr}\n"
            f"----------"
        )
    assert scan_results.ok, scan_results.stderr
def daemon_runner(pytestconfig, data_dir, downloads_dir, working_dir):
    """
    Provide an invoke `Local` object that has started the arduino-cli in daemon mode.
    This makes it simple to start the daemon and kill it via the kill() function
    when the test is finished.

    Useful reference:
        http://docs.pyinvoke.org/en/1.4/api/runners.html#invoke.runners.Local
        http://docs.pyinvoke.org/en/1.4/api/runners.html
    """
    cli_full_line = os.path.join(str(pytestconfig.rootdir), "..", "arduino-cli daemon")
    env = {
        "ARDUINO_DATA_DIR": data_dir,
        "ARDUINO_DOWNLOADS_DIR": downloads_dir,
        "ARDUINO_SKETCHBOOK_DIR": data_dir,
    }
    os.makedirs(os.path.join(data_dir, "packages"))
    run_context = Context()
    # `cd` returns a context manager: the directory change only takes effect
    # inside the `with` block (calling it bare would be a no-op).
    with run_context.cd(working_dir):
        # Local is the concrete implementation of the Runner abstract class
        runner = Local(run_context)
        runner.run(cli_full_line, echo=False, hide=True, warn=True, env=env, asynchronous=True)

    # we block here until the test function using this fixture has returned
    yield runner

    # Kill the runner's process now that the test has finished (platform dependent)
    os_signal = signal.SIGTERM
    if platform.system() != "Windows":
        os_signal = signal.SIGKILL
    os.kill(runner.process.pid, os_signal)
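# Hedged usage sketch (an assumption, not part of the original source): a pytest
# test consuming the daemon_runner fixture above, assuming the fixture is
# registered in conftest.py. The psutil dependency is an illustrative choice for
# checking process liveness; the fixture itself only guarantees that
# runner.process.pid is set.
import psutil


def test_daemon_starts(daemon_runner):
    # The fixture yields the Local runner after launching the daemon
    # asynchronously, so the process should exist by the time the test body runs.
    assert psutil.pid_exists(daemon_runner.process.pid)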
def _run_eks_mxnet_multi_node_training(namespace, app_name, job_name, remote_yaml_file_path, unique_id):
    """Run MXNet distributed training on EKS using the MXNet Operator

    Args:
        namespace, app_name, job_name, remote_yaml_file_path
    """
    kubeflow_version = "v0.4.1"
    home_dir = run("echo $HOME").stdout.strip("\n")
    path_to_ksonnet_app = os.path.join(home_dir, f"mxnet_multi_node_eks_test-{unique_id}")
    env = f"{namespace}-env"
    training_result = False

    ctx = Context()

    # Namespaces allow parallel runs on the same cluster. Create the namespace if it doesn't exist.
    does_namespace_exist = ctx.run(f"kubectl get namespace | grep {namespace}", warn=True)
    if not does_namespace_exist:
        ctx.run(f"kubectl create namespace {namespace}")

    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(f"{path_to_ksonnet_app}"):
        ctx.run(f"rm -rf {app_name}")
        github_handler = GitHubHandler("aws", "kubeflow")
        github_token = github_handler.get_auth_token()
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")

            # Check whether the kubeflow registry exists and create it if not.
            # The registry will be available in each pod.
            does_registry_exist = ctx.run("ks registry list | grep kubeflow", warn=True)
            if not does_registry_exist:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{kubeflow_version}/kubeflow",
                    env={"GITHUB_TOKEN": github_token},
                    hide=True,
                )
                ctx.run(
                    f"ks pkg install kubeflow/mxnet-job@{kubeflow_version}",
                    env={"GITHUB_TOKEN": github_token},
                    hide=True,
                )
                ctx.run("ks generate mxnet-operator mxnet-operator", hide=True)

            try:
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                LOGGER.debug(f"ks apply {env} -c mxnet-operator -n {namespace}")
                ctx.run(f"ks apply {env} -c mxnet-operator -n {namespace}")
                # Delete an old job with the same name if it exists
                ctx.run(f"kubectl delete -f {remote_yaml_file_path}", warn=True)
                ctx.run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}")
                if is_mxnet_eks_multinode_training_complete(job_name, namespace):
                    training_result = True
            finally:
                eks_utils.eks_multinode_cleanup("", job_name, namespace, env)

    return training_result
def generate_sagemaker_reports(self):
    """
    Append SageMaker data to the report
    """
    ctx = Context()
    git_repo_path = get_repository_local_path()
    for repo in self.SM_REPOS:
        framework, job_type = repo.split(os.sep)
        pytest_framework_path = os.path.join(git_repo_path, "test", "sagemaker_tests", framework, job_type)
        with ctx.cd(pytest_framework_path):
            # We need to install requirements in order to use the SM pytest frameworks
            venv = os.path.join(pytest_framework_path, f".{repo.replace('/', '-')}")
            ctx.run(f"virtualenv {venv}")
            with ctx.prefix(f"source {os.path.join(venv, 'bin', 'activate')}"):
                ctx.run("pip install -r requirements.txt", warn=True)

                # TF inference separates remote/local conftests, and must be handled differently
                if framework == "tensorflow" and job_type == "inference":
                    with ctx.cd(os.path.join(pytest_framework_path, "test", "integration")):
                        # Handle local tests
                        ctx.run(f"{self.COVERAGE_DOC_COMMAND} --framework-version 2 local/", hide=True)
                        # Handle remote integration tests
                        ctx.run(f"{self.COVERAGE_DOC_COMMAND} sagemaker/", hide=True)
                else:
                    ctx.run(f"{self.COVERAGE_DOC_COMMAND} integration/", hide=True)

    # Handle TF inference remote tests
    tf_inf_path = os.path.join(git_repo_path, "test", "sagemaker_tests", "tensorflow", "inference")
    with ctx.cd(tf_inf_path):
        # Install TF inference pip requirements
        ctx.run("virtualenv .tf_inference")
        with ctx.prefix(f"source {os.path.join(tf_inf_path, '.tf_inference', 'bin', 'activate')}"):
            ctx.run("pip install -r requirements.txt", warn=True)
            with ctx.cd(os.path.join(tf_inf_path, "test", "integration")):
                # Handle local tests
                ctx.run(f"{self.COVERAGE_DOC_COMMAND} --framework-version 2 local/")
                # Handle remote integration tests
                ctx.run(f"{self.COVERAGE_DOC_COMMAND} sagemaker/")
def build_library(model: Enclave, mode: str):
    model.generate_state()
    model.generate_forward(mode)

    context = Context()
    with context.cd(cfg.get_ennclave_home()):
        if mode == 'sgx':
            model.generate_config()
            context.run('build/backend_sgx_encryptor')

        with context.cd("build"):  # TODO: make more robust
            context.run(f"make backend_{mode}")
def run_sagemaker_test_in_executor(image, num_of_instances, instance_type):
    """
    Run pytest in a virtual env for a particular image.
    Expected to run under multi-threading.

    :param num_of_instances: <int> number of instances the image test requires
    :param instance_type: type of sagemaker instance the test needs
    :param image: ECR url
    :return: True if the test passed, False on failure
    """
    import log_return

    LOGGER.info("Started running SageMaker test.....")
    pytest_command, path, tag, job_type = sm_utils.generate_sagemaker_pytest_cmd(image, "sagemaker")

    # Update the resource pool accordingly; failures are caught so the caller can
    # update the pool in case of error
    try:
        log_return.update_pool("running", instance_type, num_of_instances, job_type)
        context = Context()
        with context.cd(path):
            context.run(f"python3 -m virtualenv {tag}")
            with context.prefix(f"source {tag}/bin/activate"):
                context.run("pip install -r requirements.txt", warn=True)
                context.run(pytest_command)
    except Exception as e:
        LOGGER.error(e)
        return False

    return True
def _run(cmd_string):
    cli_full_line = "{} {}".format(cli_path, cmd_string)
    run_context = Context()
    with run_context.cd(working_dir):
        return run_context.run(cli_full_line, echo=False, hide=True, warn=True, env=env)
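# Hedged usage sketch (not from the source): _run closes over the module-level
# cli_path, working_dir and env, so a caller only passes the subcommand string.
# "version" is an illustrative subcommand choice.
result = _run("version")
assert result.ok, result.stderr
print(result.stdout)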
def test_tensorflow_sagemaker_training_performance(tensorflow_training, num_nodes, region):
    # This sleep has been inserted because all the parametrized training jobs are automatically created
    # by SageMaker with the same name, due to being started around the same time with the same image URI.
    time.sleep(random.Random(x=f"{tensorflow_training}{num_nodes}").random() * 60)

    framework_version = re.search(r"[12](\.\d+){2}", tensorflow_training).group()
    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
    )

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"> {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

        ctx.run(f"aws s3 cp {log_file} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
def build_bai_docker_container():
    """
    Builds a docker container with the necessary script requirements (bash 5.0+, conda)
    """
    # Assuming we are in the dlc_tests directory
    docker_dir = os.path.join("benchmark", "bai", "docker")
    ctx = Context()
    with ctx.cd(docker_dir):
        ctx.run("docker build -t bai_env_container -f Dockerfile .")
def common(backend: str):
    target_dir = join(cfg.get_ennclave_home(), 'backend', 'generated')

    preamble_backend = backend
    if backend == 'sgx':
        preamble_backend = 'sgx_enclave'

    with open(join(target_dir, f'{backend}_forward.cpp'), 'w+') as forward_file:
        forward_file.write(templates.preamble.render(backend=preamble_backend))
        forward_file.write(f"print_out(\"Hello, this is backend {backend}\\n\");")
        forward_file.write(templates.postamble)

    with open(join(target_dir, 'parameters.bin'), 'w') as parameter_file:
        pass

    with open(join(target_dir, 'sgx_config.xml'), 'w') as config_file:
        config_file.write("""
<EnclaveConfiguration>
    <ProdID>0</ProdID>
    <ISVSVN>0</ISVSVN>
    <StackMaxSize>0x40000</StackMaxSize>
    <HeapInitSize>0x7e00000</HeapInitSize>
    <HeapMaxSize>0x7e00000</HeapMaxSize>
    <TCSNum>10</TCSNum>
    <TCSPolicy>1</TCSPolicy>
    <!-- Recommend changing 'DisableDebug' to 1 to make the sgx undebuggable for sgx release -->
    <DisableDebug>0</DisableDebug>
    <MiscSelect>0</MiscSelect>
    <MiscMask>0xFFFFFFFF</MiscMask>
</EnclaveConfiguration>""")

    context = Context()
    with context.cd(cfg.get_ennclave_home()):
        context.run('mkdir -p build')
        with context.cd('build'):
            # context.run('cmake ..')
            context.run(f'make backend_{backend}')

    if backend == 'native':
        ennclave.native_forward(b'', 0, 0)
    else:
        ennclave.sgx_forward(b'', 0, 0)
def setup_sm_benchmark_tf_train_env(resources_location, setup_tf1_env, setup_tf2_env):
    """
    Create a virtual environment for benchmark tests if it doesn't already exist, and download all necessary scripts

    :param resources_location: <str> directory in which test resources should be placed
    :param setup_tf1_env: <bool> True if tf1 resources need to be set up
    :param setup_tf2_env: <bool> True if tf2 resources need to be set up
    :return: absolute path to the location of the virtual environment
    """
    ctx = Context()

    tf_resource_dir_list = []
    if setup_tf1_env:
        tf_resource_dir_list.append("tensorflow1")
    if setup_tf2_env:
        tf_resource_dir_list.append("tensorflow2")

    for resource_dir in tf_resource_dir_list:
        with ctx.cd(os.path.join(resources_location, resource_dir)):
            if not os.path.isdir(os.path.join(resources_location, resource_dir, "horovod")):
                # v0.19.4 is the last version for which horovod example tests are py2 compatible
                ctx.run("git clone -b v0.19.4 https://github.com/horovod/horovod.git")
            if not os.path.isdir(os.path.join(resources_location, resource_dir, "deep-learning-models")):
                # We clone the tf2 branch for both 1.x and 2.x tests because the tf2 branch contains all necessary files
                ctx.run("git clone -b tf2 https://github.com/aws-samples/deep-learning-models.git")

    venv_dir = os.path.join(resources_location, "sm_benchmark_venv")
    if not os.path.isdir(venv_dir):
        ctx.run(f"virtualenv {venv_dir}")
        with ctx.prefix(f"source {venv_dir}/bin/activate"):
            ctx.run("pip install 'sagemaker>=2,<3' awscli boto3 botocore six==1.11")

            # The SageMaker TF estimator is coded to only accept framework versions up to 2.1.0 as py2 compatible.
            # Fixing this through the following changes:
            estimator_location = ctx.run(
                "echo $(pip3 show sagemaker |grep 'Location' |sed s/'Location: '//g)/sagemaker/tensorflow/estimator.py"
            ).stdout.strip("\n")
            system = ctx.run("uname -s").stdout.strip("\n")
            sed_input_arg = "'' " if system == "Darwin" else ""
            ctx.run(rf"sed -i {sed_input_arg}'s/\[2, 1, 0\]/\[2, 1, 1\]/g' {estimator_location}")
    return venv_dir
def execute_sagemaker_remote_tests(image):
    """
    Run pytest in a virtual env for a particular image.
    Expected to run via multiprocessing.

    :param image: ECR url
    """
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(image, SAGEMAKER_REMOTE_TEST_TYPE)
    context = Context()
    with context.cd(path):
        context.run(f"virtualenv {tag}")
        with context.prefix(f"source {tag}/bin/activate"):
            context.run("pip install -r requirements.txt", warn=True)
            res = context.run(pytest_command, warn=True)
            metrics_utils.send_test_result_metrics(res.return_code)
def run_sagemaker_pytest_cmd(image):
    """
    Run pytest in a virtual env for a particular image.
    Expected to run via multiprocessing.

    :param image: ECR url
    """
    pytest_command, path, tag = generate_sagemaker_pytest_cmd(image)
    context = Context()
    with context.cd(path):
        context.run(f"virtualenv {tag}")
        with context.prefix(f"source {tag}/bin/activate"):
            context.run("pip install -r requirements.txt", warn=True)
            context.run(pytest_command)
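# Hedged sketch of the multiprocessing fan-out the docstring above refers to
# (an assumption about the caller, not code from the source); `images` is an
# illustrative list of ECR image URIs.
from multiprocessing import Pool


def run_sagemaker_tests(images):
    # One worker per image; each worker builds its own venv and runs pytest.
    with Pool(processes=len(images)) as pool:
        pool.map(run_sagemaker_pytest_cmd, images)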
def autoformat(
    context: Context = CONTEXT,
    filepaths: Optional[Iterable[str]] = None,
    staged: bool = False,
):
    """Autoformat Python code."""
    # The autohooks helpers are optional imports; without them, staged files
    # cannot be detected or stashed.
    if get_staged_status is None or stash_unstaged_changes is None:
        print('Cannot autoformat; missing required autohooks module.')
        if staged:
            return
    commands = [
        # https://isort.readthedocs.io/en/latest/
        'isort',
        # https://github.com/psf/black
        'black -S -q',
        # https://github.com/myint/autoflake
        'autoflake --imports=apps,django,requests,typing,urllib3 --ignore-init-module-imports -i -r',  # noqa: E501
    ]
    filepaths = list(filepaths or [])
    if staged:
        staged_filepaths = get_staged_status()
        filepaths += staged_filepaths
    filepaths = [filepath for filepath in filepaths if filepath.endswith('.py')]
    if filepaths:
        # unify does not support recursion (directories)
        commands.append('unify --in-place')
    if staged:
        if not filepaths:
            return
        with stash_unstaged_changes(staged_filepaths):
            for filepath in filepaths:
                for command in commands:
                    context.run(f'{command} {filepath}', warn=True)
    elif filepaths:
        for filepath in filepaths:
            for command in commands:
                context.run(f'{command} {filepath}', warn=True)
    else:
        with context.cd(settings.BASE_DIR):
            for command in commands:
                context.run(f'{command} .')
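# Hedged usage sketch: if autoformat is registered as an invoke task (the
# decorator is not shown above, so this is an assumption), it would typically be
# run from the shell, e.g. `invoke autoformat --staged`. It can also be called
# directly with an explicit Context; the file path below is illustrative.
autoformat(Context(), filepaths=['apps/models.py'])
autoformat(Context(), staged=True)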
def execute_sagemaker_remote_tests(process_index, image, global_pytest_cache, pytest_cache_params):
    """
    Run pytest in a virtual env for a particular image. Creates a custom directory for each
    process's pytest cache file and stores the pytest cache in a shared dict.
    Expected to run via multiprocessing.

    :param process_index: id for the process; used to create a custom cache dir
    :param image: ECR url
    :param global_pytest_cache: shared Manager().dict() for cache merging
    :param pytest_cache_params: parameters required for building the s3 file path
    """
    account_id = os.getenv("ACCOUNT_ID", boto3.client("sts").get_caller_identity()["Account"])
    pytest_cache_util = PytestCache(boto3.client("s3"), account_id)
    pytest_command, path, tag, job_type = generate_sagemaker_pytest_cmd(image, SAGEMAKER_REMOTE_TEST_TYPE)
    context = Context()
    with context.cd(path):
        context.run(f"virtualenv {tag}")
        with context.prefix(f"source {tag}/bin/activate"):
            context.run("pip install -r requirements.txt", warn=True)
            pytest_cache_util.download_pytest_cache_from_s3_to_local(
                path, **pytest_cache_params, custom_cache_directory=str(process_index)
            )
            # adding -o cache_dir with a custom directory name
            pytest_command += f" -o cache_dir={os.path.join(str(process_index), '.pytest_cache')}"
            res = context.run(pytest_command, warn=True)
            metrics_utils.send_test_result_metrics(res.return_code)
            cache_json = pytest_cache_util.convert_pytest_cache_file_to_json(
                path, custom_cache_directory=str(process_index)
            )
            global_pytest_cache.update(cache_json)
            if res.failed:
                raise DLCSageMakerRemoteTestFailure(
                    f"{pytest_command} failed with error code: {res.return_code}\n"
                    f"Traceback:\n{res.stdout}"
                )
    return None
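# Hedged sketch (an assumption about the caller, not code from the source):
# wiring up the shared cache dict and per-image processes that
# execute_sagemaker_remote_tests expects. `images` and the contents of
# pytest_cache_params are illustrative placeholders.
from multiprocessing import Manager, Process


def run_remote_tests_in_parallel(images, pytest_cache_params):
    with Manager() as manager:
        global_pytest_cache = manager.dict()
        processes = [
            Process(
                target=execute_sagemaker_remote_tests,
                args=(index, image, global_pytest_cache, pytest_cache_params),
            )
            for index, image in enumerate(images)
        ]
        for p in processes:
            p.start()
        for p in processes:
            p.join()
        # Copy out of the proxy dict before the manager shuts down
        return dict(global_pytest_cache)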
def setup_sm_benchmark_tf_train_env(resources_location, setup_tf1_env, setup_tf2_env):
    """
    Create a virtual environment for benchmark tests if it doesn't already exist, and download all necessary scripts

    :param resources_location: <str> directory in which test resources should be placed
    :param setup_tf1_env: <bool> True if tf1 resources need to be set up
    :param setup_tf2_env: <bool> True if tf2 resources need to be set up
    :return: absolute path to the location of the virtual environment
    """
    ctx = Context()

    tf_resource_dir_list = []
    if setup_tf1_env:
        tf_resource_dir_list.append("tensorflow1")
    if setup_tf2_env:
        tf_resource_dir_list.append("tensorflow2")

    for resource_dir in tf_resource_dir_list:
        with ctx.cd(os.path.join(resources_location, resource_dir)):
            if not os.path.isdir(os.path.join(resources_location, resource_dir, "horovod")):
                ctx.run("git clone https://github.com/horovod/horovod.git")
            if not os.path.isdir(os.path.join(resources_location, resource_dir, "deep-learning-models")):
                # We clone the tf2 branch for both 1.x and 2.x tests because the tf2 branch contains all necessary files
                ctx.run("git clone -b tf2 https://github.com/aws-samples/deep-learning-models.git")

    venv_dir = os.path.join(resources_location, "sm_benchmark_venv")
    if not os.path.isdir(venv_dir):
        ctx.run(f"virtualenv {venv_dir}")
        with ctx.prefix(f"source {venv_dir}/bin/activate"):
            ctx.run("pip install -U sagemaker awscli boto3 botocore six==1.11")
    return venv_dir
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

        ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    result_statement, throughput = _print_results_of_test(os.path.join(test_dir, log_file), processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    threshold_table = (
        (TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD if num_nodes == 1 else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD)
        if processor == "cpu"
        else TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD
        if num_nodes == 1
        else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD
    )
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
def run_sm_perf_test(image_uri, xla, num_nodes, region, threshold=None):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param xla: [ True | False ] Enable XLA acceleration
    :param num_nodes: Number of nodes to run on
    :param region: AWS region

    This function was inspired by
    deep-learning-containers/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    processor = "xla" if xla else "gpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}"

    # TODO: Switch to p3.16xlarge when EC2 availability issues are resolved
    ec2_instance_type = "p3.8xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "xla", "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = (
        f"opt-tf{framework_version[0]}-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-optimized-tf{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"--xla-{'on' if xla else 'off'} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

        ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    result_statement, throughput = _print_results_of_test(os.path.join(test_dir, log_file))
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    if threshold:
        assert throughput > threshold, (
            f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} "
            f"{py_version} imagenet {num_nodes} nodes "
            f"Regression Benchmark Result {throughput} does not reach the threshold {threshold}"
        )
    return throughput
def test_tensorflow_sagemaker_training_performance(tensorflow_training, num_nodes, region):
    framework_version = re.search(r"[12](\.\d+){2}", tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"> {log_file} 2>&1",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

        ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
def _run_eks_tensorflow_multi_node_training_mpijob(namespace, app_name, custom_image, job_name, command_to_run,
                                                   args_to_pass, path_to_ksonnet_app, cluster_size, eks_gpus_per_worker):
    """
    Run TensorFlow distributed training on EKS using horovod docker images via MPIJob

    :param namespace:
    :param app_name:
    :param custom_image:
    :param job_name:
    :param command_to_run:
    :param args_to_pass:
    :param path_to_ksonnet_app:
    :param cluster_size:
    :param eks_gpus_per_worker:
    :return: None
    """
    KUBEFLOW_VERSION = "v0.5.1"
    pod_name = None
    env = f"{namespace}-env"
    ctx = Context()
    github_handler = GitHubHandler("aws", "kubeflow")
    github_handler.set_ksonnet_env()

    ctx.run(f"kubectl create namespace {namespace}")

    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(path_to_ksonnet_app):
        ctx.run(f"rm -rf {app_name}")
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")

            # Check whether the kubeflow registry exists and create it if not.
            # The registry will be available in each pod.
            registry_not_exist = ctx.run("ks registry list | grep kubeflow", warn=True)
            if registry_not_exist.return_code:
                ctx.run(f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{KUBEFLOW_VERSION}/kubeflow")
                ctx.run(f"ks pkg install kubeflow/common@{KUBEFLOW_VERSION}")
                ctx.run(f"ks pkg install kubeflow/mpi-job@{KUBEFLOW_VERSION}")

            try:
                ctx.run("ks generate mpi-operator mpi-operator")
                # The latest mpi-operator docker image does not accept the gpus-per-node parameter
                # which is specified by the older spec file from v0.5.1.
                ctx.run("ks param set mpi-operator image mpioperator/mpi-operator:0.2.0")
                ctx.run("ks param set mpi-operator kubectlDeliveryImage mpioperator/kubectl-delivery:0.2.0")
                mpi_operator_start = ctx.run(f"ks apply {env} -c mpi-operator", warn=True)
                if mpi_operator_start.return_code:
                    raise RuntimeError(f"Failed to start mpi-operator:\n{mpi_operator_start.stderr}")

                eks_utils.LOGGER.info(
                    f"The mpi-operator package must be applied to the {env} env before we can use mpiJob. "
                    f"Check status before moving on."
                )
                ctx.run("kubectl get crd")

                # Use Ksonnet to generate manifest files which are then applied to the default context.
                ctx.run(f"ks generate mpi-job-custom {job_name}")
                ctx.run(f"ks param set {job_name} replicas {cluster_size}")
                ctx.run(f"ks param set {job_name} gpusPerReplica {eks_gpus_per_worker}")
                ctx.run(f"ks param set {job_name} image {custom_image}")
                ctx.run(f"ks param set {job_name} command {command_to_run}")
                ctx.run(f"ks param set {job_name} args {args_to_pass}")

                # use `$ ks show default` to see details.
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                eks_utils.LOGGER.info(f"Apply the generated manifest to the {env} env.")
                training_job_start = ctx.run(f"ks apply {env} -c {job_name}", warn=True)
                if training_job_start.return_code:
                    raise RuntimeError(f"Failed to start {job_name}:\n{training_job_start.stderr}")

                eks_utils.LOGGER.info("Check pods")
                ctx.run(f"kubectl get pods -n {namespace} -o wide")

                eks_utils.LOGGER.info(
                    "First the mpi-operator and the n-worker pods will be created and then "
                    "the launcher pod is created in the end. Use retries until the launcher "
                    "pod's name is available to read logs."
                )
                complete_pod_name = eks_utils.is_mpijob_launcher_pod_ready(ctx, namespace, job_name)

                _, pod_name = complete_pod_name.split("/")
                eks_utils.LOGGER.info(f"The pods have been created and the name of the launcher pod is {pod_name}")

                eks_utils.LOGGER.info(f"Wait for the {job_name} job to complete")
                if eks_utils.is_eks_multinode_training_complete(ctx, namespace, env, pod_name, job_name):
                    eks_utils.LOGGER.info(f"Wait for the {pod_name} pod to reach completion")
                    distributed_out = ctx.run(f"kubectl logs -n {namespace} -f {complete_pod_name}").stdout
                    eks_utils.LOGGER.info(distributed_out)
            finally:
                eks_utils.eks_multinode_cleanup(ctx, pod_name, job_name, namespace, env)
def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region, gpu_only, py3_only):
    """
    Run MX sagemaker training performance test

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_mx_train_env' -- this installs
    some prerequisite packages, pulls the required script, and creates a virtualenv called sm_benchmark_venv.

    The training script mxnet_imagenet_resnet50.py is invoked via the shell script smtrain-resnet50-imagenet.sh,
    which sets num-epochs to 40. This parameter is configurable.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file
    TODO: Change latency [time/epoch] metric to Throughput metric

    :param mxnet_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
    py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
    ec2_instance_type = "p3.16xlarge"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 90m python mx_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {mxnet_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not run_out.ok:
            target_upload_location = os.path.join(target_upload_location, "failure_log")

        ctx.run(
            f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}",
            warn=True,
            echo=True,
        )

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, time_val, accuracy = _print_results_of_test(os.path.join(test_dir, log_file))

    accuracy_threshold = get_threshold_for_image(framework_version, MXNET_TRAINING_GPU_IMAGENET_ACCURACY_THRESHOLD)
    assert accuracy > accuracy_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {accuracy} does not reach the threshold accuracy {accuracy_threshold}"
    )

    time_threshold = get_threshold_for_image(framework_version, MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD)
    assert time_val < time_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {time_val} exceeds the threshold latency {time_threshold}"
    )
class WorkspaceContext:
    def __init__(self, root):
        self.root = root
        self.mrover_build_root = os.path.join(os.path.expanduser('~'), '.mrover')
        self.jarvis_root = os.path.join(root, 'jarvis_files')
        self.third_party_root = os.path.join(root, '3rdparty')
        self.build_intermediate = os.path.join(self.mrover_build_root, 'scratch')
        self.product_env = os.path.join(self.mrover_build_root, 'build_env')
        self.jarvis_env = os.path.join(self.mrover_build_root, 'jarvis_env')
        self.mbed_env = os.path.join(self.mrover_build_root, 'mbed_env')
        self.hash_store = os.path.join(self.mrover_build_root, 'project_hashes')

        self.templates = Environment(loader=FileSystemLoader(os.path.join(self.jarvis_root, 'templates')))

        self.ctx = Context()

    def ensure_dir(self, d):
        """
        Creates a directory if it does not exist. After invocation of this
        function, you can be sure the directory exists.

        Parameters:
            d - the path to the directory to create.

        Raises:
            BuildError if there is a file named `d`.
        """
        if not os.path.exists(d):
            os.makedirs(d)
        else:
            if not os.path.isdir(d):
                raise BuildError("{} already exists and is a file".format(d))

    def ensure_build_dirs(self):
        """
        Ensures the build directory structure exists.
        """
        self.ensure_dir(self.mrover_build_root)
        self.ensure_dir(self.hash_store)

    def ensure_product_env(self, clear=False):
        """
        Ensures the product venv exists. If clear is True, re-creates the product venv.
        """
        self.ensure_build_dirs()
        if not os.path.isdir(self.product_env) or clear:
            venv.create(self.product_env, clear=clear, symlinks=True, with_pip=True)

    def ensure_mbed_env(self):
        """
        Ensures the mbed venv exists.
        """
        self.ensure_build_dirs()
        if not os.path.isdir(self.mbed_env):
            self.ctx.run('virtualenv --python=python2 {}'.format(self.mbed_env))

    @contextmanager
    def inside_product_env(self):
        """
        A context manager for activating the product venv.
        """
        with self.ctx.prefix("source {}/bin/activate".format(self.product_env)):
            yield

    @contextmanager
    def inside_mbed_env(self):
        """
        A context manager for activating the mbed venv.
        """
        with self.ctx.prefix("source {}/bin/activate".format(self.mbed_env)):
            yield

    def template(self, name, **kwargs):
        """
        Templates out a file and returns the rendered copy.
        """
        tpl = self.templates.get_template(name)
        return tpl.render(**kwargs)

    @contextmanager
    def cd(self, *args):
        with self.ctx.cd(*args):
            yield

    def run(self, *args, **kwargs):
        return self.ctx.run(*args, **kwargs)

    def get_product_file(self, *args):
        return os.path.join(self.product_env, *args)

    def get_jarvis_file(self, *args):
        return os.path.join(self.jarvis_env, *args)

    def get_mbed_file(self, *args):
        return os.path.join(self.mbed_env, *args)

    @contextmanager
    def intermediate(self, name, cleanup=False):
        """
        Create an intermediate build directory, then change directory to it.
        """
        intermediate = os.path.join(self.build_intermediate, name)
        self.ensure_dir(intermediate)
        if os.listdir(intermediate) and cleanup:
            shutil.rmtree(intermediate)
            self.ensure_dir(intermediate)
        with self.cd(intermediate):
            yield intermediate
        if cleanup:
            shutil.rmtree(intermediate)
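# Hedged usage sketch (not from the source): driving WorkspaceContext from a
# build script. The workspace root, project name, and build commands are
# illustrative assumptions.
workspace = WorkspaceContext('/path/to/mrover-workspace')
workspace.ensure_build_dirs()
workspace.ensure_product_env()
with workspace.inside_product_env():
    # Commands run with the product venv's activate script prefixed
    workspace.run('pip install -r requirements.txt', warn=True)
with workspace.intermediate('example_project', cleanup=True):
    # cwd is the scratch directory; it is removed again after the block
    workspace.run('cmake {} && make'.format(workspace.root), warn=True)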
def run_eks_pytorch_multi_node_training(namespace, app_name, job_name, remote_yaml_file_path, unique_id):
    """Run PyTorch distributed training on EKS using the PyTorch Operator

    Args:
        namespace, app_name, job_name, remote_yaml_file_path
    """
    KUBEFLOW_VERSION = "v0.6.1"
    home_dir = run("echo $HOME").stdout.strip("\n")
    path_to_ksonnet_app = os.path.join(home_dir, f"pytorch_multi_node_eks_test-{unique_id}")
    env = f"{namespace}-env"

    ctx = Context()

    # Namespaces allow parallel runs on the same cluster. Create the namespace if it doesn't exist.
    does_namespace_exist = run(f"kubectl get namespace | grep {namespace}", warn=True)
    if not does_namespace_exist:
        run(f"kubectl create namespace {namespace}")

    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")

    with ctx.cd(path_to_ksonnet_app):
        ctx.run(f"rm -rf {app_name}")
        # Create a new ksonnet app.
        github_handler = GitHubHandler("aws", "kubeflow")
        github_handler.set_ksonnet_env()
        ctx.run(f"ks init {app_name} --namespace {namespace}")

        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")

            # Check whether the kubeflow registry exists and create it if not.
            # The registry will be available in each pod.
            does_registry_exist = ctx.run("ks registry list | grep kubeflow", warn=True)
            if not does_registry_exist:
                ctx.run(f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{KUBEFLOW_VERSION}/kubeflow")
                ctx.run(f"ks pkg install kubeflow/pytorch-job@{KUBEFLOW_VERSION}")
                ctx.run("ks generate pytorch-operator pytorch-operator")

            try:
                # use `$ ks show default` to see details.
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                LOGGER.debug(f"ks apply {env} -c pytorch-operator -n {namespace}")
                ctx.run(f"ks apply {env} -c pytorch-operator -n {namespace}")
                # Delete an old job with the same name if it exists
                ctx.run(f"kubectl delete -f {remote_yaml_file_path}", warn=True)
                ctx.run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}")
                training_result = is_pytorch_eks_multinode_training_complete(job_name, namespace)
                if training_result:
                    run_out = run(f"kubectl logs {job_name}-master-0 -n {namespace}", warn=True).stdout
                    if "accuracy" in run_out:
                        training_result = True
                    else:
                        eks_utils.LOGGER.info("**** training output ****")
                        eks_utils.LOGGER.debug(run_out)
                assert training_result, "Training for eks pytorch multinode failed"
            finally:
                eks_utils.eks_multinode_cleanup(ctx, "", job_name, namespace, env)
def test_tensorflow_sagemaker_training_performance(tensorflow_training, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param tensorflow_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    framework_version = re.search(r"[12](\.\d+){2}", tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

        ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    _print_results_of_test(os.path.join(test_dir, log_file), processor)

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )