def test_disable_compiler_config(
    time,
    name_from_base,
    sagemaker_session,
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    """Disabling the Training Compiler must still emit the compiler hyperparameters.

    With ``TrainingCompilerConfig(enabled=False)`` the train call is expected to
    carry ``HP_ENABLE_COMPILER=false`` and ``HP_ENABLE_DEBUG=false``.
    """
    compiler_config = TrainingCompilerConfig(enabled=False)

    hf = HuggingFace(
        py_version="py38",
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        transformers_version=huggingface_training_compiler_version,
        pytorch_version=huggingface_training_compiler_pytorch_version,
        enable_sagemaker_metrics=False,
        # Reuse the config built above instead of constructing a second,
        # identical instance; the expected-args check below uses the same object.
        compiler_config=compiler_config,
    )

    inputs = "s3://mybucket/train"
    hf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    # The mocked session must have seen exactly a train call followed by log streaming.
    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]

    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(
        huggingface_training_compiler_version,
        f"pytorch{huggingface_training_compiler_pytorch_version}",
        INSTANCE_TYPE,
        compiler_config,
    )
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
    expected_train_args["enable_sagemaker_metrics"] = False
    # Hyperparameter values are JSON-serialized strings ("false"), not Python bools.
    expected_train_args["hyperparameters"][
        TrainingCompilerConfig.HP_ENABLE_COMPILER] = json.dumps(False)
    expected_train_args["hyperparameters"][
        TrainingCompilerConfig.HP_ENABLE_DEBUG] = json.dumps(False)

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert (
        actual_train_args == expected_train_args
    ), f"{json.dumps(actual_train_args, indent=2)} != {json.dumps(expected_train_args, indent=2)}"
def test_trcomp_enabled(self, patched, sagemaker_session, ecr_image, tmpdir, capsys):
    """Explicitly enable SM Training Compiler and check it is detected at training time."""
    estimator = HuggingFace(
        compiler_config=TrainingCompilerConfig(enabled=True),
        entry_point="train.py",
        source_dir=BERT_PATH,
        role="SageMakerRole",
        instance_type="ml.p3.2xlarge",
        instance_count=1,
        image_uri=ecr_image,
        py_version=py_version,
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
        debugger_hook_config=False,  # currently needed
        max_retry_attempts=15,
    )

    job_name = unique_name_from_base("hf-tf-trcomp-single-gpu-enabled")
    estimator.fit(job_name=job_name, logs=True)

    captured = capsys.readouterr()
    combined_logs = captured.out + captured.err
    assert "Found configuration for Training Compiler" in combined_logs
def test_single_node_single_gpu_tcc_default(patched, docker_image, processor, instance_type,
                                            sagemaker_local_session, py_version, capsys):
    """Single-GPU local_gpu run using the default Training Compiler configuration.

    All local mode tests (PT and TF) are run serially on a single instance.
    """
    training_params = {
        "max_steps": 3,
        "train_batch_size": 4,
        "model_name": "distilbert-base-uncased",
    }

    estimator = HuggingFace(
        compiler_config=TrainingCompilerConfig(),
        entry_point=distrilbert_script,
        instance_type="local_gpu",
        sagemaker_session=sagemaker_local_session,
        image_uri=docker_image,
        instance_count=1,
        role=ROLE,
        hyperparameters=training_params,
        # https://github.com/aws/sagemaker-training-toolkit/issues/107
        environment={'GPU_NUM_DEVICES': '1'},
        py_version=py_version,
    )
    estimator.fit()
def test_trcomp_debug(self, patched, ecr_image, sagemaker_session, tmpdir, py_version, capsys):
    """Run SM Training Compiler in debug mode and verify XLA debug artifacts are produced."""
    transformers_version = get_transformers_version(ecr_image)
    git_config = {
        'repo': 'https://github.com/huggingface/transformers.git',
        'branch': 'v' + transformers_version,
    }

    # The QA example moved under examples/pytorch/ as of transformers 4.6.
    if Version(transformers_version) < Version("4.6"):
        source_dir = "./examples/question-answering"
    else:
        source_dir = "./examples/pytorch/question-answering"

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = HuggingFace(
            compiler_config=TrainingCompilerConfig(debug=True),
            entry_point='run_qa.py',
            source_dir=source_dir,
            git_config=git_config,
            metric_definitions=metric_definitions,
            role='SageMakerRole',
            image_uri=ecr_image,
            instance_count=1,
            instance_type="ml.p3.2xlarge",
            sagemaker_session=sagemaker_session,
            hyperparameters=hyperparameters,
            environment={'GPU_NUM_DEVICES': '1'},
            py_version=py_version,
            max_retry_attempts=15,
        )

        job_name = sagemaker.utils.unique_name_from_base('hf-pt-trcomp-single-gpu-debug')
        estimator.fit(job_name=job_name, logs=True)

        captured = capsys.readouterr()
        combined_logs = captured.out + captured.err
        for expected_message in (
            "Found configuration for Training Compiler",
            "Training Compiler set to debug mode",
            "Configuring SM Training Compiler",
            "device: xla",
        ):
            assert expected_message in combined_logs

        # The job uploads a debug tarball alongside the model artifact; fetch and unpack it.
        debug_artifact_path = estimator.model_data.replace('model.tar.gz', 'output.tar.gz')
        debug_artifact = os.path.join(tmpdir, 'output.tar.gz')
        subprocess.check_output(['aws', 's3', 'cp', debug_artifact_path, debug_artifact])

        with tarfile.open(debug_artifact, 'r:gz') as tarball:
            tarball.extractall(path=tmpdir)

        xla_metrics_file = os.path.join(tmpdir, 'compiler', 'XLA_METRICS_FILE.txt')
        assert os.path.exists(xla_metrics_file)
def test_unsupported_framework_mxnet(huggingface_training_compiler_version, ):
    """MXNet is not supported with the Training Compiler, so construction/fit must raise."""
    # Fabricate an impossible MXNet version with the same number of components
    # as the transformers version under test (e.g. "99.99").
    bogus_mxnet_version = ".".join(
        ["99"] * len(huggingface_training_compiler_version.split("."))
    )

    with pytest.raises(ValueError):
        HuggingFace(
            py_version="py38",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            mxnet_version=bogus_mxnet_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()
def test_unsupported_python_2(
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    """Python 2 ("py27") is rejected when the Training Compiler is requested."""
    with pytest.raises(ValueError):
        estimator = HuggingFace(
            py_version="py27",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            pytorch_version=huggingface_training_compiler_pytorch_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        )
        estimator.fit()
def test_trcomp_enabled(self, patched, ecr_image, sagemaker_session, tmpdir, py_version, capsys):
    """Explicitly enable SM Training Compiler on the PyTorch QA example and verify the logs."""
    transformers_version = get_transformers_version(ecr_image)
    git_config = {
        'repo': 'https://github.com/huggingface/transformers.git',
        'branch': 'v' + transformers_version,
    }

    # The QA example moved under examples/pytorch/ as of transformers 4.6.
    if Version(transformers_version) < Version("4.6"):
        source_dir = "./examples/question-answering"
    else:
        source_dir = "./examples/pytorch/question-answering"

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = HuggingFace(
            compiler_config=TrainingCompilerConfig(enabled=True),
            entry_point='run_qa.py',
            source_dir=source_dir,
            git_config=git_config,
            metric_definitions=metric_definitions,
            role='SageMakerRole',
            image_uri=ecr_image,
            instance_count=1,
            instance_type="ml.p3.2xlarge",
            sagemaker_session=sagemaker_session,
            hyperparameters=hyperparameters,
            environment={'GPU_NUM_DEVICES': '1'},
            py_version=py_version,
            max_retry_attempts=15,
        )

        job_name = sagemaker.utils.unique_name_from_base('hf-pt-trcomp-single-gpu-enabled')
        estimator.fit(job_name=job_name, logs=True)

        captured = capsys.readouterr()
        combined_logs = captured.out + captured.err
        for expected_message in (
            "Found configuration for Training Compiler",
            "Configuring SM Training Compiler",
            "device: xla",
        ):
            assert expected_message in combined_logs
def test_huggingface_pytorch(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_compiler_latest_version,
    huggingface_training_compiler_pytorch_latest_version,
):
    """End-to-end GLUE (WNLI) fine-tuning on the PyTorch HF container with the Training Compiler."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")

        training_params = {
            "model_name_or_path": "distilbert-base-cased",
            "task_name": "wnli",
            "do_train": True,
            "do_eval": True,
            "max_seq_length": 128,
            "fp16": True,
            "per_device_train_batch_size": 128,
            "output_dir": "/opt/ml/model",
        }

        hf = HuggingFace(
            py_version="py38",
            entry_point=os.path.join(data_path, "run_glue.py"),
            role="SageMakerRole",
            transformers_version=huggingface_training_compiler_latest_version,
            pytorch_version=huggingface_training_compiler_pytorch_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters=training_params,
            environment={"GPU_NUM_DEVICES": "1"},
            sagemaker_session=sagemaker_session,
            disable_profiler=True,
            compiler_config=TrainingCompilerConfig(),
        )

        train_input = hf.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train",
        )
        hf.fit(train_input)
def test_unsupported_BYOC(
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    """A bring-your-own-container image URI is rejected with the Training Compiler."""
    byoc = (
        "1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:"
        "1.9.0-"
        "transformers4.10.2-gpu-"
        "py38-cu111-ubuntu20.04"
    )

    with pytest.raises(ValueError):
        estimator = HuggingFace(
            image_uri=byoc,
            py_version="py38",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            pytorch_version=huggingface_training_compiler_pytorch_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        )
        estimator.fit()
def test_huggingface_tensorflow(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_compiler_latest_version,
    huggingface_training_compiler_tensorflow_latest_version,
):
    """End-to-end fine-tuning on the TensorFlow HF container with the Training Compiler."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")

        training_params = {
            "model_name_or_path": "distilbert-base-cased",
            "per_device_train_batch_size": 128,
            "per_device_eval_batch_size": 128,
            "output_dir": "/opt/ml/model",
            "overwrite_output_dir": True,
            "save_steps": 5500,
        }

        hf = HuggingFace(
            py_version="py38",
            entry_point=os.path.join(data_path, "run_tf.py"),
            role="SageMakerRole",
            transformers_version=huggingface_training_compiler_latest_version,
            tensorflow_version=huggingface_training_compiler_tensorflow_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters=training_params,
            sagemaker_session=sagemaker_session,
            disable_profiler=True,
            compiler_config=TrainingCompilerConfig(),
        )

        train_input = hf.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train",
        )
        hf.fit(train_input)