def test_hf_smdp_multi(instance_types, ecr_image, py_version, sagemaker_session, tmpdir, framework_version):
    """Tests smddprun command via Estimator API distribution parameter."""
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    # SMDataParallel is only available on CUDA 11 images with TF >= 2.3.1.
    unsupported = (
        Version(image_framework_version) < Version("2.3.1")
        or image_cuda_version != "cu110"
    )
    if unsupported:
        pytest.skip("Data Parallelism is only supported on CUDA 11, and on TensorFlow 2.3.1 or higher")

    estimator = HuggingFace(
        entry_point='train.py',
        source_dir=BERT_PATH,
        role='SageMakerRole',
        instance_type="ml.p3.16xlarge",  # multi-GPU instance required by SMDDP
        instance_count=2,
        image_uri=ecr_image,
        framework_version=framework_version,
        py_version=py_version,
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
        debugger_hook_config=False,  # currently needed
    )
    estimator.fit(job_name=unique_name_from_base('test-tf-hf-smdp-multi'))
def _test_hf_smdp_function(ecr_image, sagemaker_session, instance_type, framework_version, py_version, tmpdir, instance_count):
    """Launch a HuggingFace TF training job with SMDataParallel enabled.

    NOTE(review): the ``instance_type`` argument is immediately overwritten
    with "ml.p3.16xlarge" below, and ``image_framework_version`` /
    ``image_cuda_version`` are computed but never checked — a version-skip
    guard (as in the sibling tests) may have been intended here; confirm.
    ``tmpdir`` is also unused.
    """
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    # SMDataParallel needs a multi-GPU instance type.
    instance_type = "ml.p3.16xlarge"
    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
    estimator = HuggingFace(
        entry_point='train.py',
        source_dir=BERT_PATH,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        image_uri=ecr_image,
        framework_version=framework_version,
        py_version=py_version,
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
        distribution=distribution,
        debugger_hook_config=False,  # currently needed
    )
    estimator.fit(job_name=unique_name_from_base("test-tf-hf-smdp-multi"))
def _test_smmp_gpu_function(ecr_image, sagemaker_session, py_version, instances_quantity):
    """Run a SM Model Parallel GLUE training job on p3.16xlarge instances."""
    transformers_version = get_transformers_version(ecr_image)
    # Check out the transformers release matching the image so the example
    # scripts line up with the installed library version.
    git_config = {
        "repo": "https://github.com/huggingface/transformers.git",
        "branch": "v" + transformers_version,
    }
    huggingface_estimator = HuggingFace(
        entry_point="run_glue.py",
        source_dir="./examples/pytorch/text-classification",
        git_config=git_config,
        instance_type="ml.p3.16xlarge",
        instance_count=instances_quantity,
        volume_size=400,
        role="SageMakerRole",
        image_uri=ecr_image,
        distribution=distribution,
        py_version=py_version,
        hyperparameters=hyperparameters,
        sagemaker_session=sagemaker_session,
    )
    huggingface_estimator.fit(
        job_name=sagemaker.utils.unique_name_from_base("test-hf-pt-qa-smmp"))
def test_hf_smdp(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, py_version):
    """Tests SMDataParallel single-node command via script mode.

    Fix: ``py_version`` was referenced in the estimator call but missing
    from the test signature, so pytest never injected the fixture and the
    body raised ``NameError``. It is now requested like in the sibling
    tests (e.g. ``test_hf_smdp_multi``).
    """
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if Version(image_framework_version) < Version("2.3.1") or image_cuda_version != "cu110":
        pytest.skip("Data Parallelism is only supported on CUDA 11, and on TensorFlow 2.3.1 or higher")
    # configuration for running training on smdistributed Data Parallel
    distribution = {'smdistributed': {'dataparallel': {'enabled': True}}}
    # SMDataParallel requires a multi-GPU instance; override the fixture value.
    instance_type = "ml.p3.16xlarge"
    instance_count = 1
    estimator = HuggingFace(
        entry_point='train.py',
        source_dir=BERT_PATH,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        image_uri=ecr_image,
        framework_version=framework_version,
        py_version=py_version,
        distribution=distribution,
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
        debugger_hook_config=False,  # currently needed
    )
    estimator.fit(job_name=unique_name_from_base('test-tf-hf-smdp'))
def _test_smdp_question_answering_function(ecr_image, sagemaker_session, py_version, instances_quantity):
    """Run the transformers QA example with SMDataParallel enabled."""
    transformers_version = get_transformers_version(ecr_image)
    # Pull the example scripts from the transformers tag matching the image.
    git_config = {
        'repo': 'https://github.com/huggingface/transformers.git',
        'branch': 'v' + transformers_version,
    }
    validate_or_skip_smdataparallel(ecr_image)
    # transformers moved the examples under examples/pytorch in v4.6.
    if Version(transformers_version) < Version("4.6"):
        source_dir = "./examples/question-answering"
    else:
        source_dir = "./examples/pytorch/question-answering"
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = HuggingFace(
            entry_point='run_qa.py',
            source_dir=source_dir,
            git_config=git_config,
            metric_definitions=metric_definitions,
            role='SageMakerRole',
            image_uri=ecr_image,
            instance_count=instances_quantity,
            instance_type="ml.p3.16xlarge",  # SMDDP requires multi-GPU hosts
            sagemaker_session=sagemaker_session,
            py_version=py_version,
            distribution=distribution,
            hyperparameters=hyperparameters,
        )
        estimator.fit(job_name=sagemaker.utils.unique_name_from_base(
            'test-hf-pt-qa-smdp'))
def test_trcomp_enabled(self, patched, sagemaker_session, ecr_image, tmpdir, capsys, py_version):
    '''Tests the explicit enabled configuration of SM trcomp.

    Fix: ``py_version`` was used in the estimator call but was not a test
    parameter, so the fixture was never injected and the body raised
    ``NameError``; it is now requested as in the other trcomp tests.
    '''
    instance_type = "ml.p3.2xlarge"
    instance_count = 1
    estimator = HuggingFace(
        compiler_config=TrainingCompilerConfig(enabled=True),
        entry_point="train.py",
        source_dir=BERT_PATH,
        role="SageMakerRole",
        instance_type=instance_type,
        instance_count=instance_count,
        image_uri=ecr_image,
        py_version=py_version,
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
        debugger_hook_config=False,  # currently needed
        max_retry_attempts=15,
    )
    estimator.fit(
        job_name=unique_name_from_base("hf-tf-trcomp-single-gpu-enabled"),
        logs=True)
    # The training container logs this banner when the compiler is picked up.
    captured = capsys.readouterr()
    logs = captured.out + captured.err
    assert "Found configuration for Training Compiler" in logs
def test_single_node_single_gpu_tcc_default(patched, docker_image, processor, instance_type, sagemaker_local_session, py_version, capsys):
    '''
    Single GPU test that tests the local_gpu instance type with default TCC.
    All local mode tests (PT and TF) are run serially on a single instance.
    '''
    training_params = {
        "max_steps": 3,
        "train_batch_size": 4,
        "model_name": "distilbert-base-uncased",
    }
    estimator = HuggingFace(
        compiler_config=TrainingCompilerConfig(),
        entry_point=distrilbert_script,
        instance_type="local_gpu",
        sagemaker_session=sagemaker_local_session,
        image_uri=docker_image,
        instance_count=1,
        role=ROLE,
        hyperparameters=training_params,
        # https://github.com/aws/sagemaker-training-toolkit/issues/107
        environment={'GPU_NUM_DEVICES': '1'},
        py_version=py_version,
    )
    estimator.fit()
def test_huggingface_training_tf(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_latest_version,
    huggingface_tensorflow_latest_version,
):
    """Integration: run the TF training example end to end on a GPU instance."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")
        job_hyperparameters = {
            "model_name_or_path": "distilbert-base-cased",
            "per_device_train_batch_size": 128,
            "per_device_eval_batch_size": 128,
            "output_dir": "/opt/ml/model",
            "overwrite_output_dir": True,
            "save_steps": 5500,
        }
        estimator = HuggingFace(
            py_version="py37",
            entry_point=os.path.join(data_path, "run_tf.py"),
            role="SageMakerRole",
            transformers_version=huggingface_training_latest_version,
            tensorflow_version=huggingface_tensorflow_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters=job_hyperparameters,
            sagemaker_session=sagemaker_session,
            disable_profiler=True,
        )
        # Stage the training data in S3 and kick off the job.
        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train")
        estimator.fit(train_input)
def test_smdp_question_answering(ecr_image, instance_type, py_version, sagemaker_session, tmpdir):
    """Tests SM Distributed DataParallel single-node via script mode."""
    transformers_version = get_transformers_version(ecr_image)
    # Use the example scripts from the transformers tag matching the image.
    git_config = {
        'repo': 'https://github.com/huggingface/transformers.git',
        'branch': 'v' + transformers_version,
    }
    validate_or_skip_smdataparallel(ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = HuggingFace(
            entry_point='run_qa.py',
            source_dir='./examples/question-answering',
            git_config=git_config,
            metric_definitions=metric_definitions,
            role='SageMakerRole',
            image_uri=ecr_image,
            instance_count=1,
            instance_type="ml.p3.16xlarge",  # SMDDP requires multi-GPU hosts
            sagemaker_session=sagemaker_session,
            py_version=py_version,
            distribution=distribution,
            hyperparameters=hyperparameters,
        )
        estimator.fit(job_name=sagemaker.utils.unique_name_from_base(
            'test-hf-pt-qa-smdp'))
def test_trcomp_debug(self, patched, ecr_image, sagemaker_session, tmpdir, py_version, capsys):
    '''Tests the debug mode configuration of SM trcomp.

    Trains with ``TrainingCompilerConfig(debug=True)``, checks the expected
    compiler banners in the job logs, then downloads the job's output
    artifact and verifies the XLA metrics file was produced.
    '''
    transformers_version = get_transformers_version(ecr_image)
    # Use the example scripts from the transformers tag matching the image.
    git_config = {
        'repo': 'https://github.com/huggingface/transformers.git',
        'branch': 'v' + transformers_version
    }
    instance_count = 1
    instance_type = "ml.p3.2xlarge"
    # transformers moved the examples under examples/pytorch in v4.6.
    source_dir = ("./examples/question-answering" if
                  Version(transformers_version) < Version("4.6") else
                  "./examples/pytorch/question-answering")
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = HuggingFace(
            compiler_config=TrainingCompilerConfig(debug=True),
            entry_point='run_qa.py',
            source_dir=source_dir,
            git_config=git_config,
            metric_definitions=metric_definitions,
            role='SageMakerRole',
            image_uri=ecr_image,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters=hyperparameters,
            environment={'GPU_NUM_DEVICES': '1'},
            py_version=py_version,
            max_retry_attempts=15,
        )
        estimator.fit(job_name=sagemaker.utils.unique_name_from_base(
            'hf-pt-trcomp-single-gpu-debug'),
                      logs=True)
    # The compiler banners are emitted into the job logs captured by capsys.
    captured = capsys.readouterr()
    logs = captured.out + captured.err
    assert "Found configuration for Training Compiler" in logs
    assert "Training Compiler set to debug mode" in logs
    assert "Configuring SM Training Compiler" in logs
    assert "device: xla" in logs
    # Debug artifacts land next to the model artifact as output.tar.gz.
    debug_artifact_path = estimator.model_data.replace(
        'model.tar.gz', 'output.tar.gz')
    debug_artifact = os.path.join(tmpdir, 'output.tar.gz')
    subprocess.check_output(
        ['aws', 's3', 'cp', debug_artifact_path, debug_artifact])
    # NOTE(review): extractall on a downloaded archive — the tarball comes
    # from our own training job's S3 output, but consider the `filter=`
    # argument on newer Python versions.
    with tarfile.open(debug_artifact, 'r:gz') as tarball:
        tarball.extractall(path=tmpdir)
    # Debug mode is expected to write per-compilation XLA metrics.
    xla_metrics_file = os.path.join(tmpdir, 'compiler', 'XLA_METRICS_FILE.txt')
    assert os.path.exists(xla_metrics_file)
def test_default_compiler_config(
    time,
    name_from_base,
    sagemaker_session,
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
    instance_class,
):
    """A default TrainingCompilerConfig should produce the expected train call."""
    compiler_config = TrainingCompilerConfig()
    instance_type = f"ml.{instance_class}.xlarge"

    estimator = HuggingFace(
        py_version="py38",
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=instance_type,
        transformers_version=huggingface_training_compiler_version,
        pytorch_version=huggingface_training_compiler_pytorch_version,
        enable_sagemaker_metrics=False,
        compiler_config=compiler_config,
    )
    inputs = "s3://mybucket/train"
    estimator.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    # Only a train call followed by log streaming should hit the session.
    assert [call[0] for call in sagemaker_session.method_calls] == ["train", "logs_for_job"]
    # The boto session should only have been asked for a resource handle.
    assert [call[0] for call in sagemaker_session.boto_session.method_calls] == ["resource"]

    expected_train_args = _create_train_job(
        huggingface_training_compiler_version,
        f"pytorch{huggingface_training_compiler_pytorch_version}",
        instance_type,
        compiler_config,
    )
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
    expected_train_args["enable_sagemaker_metrics"] = False
    # The default config enables the compiler and disables debug mode.
    expected_train_args["hyperparameters"][
        TrainingCompilerConfig.HP_ENABLE_COMPILER] = json.dumps(True)
    expected_train_args["hyperparameters"][
        TrainingCompilerConfig.HP_ENABLE_DEBUG] = json.dumps(False)

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert (
        actual_train_args == expected_train_args
    ), f"{json.dumps(actual_train_args, indent=2)} != {json.dumps(expected_train_args, indent=2)}"
def test_huggingface_invalid_args():
    """Constructor validation: version arguments must be mutually consistent."""
    # Mixing a full transformers version with a shortened pytorch version fails.
    with pytest.raises(ValueError) as excinfo:
        HuggingFace(
            py_version="py36",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version="4.2.1",
            pytorch_version="1.6",
            enable_sagemaker_metrics=False,
        )
    assert "use either full version or shortened version" in str(excinfo)

    # Omitting both transformers_version and image_uri fails.
    with pytest.raises(ValueError) as excinfo:
        HuggingFace(
            py_version="py36",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            pytorch_version="1.6",
            enable_sagemaker_metrics=False,
        )
    assert "transformers_version, and image_uri are both None." in str(excinfo)

    # Omitting both framework versions fails.
    with pytest.raises(ValueError) as excinfo:
        HuggingFace(
            py_version="py36",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version="4.2.1",
            enable_sagemaker_metrics=False,
        )
    assert "tensorflow_version and pytorch_version are both None." in str(excinfo)

    # Supplying both framework versions at once also fails.
    with pytest.raises(ValueError) as excinfo:
        HuggingFace(
            py_version="py36",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version="4.2",
            pytorch_version="1.6",
            tensorflow_version="2.3",
            enable_sagemaker_metrics=False,
        )
    assert "tensorflow_version and pytorch_version are both not None." in str(excinfo)
def create_estimator(self, instance_count):
    """Build an estimator, enabling SMDataParallel unless the script is native DDP."""
    variant = 'ddp' if 'ddp' in self.script else 'smd'
    job_name = f"{self.env.base_job_name}-{instance_count}-{variant}"

    # distributed data settings — run_ddp.py handles distribution itself
    if self.script == "run_ddp.py":
        distribution = None
    else:
        distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

    job_hyperparameters = dict(self.env.distributed_hyperparameters)
    job_hyperparameters["model_name_or_path"] = self.model_name_or_path

    # creates estimator
    return HuggingFace(
        entry_point=self.script,
        source_dir=self.env.test_path,
        role=self.env.role,
        image_uri=self.env.image_uri,
        base_job_name=job_name,
        instance_count=instance_count,
        instance_type=self.instance_type,
        debugger_hook_config=False,
        hyperparameters=job_hyperparameters,
        metric_definitions=self.env.metric_definitions,
        distribution=distribution,
        py_version="py36",
    )
def test_attach_custom_image(sagemaker_session):
    """attach() should surface the custom training image from the job description."""
    training_image = "pytorch:latest"
    job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image,
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-east-1"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=job_description)

    estimator = HuggingFace.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.image_uri == training_image
def test_distilbert_base(docker_image, processor, instance_type, sagemaker_local_session, py_version):
    """Smoke-test a short distilbert training run in local GPU mode."""
    # hyperparameters, which are passed into the training job
    training_params = {
        "max_steps": 5,
        "train_batch_size": 4,
        "model_name": "distilbert-base-uncased",
    }
    estimator = HuggingFace(
        entry_point=distrilbert_script,
        instance_type="local_gpu",
        sagemaker_session=sagemaker_local_session,
        image_uri=docker_image,
        instance_count=1,
        role=ROLE,
        py_version=py_version,
        hyperparameters=training_params,
    )
    estimator.fit()
def test_smmp_gpu_multinode(sagemaker_session, framework_version, ecr_image, instance_type, py_version, dist_gpu_backend):
    """Run SM Model Parallel GLUE training across two p3.16xlarge nodes."""
    huggingface_estimator = HuggingFace(
        entry_point='run_glue.py',
        source_dir='./sagemaker/04_distributed_training_model_parallelism/scripts/',
        git_config=git_config,
        instance_type='ml.p3.16xlarge',  # multi-GPU hosts required for SMMP
        instance_count=2,
        volume_size=400,
        role='SageMakerRole',
        image_uri=ecr_image,
        distribution=distribution,
        py_version=py_version,
        hyperparameters=hyperparameters,
        sagemaker_session=sagemaker_session,
    )
    huggingface_estimator.fit(
        job_name=sagemaker.utils.unique_name_from_base('test-hf-pt-qa-smmp-multi'))
def test_trcomp_enabled(self, patched, ecr_image, sagemaker_session, tmpdir, py_version, capsys):
    '''Tests the explicit enabled configuration of SM trcomp.'''
    transformers_version = get_transformers_version(ecr_image)
    # Use the example scripts from the transformers tag matching the image.
    git_config = {
        'repo': 'https://github.com/huggingface/transformers.git',
        'branch': 'v' + transformers_version,
    }
    # transformers moved the examples under examples/pytorch in v4.6.
    if Version(transformers_version) < Version("4.6"):
        source_dir = "./examples/question-answering"
    else:
        source_dir = "./examples/pytorch/question-answering"
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = HuggingFace(
            compiler_config=TrainingCompilerConfig(enabled=True),
            entry_point='run_qa.py',
            source_dir=source_dir,
            git_config=git_config,
            metric_definitions=metric_definitions,
            role='SageMakerRole',
            image_uri=ecr_image,
            instance_count=1,
            instance_type="ml.p3.2xlarge",
            sagemaker_session=sagemaker_session,
            hyperparameters=hyperparameters,
            environment={'GPU_NUM_DEVICES': '1'},
            py_version=py_version,
            max_retry_attempts=15,
        )
        estimator.fit(job_name=sagemaker.utils.unique_name_from_base(
            'hf-pt-trcomp-single-gpu-enabled'),
                      logs=True)
    # The training container logs these banners when the compiler is active.
    captured = capsys.readouterr()
    logs = captured.out + captured.err
    assert "Found configuration for Training Compiler" in logs
    assert "Configuring SM Training Compiler" in logs
    assert "device: xla" in logs
def test_huggingface(
    time,
    name_from_base,
    sagemaker_session,
    huggingface_training_version,
    huggingface_pytorch_training_version,
    huggingface_pytorch_training_py_version,
):
    """fit() should issue a single train call with the expected arguments."""
    estimator = HuggingFace(
        py_version=huggingface_pytorch_training_py_version,
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=GPU_INSTANCE_TYPE,
        transformers_version=huggingface_training_version,
        pytorch_version=huggingface_pytorch_training_version,
        enable_sagemaker_metrics=False,
    )
    inputs = "s3://mybucket/train"
    estimator.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    # Only a train call followed by log streaming should hit the session.
    assert [call[0] for call in sagemaker_session.method_calls] == ["train", "logs_for_job"]
    # The boto session should only have been asked for a resource handle.
    assert [call[0] for call in sagemaker_session.boto_session.method_calls] == ["resource"]

    expected_train_args = _create_train_job(
        huggingface_training_version,
        f"pytorch{huggingface_pytorch_training_version}")
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG
    expected_train_args["enable_sagemaker_metrics"] = False

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args
def test_huggingface_training(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_latest_version,
    huggingface_pytorch_latest_version,
):
    """Integration: train the GLUE text-classification example end to end."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")
        job_hyperparameters = {
            "model_name_or_path": "distilbert-base-cased",
            "task_name": "wnli",
            "do_train": True,
            "do_eval": True,
            "max_seq_length": 128,
            "fp16": True,
            "per_device_train_batch_size": 128,
            "output_dir": "/opt/ml/model",
        }
        estimator = HuggingFace(
            py_version="py36",
            entry_point="examples/text-classification/run_glue.py",
            role="SageMakerRole",
            transformers_version=huggingface_training_latest_version,
            pytorch_version=huggingface_pytorch_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters=job_hyperparameters,
            sagemaker_session=sagemaker_session,
            # Check out the matching transformers release for the example script.
            git_config={
                "repo": "https://github.com/huggingface/transformers.git",
                "branch": f"v{huggingface_training_latest_version}",
            },
            disable_profiler=True,
        )
        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train",
        )
        estimator.fit(train_input)
def test_huggingface_pytorch(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_compiler_latest_version,
    huggingface_training_compiler_pytorch_latest_version,
):
    """Integration: GLUE training with the SM Training Compiler enabled."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")
        job_hyperparameters = {
            "model_name_or_path": "distilbert-base-cased",
            "task_name": "wnli",
            "do_train": True,
            "do_eval": True,
            "max_seq_length": 128,
            "fp16": True,
            "per_device_train_batch_size": 128,
            "output_dir": "/opt/ml/model",
        }
        estimator = HuggingFace(
            py_version="py38",
            entry_point=os.path.join(data_path, "run_glue.py"),
            role="SageMakerRole",
            transformers_version=huggingface_training_compiler_latest_version,
            pytorch_version=huggingface_training_compiler_pytorch_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters=job_hyperparameters,
            environment={"GPU_NUM_DEVICES": "1"},
            sagemaker_session=sagemaker_session,
            disable_profiler=True,
            compiler_config=TrainingCompilerConfig(),
        )
        train_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train",
        )
        estimator.fit(train_input)
def test_attach(
    sagemaker_session, huggingface_training_version, huggingface_pytorch_training_version
):
    """attach() should reconstruct the estimator from a job description,
    parsing framework/transformers/py versions out of the image URI."""
    training_image = (
        f"1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:"
        f"{huggingface_pytorch_training_version}-"
        f"transformers{huggingface_training_version}-gpu-py36-cu110-ubuntu18.04"
    )
    job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image,
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-east-1"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=job_description)

    estimator = HuggingFace.attach(training_job_name="neo", sagemaker_session=sagemaker_session)

    assert estimator.latest_training_job.job_name == "neo"
    # Versions are parsed back out of the image tag.
    assert estimator.py_version == "py36"
    assert estimator.framework_version == huggingface_training_version
    assert estimator.pytorch_version == huggingface_pytorch_training_version
    # Remaining fields come straight from the job description.
    assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
    assert estimator.instance_count == 1
    assert estimator.max_run == 24 * 60 * 60
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == "s3://place/output/neo"
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "iris-dnn-classifier.py"
def create_estimator(self, instance_count=1):
    """Build a single-node HuggingFace estimator for this test environment."""
    job_hyperparameters = dict(self.env.hyperparameters)
    job_hyperparameters["model_name_or_path"] = self.model_name_or_path
    # creates estimator
    return HuggingFace(
        entry_point=self.script,
        source_dir=self.env.test_path,
        role=self.env.role,
        image_uri=self.env.image_uri,
        base_job_name=f"{self.env.base_job_name}-single",
        instance_count=instance_count,
        instance_type=self.instance_type,
        debugger_hook_config=False,
        hyperparameters=job_hyperparameters,
        metric_definitions=self.env.metric_definitions,
        py_version="py36",
    )
def test_unsupported_framework_mxnet(huggingface_training_compiler_version):
    """The training compiler must reject an MXNet framework version."""
    # Build an impossibly high mxnet version with the same number of parts
    # as the transformers version, so only the framework itself is invalid.
    parts = len(huggingface_training_compiler_version.split("."))
    bogus_mxnet_version = ".".join(["99"] * parts)
    with pytest.raises(ValueError):
        HuggingFace(
            py_version="py38",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            mxnet_version=bogus_mxnet_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()
def test_unsupported_python_2(
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    """The training compiler must reject a Python 2 runtime."""
    with pytest.raises(ValueError):
        HuggingFace(
            py_version="py27",  # Python 2 is unsupported
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            pytorch_version=huggingface_training_compiler_pytorch_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()
def create_estimator(self, instance_count):
    """Build a HuggingFace estimator configured for SM Model Parallel over MPI."""
    # configuration for running training on smdistributed Model Parallel
    distribution = {
        "smdistributed": {
            "modelparallel": {
                "enabled": True,
                "parameters": {
                    "microbatches": 4,
                    "placement_strategy": "spread",
                    "pipeline": "interleaved",
                    "optimize": "speed",
                    "partitions": 4,
                    "ddp": True,
                },
            },
        },
        "mpi": {
            "enabled": True,
            "processes_per_host": 8,
        },
    }
    # job name reflects whether the stock Trainer or the SMP trainer runs
    if self.script == "run_glue.py":
        name_extension = "trainer"
    else:
        name_extension = "smtrainer"

    job_hyperparameters = dict(self.env.hyperparameters)
    job_hyperparameters.update({
        "model_name_or_path": self.model_name_or_path,
        "max_steps": 500,
    })
    # creates estimator
    return HuggingFace(
        entry_point=self.script,
        source_dir=self.env.test_path,
        role=self.env.role,
        image_uri=self.env.image_uri,
        base_job_name=f"{self.env.base_job_name}-{instance_count}-smp-{name_extension}",
        instance_count=instance_count,
        instance_type=self.instance_type,
        debugger_hook_config=False,
        hyperparameters=job_hyperparameters,
        metric_definitions=self.env.metric_definitions,
        distribution=distribution,
        py_version="py36",
    )
def test_unsupported_BYOC(
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    """The training compiler must reject a bring-your-own-container image_uri."""
    byoc = (
        "1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:"
        "1.9.0-"
        "transformers4.10.2-gpu-"
        "py38-cu111-ubuntu20.04")
    with pytest.raises(ValueError):
        HuggingFace(
            image_uri=byoc,  # explicit image_uri is incompatible with trcomp
            py_version="py38",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            pytorch_version=huggingface_training_compiler_pytorch_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()
def _huggingface_estimator(
    sagemaker_session,
    framework_version,
    pytorch_version,
    tensorflow_version,
    py_version,
    instance_type=None,
    base_job_name=None,
    **kwargs,
):
    """Factory for a HuggingFace estimator with the test suite's defaults applied."""
    return HuggingFace(
        entry_point=SCRIPT_PATH,
        framework_version=framework_version,
        py_version=py_version,
        pytorch_version=pytorch_version,
        tensorflow_version=tensorflow_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        # fall back to the module default when no instance type is given
        instance_type=instance_type or INSTANCE_TYPE,
        base_job_name=base_job_name,
        **kwargs,
    )
def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
    """Launch the given training script as a SageMaker HuggingFace job.

    Sets AWS region/credential environment variables from ``sagemaker_config``
    and ``args``, converts the script's CLI arguments into SageMaker
    hyperparameters, builds a HuggingFace estimator, and calls ``fit()``.

    Raises:
        ImportError: if the sagemaker package is not installed.
        EnvironmentError: if neither a profile nor key-pair credentials are given.
        ValueError: if the training script is not a ``.py`` file.
    """
    if not is_sagemaker_available():
        raise ImportError(
            "Please install sagemaker to be able to launch training on Amazon SageMaker with `pip install accelerate[sagemaker]`"
        )
    # Imported lazily so the module loads even without sagemaker installed.
    from sagemaker.huggingface import HuggingFace

    # configure environment
    print("Configuring Amazon SageMaker environment")
    os.environ["AWS_DEFAULT_REGION"] = sagemaker_config.region

    # configure credentials: a named profile takes precedence over raw keys
    if sagemaker_config.profile is not None:
        os.environ["AWS_PROFILE"] = sagemaker_config.profile
    elif args.aws_access_key_id is not None and args.aws_secret_access_key is not None:
        os.environ["AWS_ACCESS_KEY_ID"] = args.aws_access_key_id
        os.environ["AWS_SECRET_ACCESS_KEY"] = args.aws_secret_access_key
    else:
        raise EnvironmentError(
            "You need to provide an aws_access_key_id and aws_secret_access_key when not using aws_profile"
        )

    # extract needed arguments
    source_dir = os.path.dirname(args.training_script)
    if not source_dir:  # checks if string is empty
        source_dir = "."
    entry_point = os.path.basename(args.training_script)
    if not entry_point.endswith(".py"):
        raise ValueError(
            f'Your training script should be a python script and not "{entry_point}"'
        )

    print("Converting Arguments to Hyperparameters")
    hyperparameters = _convert_nargs_to_dict(args.training_script_args)

    # NOTE(review): args.fp16 is passed through unconverted — if it is a bool,
    # confirm SageMaker accepts non-string environment values here.
    environment = {
        "USE_FP16": args.fp16
    }  # Environment variables to be set for use during training job

    # configure distribution set up
    distribution = None  # TODO: not yet implemented

    # configure session
    print("Creating Estimator")
    huggingface_estimator = HuggingFace(
        entry_point=entry_point,
        source_dir=source_dir,
        role=sagemaker_config.iam_role_name,
        transformers_version="4.4",
        pytorch_version="1.6",
        py_version="py36",
        base_job_name=sagemaker_config.base_job_name,
        instance_count=sagemaker_config.num_machines,
        instance_type=sagemaker_config.ec2_instance_type,
        debugger_hook_config=False,
        distribution=distribution,
        hyperparameters=hyperparameters,
        environment=environment,
    )

    huggingface_estimator.fit()
    print(
        f"You can find your model data at: {huggingface_estimator.model_data}")
# instance configurations # instance_type = "ml.p3.16xlarge" instance_type = "ml.p3.2xlarge" # instance_type = "ml.p3dn.24xlarge" instance_count = 1 # volume_size=200 image_uri = "570106654206.dkr.ecr.us-east-1.amazonaws.com/keras-smddp-private-preview:tf-2-4-1-hf-keras-05-27-06-37-16-a10645a1" huggingface_estimator = HuggingFace( # distibuted script, entry_point="train.py", # single_node script, # entry_point="singe_node_train.py", source_dir="./scripts", instance_type=instance_type, role=role, session=sess, instance_count=instance_count, image_uri=image_uri, # transformers_version="4.5.0", # tensorflow_version="2.4.1", py_version="py37", distribution=distribution, hyperparameters=hyperparameters, base_job_name="hf-tf-bert-" + str(instance_count) + "node-" + instance_type.replace(".", "-"), debugger_hook_config=False, # currently needed ) huggingface_estimator.fit()
def test_distilbert_base(docker_image, processor, instance_type, sagemaker_local_session, py_version):
    """Prepare the IMDB dataset on S3, then train distilbert in local GPU mode.

    NOTE(review): ``dataset = load_dataset(dataset_name)`` below is never used
    (the train/test splits are loaded again separately); the in-place
    ``rename_column_`` and ``save_to_disk(..., fs=...)`` calls rely on older
    `datasets` APIs — confirm against the pinned datasets version.
    """
    from datasets import load_dataset
    from transformers import AutoTokenizer

    # tokenizer used in preprocessing
    tokenizer_name = 'distilbert-base-uncased'
    # dataset used
    dataset_name = 'imdb'
    # s3 key prefix for the data
    s3_prefix = 'samples/datasets/imdb'
    # load dataset
    dataset = load_dataset(dataset_name)
    # download tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # tokenizer helper function
    def tokenize(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True)

    # load dataset
    train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])
    test_dataset = test_dataset.shuffle().select(
        range(100))  # smaller the size for test dataset to 10k

    # tokenize dataset
    train_dataset = train_dataset.map(tokenize,
                                      batched=True,
                                      batch_size=len(train_dataset))
    test_dataset = test_dataset.map(tokenize,
                                    batched=True,
                                    batch_size=len(test_dataset))

    # set format for pytorch
    train_dataset.rename_column_("label", "labels")
    train_dataset.set_format('torch',
                             columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset.rename_column_("label", "labels")
    test_dataset.set_format('torch',
                            columns=['input_ids', 'attention_mask', 'labels'])

    # hyperparameters, which are passed into the training job
    hyperparameters = {
        'max_steps': 5,
        'train_batch_size': 4,
        'model_name': 'distilbert-base-uncased'
    }

    s3 = S3FileSystem()
    # save train_dataset to s3
    training_input_path = f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/train'
    train_dataset.save_to_disk(training_input_path, fs=s3)
    # save test_dataset to s3
    test_input_path = f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/test'
    test_dataset.save_to_disk(test_input_path, fs=s3)

    estimator = HuggingFace(entry_point=distrilbert_script,
                            instance_type='local_gpu',
                            sagemaker_session=sagemaker_local_session,
                            image_uri=docker_image,
                            instance_count=1,
                            role=ROLE,
                            py_version=py_version,
                            hyperparameters=hyperparameters)
    estimator.fit({
        'train':
        f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/train',
        'test':
        f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/test'
    })