def test_hf_smdp_multi(instance_types, ecr_image, py_version, sagemaker_session, tmpdir, framework_version):
    """
    Tests smddprun command via Estimator API distribution parameter
    """
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if Version(image_framework_version) < Version("2.3.1") or image_cuda_version != "cu110":
        pytest.skip("Data Parallelism is only supported on CUDA 11, and on TensorFlow 2.3.1 or higher")

    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
    instance_type = "ml.p3.16xlarge"
    instance_count = 2

    estimator = HuggingFace(
        entry_point='train.py',
        source_dir=BERT_PATH,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        image_uri=ecr_image,
        framework_version=framework_version,
        py_version=py_version,
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
        distribution=distribution,
        debugger_hook_config=False,  # currently needed
    )

    estimator.fit(job_name=unique_name_from_base('test-tf-hf-smdp-multi'))
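
These test excerpts reference module-level names that are not shown, such as BERT_PATH, hyperparameters, and the SageMaker SDK imports. A minimal sketch of plausible definitions follows; the concrete values are assumptions for illustration only, and helpers such as get_framework_and_version_from_tag, get_cuda_version_from_tag, and timeout come from the repository's own test utilities and are not reproduced here.

# Assumed module-level setup for these excerpts; paths and hyperparameter values are illustrative.
import pytest
from packaging.version import Version
from sagemaker.huggingface import HuggingFace
from sagemaker.utils import unique_name_from_base

BERT_PATH = "resources/bert"  # assumed location of train.py
hyperparameters = {"max_steps": 5, "train_batch_size": 4, "model_name": "distilbert-base-uncased"}
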
def _test_hf_smdp_function(ecr_image, sagemaker_session, instance_type,
                           framework_version, py_version, tmpdir,
                           instance_count):
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)

    instance_type = "ml.p3.16xlarge"
    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

    estimator = HuggingFace(
        entry_point='train.py',
        source_dir=BERT_PATH,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        image_uri=ecr_image,
        framework_version=framework_version,
        py_version=py_version,
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
        distribution=distribution,
        debugger_hook_config=False,  # currently needed
    )

    estimator.fit(job_name=unique_name_from_base("test-tf-hf-smdp-multi"))
Example #3
def _test_smmp_gpu_function(ecr_image, sagemaker_session, py_version,
                            instances_quantity):
    instance_type = "ml.p3.16xlarge"
    instance_count = instances_quantity
    volume_size = 400

    transformers_version = get_transformers_version(ecr_image)
    git_config = {
        "repo": "https://github.com/huggingface/transformers.git",
        "branch": "v" + transformers_version
    }

    huggingface_estimator = HuggingFace(
        entry_point="run_glue.py",
        source_dir="./examples/pytorch/text-classification",
        git_config=git_config,
        instance_type=instance_type,
        instance_count=instance_count,
        volume_size=volume_size,
        role="SageMakerRole",
        image_uri=ecr_image,
        distribution=distribution,
        py_version=py_version,
        hyperparameters=hyperparameters,
        sagemaker_session=sagemaker_session,
    )
    huggingface_estimator.fit(
        job_name=sagemaker.utils.unique_name_from_base("test-hf-pt-qa-smmp"))
def test_hf_smdp(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, py_version):
    """
    Tests SMDataParallel single-node command via script mode
    """
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if Version(image_framework_version) < Version("2.3.1") or image_cuda_version != "cu110":
        pytest.skip("Data Parallelism is only supported on CUDA 11, and on TensorFlow 2.3.1 or higher")

    # configuration for running training on smdistributed Data Parallel
    distribution = {'smdistributed': {'dataparallel': {'enabled': True}}}

    instance_type = "ml.p3.16xlarge"
    instance_count = 1

    estimator = HuggingFace(
        entry_point='train.py',
        source_dir=BERT_PATH,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        image_uri=ecr_image,
        framework_version=framework_version,
        py_version=py_version,
        distribution=distribution,
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
        debugger_hook_config=False,  # currently needed
    )

    estimator.fit(job_name=unique_name_from_base('test-tf-hf-smdp'))

def _test_smdp_question_answering_function(ecr_image, sagemaker_session,
                                           py_version, instances_quantity):
    transformers_version = get_transformers_version(ecr_image)
    git_config = {
        'repo': 'https://github.com/huggingface/transformers.git',
        'branch': 'v' + transformers_version
    }

    validate_or_skip_smdataparallel(ecr_image)

    instance_count = instances_quantity
    instance_type = "ml.p3.16xlarge"

    source_dir = ("./examples/question-answering"
                  if Version(transformers_version) < Version("4.6") else
                  "./examples/pytorch/question-answering")

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = HuggingFace(
            entry_point='run_qa.py',
            source_dir=source_dir,
            git_config=git_config,
            metric_definitions=metric_definitions,
            role='SageMakerRole',
            image_uri=ecr_image,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            py_version=py_version,
            distribution=distribution,
            hyperparameters=hyperparameters,
        )
        estimator.fit(job_name=sagemaker.utils.unique_name_from_base(
            'test-hf-pt-qa-smdp'))
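
The question-answering excerpts pass a module-level metric_definitions list that is not shown. SageMaker expects a list of Name/Regex pairs; a sketch follows, with the metric names and regular expressions assumed for illustration.

# Assumed metric definitions for the run_qa.py excerpts; names and regexes are illustrative.
metric_definitions = [
    {"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
    {"Name": "eval_f1", "Regex": r"eval_f1.*=\D*(.*?)$"},
    {"Name": "eval_exact_match", "Regex": r"eval_exact_match.*=\D*(.*?)$"},
]
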
Example #6
    def test_trcomp_enabled(self, patched, sagemaker_session, ecr_image,
                            tmpdir, py_version, capsys):
        '''
        Tests the explicit enabled configuration of SM trcomp
        '''
        instance_type = "ml.p3.2xlarge"
        instance_count = 1

        estimator = HuggingFace(
            compiler_config=TrainingCompilerConfig(enabled=True),
            entry_point="train.py",
            source_dir=BERT_PATH,
            role="SageMakerRole",
            instance_type=instance_type,
            instance_count=instance_count,
            image_uri=ecr_image,
            py_version=py_version,
            sagemaker_session=sagemaker_session,
            hyperparameters=hyperparameters,
            debugger_hook_config=False,  # currently needed
            max_retry_attempts=15,
        )

        estimator.fit(
            job_name=unique_name_from_base("hf-tf-trcomp-single-gpu-enabled"),
            logs=True)

        captured = capsys.readouterr()
        logs = captured.out + captured.err
        assert "Found configuration for Training Compiler" in logs
Example #7
def test_single_node_single_gpu_tcc_default(patched, docker_image, processor,
                                            instance_type,
                                            sagemaker_local_session,
                                            py_version, capsys):
    '''
    Single GPU test that tests the local_gpu instance type with default TCC.
    All local mode tests (PT and TF) are run serially on a single instance.
    '''
    hyperparameters = {
        "max_steps": 3,
        "train_batch_size": 4,
        "model_name": "distilbert-base-uncased"
    }

    estimator = HuggingFace(
        compiler_config=TrainingCompilerConfig(),
        entry_point=distrilbert_script,
        instance_type="local_gpu",
        sagemaker_session=sagemaker_local_session,
        image_uri=docker_image,
        instance_count=1,
        role=ROLE,
        hyperparameters=hyperparameters,
        environment={
            'GPU_NUM_DEVICES': '1'
        },  #https://github.com/aws/sagemaker-training-toolkit/issues/107
        py_version=py_version,
    )

    estimator.fit()
Example #8
def test_huggingface_training_tf(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_latest_version,
    huggingface_tensorflow_latest_version,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")

        hf = HuggingFace(
            py_version="py37",
            entry_point=os.path.join(data_path, "run_tf.py"),
            role="SageMakerRole",
            transformers_version=huggingface_training_latest_version,
            tensorflow_version=huggingface_tensorflow_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters={
                "model_name_or_path": "distilbert-base-cased",
                "per_device_train_batch_size": 128,
                "per_device_eval_batch_size": 128,
                "output_dir": "/opt/ml/model",
                "overwrite_output_dir": True,
                "save_steps": 5500,
            },
            sagemaker_session=sagemaker_session,
            disable_profiler=True,
        )

        train_input = hf.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train")

        hf.fit(train_input)
Example #9
def test_smdp_question_answering(ecr_image, instance_type, py_version,
                                 sagemaker_session, tmpdir):
    """
    Tests SM Distributed DataParallel single-node via script mode
    """
    transformers_version = get_transformers_version(ecr_image)
    git_config = {
        'repo': 'https://github.com/huggingface/transformers.git',
        'branch': 'v' + transformers_version
    }

    validate_or_skip_smdataparallel(ecr_image)
    instance_count = 1
    instance_type = "ml.p3.16xlarge"
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = HuggingFace(
            entry_point='run_qa.py',
            source_dir='./examples/question-answering',
            git_config=git_config,
            metric_definitions=metric_definitions,
            role='SageMakerRole',
            image_uri=ecr_image,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            py_version=py_version,
            distribution=distribution,
            hyperparameters=hyperparameters,
        )
        estimator.fit(job_name=sagemaker.utils.unique_name_from_base(
            'test-hf-pt-qa-smdp'))
Example #10
    def test_trcomp_debug(self, patched, ecr_image, sagemaker_session, tmpdir,
                          py_version, capsys):
        '''
        Tests the debug mode configuration of SM trcomp
        '''
        transformers_version = get_transformers_version(ecr_image)
        git_config = {
            'repo': 'https://github.com/huggingface/transformers.git',
            'branch': 'v' + transformers_version
        }

        instance_count = 1
        instance_type = "ml.p3.2xlarge"

        source_dir = ("./examples/question-answering"
                      if Version(transformers_version) < Version("4.6") else
                      "./examples/pytorch/question-answering")

        with timeout(minutes=DEFAULT_TIMEOUT):
            estimator = HuggingFace(
                compiler_config=TrainingCompilerConfig(debug=True),
                entry_point='run_qa.py',
                source_dir=source_dir,
                git_config=git_config,
                metric_definitions=metric_definitions,
                role='SageMakerRole',
                image_uri=ecr_image,
                instance_count=instance_count,
                instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                hyperparameters=hyperparameters,
                environment={'GPU_NUM_DEVICES': '1'},
                py_version=py_version,
                max_retry_attempts=15,
            )
            estimator.fit(job_name=sagemaker.utils.unique_name_from_base(
                'hf-pt-trcomp-single-gpu-debug'),
                          logs=True)

        captured = capsys.readouterr()
        logs = captured.out + captured.err
        assert "Found configuration for Training Compiler" in logs
        assert "Training Compiler set to debug mode" in logs
        assert "Configuring SM Training Compiler" in logs
        assert "device: xla" in logs

        debug_artifact_path = estimator.model_data.replace(
            'model.tar.gz', 'output.tar.gz')
        debug_artifact = os.path.join(tmpdir, 'output.tar.gz')
        subprocess.check_output(
            ['aws', 's3', 'cp', debug_artifact_path, debug_artifact])
        with tarfile.open(debug_artifact, 'r:gz') as tarball:
            tarball.extractall(path=tmpdir)
        xla_metrics_file = os.path.join(tmpdir, 'compiler',
                                        'XLA_METRICS_FILE.txt')
        assert os.path.exists(xla_metrics_file)

def test_default_compiler_config(
    time,
    name_from_base,
    sagemaker_session,
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
    instance_class,
):
    compiler_config = TrainingCompilerConfig()
    instance_type = f"ml.{instance_class}.xlarge"

    hf = HuggingFace(
        py_version="py38",
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=instance_type,
        transformers_version=huggingface_training_compiler_version,
        pytorch_version=huggingface_training_compiler_pytorch_version,
        enable_sagemaker_metrics=False,
        compiler_config=compiler_config,
    )

    inputs = "s3://mybucket/train"

    hf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]
    boto_call_names = [
        c[0] for c in sagemaker_session.boto_session.method_calls
    ]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(
        huggingface_training_compiler_version,
        f"pytorch{huggingface_training_compiler_pytorch_version}",
        instance_type,
        compiler_config,
    )
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] = inputs
    expected_train_args["enable_sagemaker_metrics"] = False
    expected_train_args["hyperparameters"][
        TrainingCompilerConfig.HP_ENABLE_COMPILER] = json.dumps(True)
    expected_train_args["hyperparameters"][
        TrainingCompilerConfig.HP_ENABLE_DEBUG] = json.dumps(False)

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert (
        actual_train_args == expected_train_args
    ), f"{json.dumps(actual_train_args, indent=2)} != {json.dumps(expected_train_args, indent=2)}"
Example #12
def test_huggingface_invalid_args():
    with pytest.raises(ValueError) as error:
        HuggingFace(
            py_version="py36",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version="4.2.1",
            pytorch_version="1.6",
            enable_sagemaker_metrics=False,
        )
    assert "use either full version or shortened version" in str(error)

    with pytest.raises(ValueError) as error:
        HuggingFace(
            py_version="py36",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            pytorch_version="1.6",
            enable_sagemaker_metrics=False,
        )
    assert "transformers_version, and image_uri are both None." in str(error)

    with pytest.raises(ValueError) as error:
        HuggingFace(
            py_version="py36",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version="4.2.1",
            enable_sagemaker_metrics=False,
        )
    assert "tensorflow_version and pytorch_version are both None." in str(
        error)

    with pytest.raises(ValueError) as error:
        HuggingFace(
            py_version="py36",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version="4.2",
            pytorch_version="1.6",
            tensorflow_version="2.3",
            enable_sagemaker_metrics=False,
        )
    assert "tensorflow_version and pytorch_version are both not None." in str(
        error)

    def create_estimator(self, instance_count):
        job_name = f"{self.env.base_job_name}-{instance_count}-{'ddp' if 'ddp' in self.script else 'smd'}"
        # distributed data settings
        distribution = (
            {"smdistributed": {"dataparallel": {"enabled": True}}}
            if self.script != "run_ddp.py"
            else None
        )

        # creates estimator
        return HuggingFace(
            entry_point=self.script,
            source_dir=self.env.test_path,
            role=self.env.role,
            image_uri=self.env.image_uri,
            base_job_name=job_name,
            instance_count=instance_count,
            instance_type=self.instance_type,
            debugger_hook_config=False,
            hyperparameters={
                **self.env.distributed_hyperparameters, "model_name_or_path":
                self.model_name_or_path
            },
            metric_definitions=self.env.metric_definitions,
            distribution=distribution,
            py_version="py36",
        )
Example #14
def test_attach_custom_image(sagemaker_session):
    training_image = "pytorch:latest"
    returned_job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-east-1"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description
    )

    estimator = HuggingFace.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.image_uri == training_image

def test_distilbert_base(docker_image, processor, instance_type, sagemaker_local_session, py_version):

    # hyperparameters, which are passed into the training job
    hyperparameters = {"max_steps": 5, "train_batch_size": 4, "model_name": "distilbert-base-uncased"}

    estimator = HuggingFace(
        entry_point=distrilbert_script,
        instance_type="local_gpu",
        sagemaker_session=sagemaker_local_session,
        image_uri=docker_image,
        instance_count=1,
        role=ROLE,
        py_version=py_version,
        hyperparameters=hyperparameters,
    )

    estimator.fit()
Example #16
def test_smmp_gpu_multinode(sagemaker_session, framework_version, ecr_image, instance_type, py_version, dist_gpu_backend):
    instance_type = 'ml.p3.16xlarge'
    instance_count = 2
    volume_size = 400

    huggingface_estimator = HuggingFace(entry_point='run_glue.py',
                                        source_dir='./sagemaker/04_distributed_training_model_parallelism/scripts/',
                                        git_config=git_config,
                                        instance_type=instance_type,
                                        instance_count=instance_count,
                                        volume_size=volume_size,
                                        role='SageMakerRole',
                                        image_uri=ecr_image,
                                        distribution=distribution,
                                        py_version=py_version,
                                        hyperparameters=hyperparameters,
                                        sagemaker_session=sagemaker_session)
    huggingface_estimator.fit(job_name=sagemaker.utils.unique_name_from_base('test-hf-pt-qa-smmp-multi'))
Example #17
    def test_trcomp_enabled(self, patched, ecr_image, sagemaker_session,
                            tmpdir, py_version, capsys):
        '''
        Tests the explicit enabled configuration of SM trcomp
        '''
        transformers_version = get_transformers_version(ecr_image)
        git_config = {
            'repo': 'https://github.com/huggingface/transformers.git',
            'branch': 'v' + transformers_version
        }

        instance_count = 1
        instance_type = "ml.p3.2xlarge"

        source_dir = ("./examples/question-answering"
                      if Version(transformers_version) < Version("4.6") else
                      "./examples/pytorch/question-answering")

        with timeout(minutes=DEFAULT_TIMEOUT):
            estimator = HuggingFace(
                compiler_config=TrainingCompilerConfig(enabled=True),
                entry_point='run_qa.py',
                source_dir=source_dir,
                git_config=git_config,
                metric_definitions=metric_definitions,
                role='SageMakerRole',
                image_uri=ecr_image,
                instance_count=instance_count,
                instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                hyperparameters=hyperparameters,
                environment={'GPU_NUM_DEVICES': '1'},
                py_version=py_version,
                max_retry_attempts=15,
            )
            estimator.fit(job_name=sagemaker.utils.unique_name_from_base(
                'hf-pt-trcomp-single-gpu-enabled'),
                          logs=True)
        captured = capsys.readouterr()
        logs = captured.out + captured.err
        assert "Found configuration for Training Compiler" in logs
        assert "Configuring SM Training Compiler" in logs
        assert "device: xla" in logs
Example #18
def test_huggingface(
    time,
    name_from_base,
    sagemaker_session,
    huggingface_training_version,
    huggingface_pytorch_training_version,
    huggingface_pytorch_training_py_version,
):
    hf = HuggingFace(
        py_version=huggingface_pytorch_training_py_version,
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=GPU_INSTANCE_TYPE,
        transformers_version=huggingface_training_version,
        pytorch_version=huggingface_pytorch_training_version,
        enable_sagemaker_metrics=False,
    )

    inputs = "s3://mybucket/train"

    hf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]
    boto_call_names = [
        c[0] for c in sagemaker_session.boto_session.method_calls
    ]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(
        huggingface_training_version,
        f"pytorch{huggingface_pytorch_training_version}")
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG
    expected_train_args["enable_sagemaker_metrics"] = False

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args
Example #19
def test_huggingface_training(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_latest_version,
    huggingface_pytorch_latest_version,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")

        hf = HuggingFace(
            py_version="py36",
            entry_point="examples/text-classification/run_glue.py",
            role="SageMakerRole",
            transformers_version=huggingface_training_latest_version,
            pytorch_version=huggingface_pytorch_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters={
                "model_name_or_path": "distilbert-base-cased",
                "task_name": "wnli",
                "do_train": True,
                "do_eval": True,
                "max_seq_length": 128,
                "fp16": True,
                "per_device_train_batch_size": 128,
                "output_dir": "/opt/ml/model",
            },
            sagemaker_session=sagemaker_session,
            git_config={
                "repo": "https://github.com/huggingface/transformers.git",
                "branch": f"v{huggingface_training_latest_version}",
            },
            disable_profiler=True,
        )

        train_input = hf.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train",
        )

        hf.fit(train_input)
Example #20
def test_huggingface_pytorch(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_compiler_latest_version,
    huggingface_training_compiler_pytorch_latest_version,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")

        hf = HuggingFace(
            py_version="py38",
            entry_point=os.path.join(data_path, "run_glue.py"),
            role="SageMakerRole",
            transformers_version=huggingface_training_compiler_latest_version,
            pytorch_version=huggingface_training_compiler_pytorch_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters={
                "model_name_or_path": "distilbert-base-cased",
                "task_name": "wnli",
                "do_train": True,
                "do_eval": True,
                "max_seq_length": 128,
                "fp16": True,
                "per_device_train_batch_size": 128,
                "output_dir": "/opt/ml/model",
            },
            environment={"GPU_NUM_DEVICES": "1"},
            sagemaker_session=sagemaker_session,
            disable_profiler=True,
            compiler_config=TrainingCompilerConfig(),
        )

        train_input = hf.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train",
        )

        hf.fit(train_input)
Example #21
def test_attach(
    sagemaker_session, huggingface_training_version, huggingface_pytorch_training_version
):
    training_image = (
        f"1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:"
        f"{huggingface_pytorch_training_version}-"
        f"transformers{huggingface_training_version}-gpu-py36-cu110-ubuntu18.04"
    )
    returned_job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-east-1"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description
    )

    estimator = HuggingFace.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.py_version == "py36"
    assert estimator.framework_version == huggingface_training_version
    assert estimator.pytorch_version == huggingface_pytorch_training_version
    assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
    assert estimator.instance_count == 1
    assert estimator.max_run == 24 * 60 * 60
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == "s3://place/output/neo"
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "iris-dnn-classifier.py"
Example #22
    def create_estimator(self, instance_count=1):
        # creates estimator
        return HuggingFace(
            entry_point=self.script,
            source_dir=self.env.test_path,
            role=self.env.role,
            image_uri=self.env.image_uri,
            base_job_name=f"{self.env.base_job_name}-single",
            instance_count=instance_count,
            instance_type=self.instance_type,
            debugger_hook_config=False,
            hyperparameters={**self.env.hyperparameters, "model_name_or_path": self.model_name_or_path},
            metric_definitions=self.env.metric_definitions,
            py_version="py36",
        )

def test_unsupported_framework_mxnet(huggingface_training_compiler_version):
    with pytest.raises(ValueError):
        HuggingFace(
            py_version="py38",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            mxnet_version=".".join(
                ["99"] *
                len(huggingface_training_compiler_version.split("."))),
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()

def test_unsupported_python_2(
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    with pytest.raises(ValueError):
        HuggingFace(
            py_version="py27",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            pytorch_version=huggingface_training_compiler_pytorch_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()

    def create_estimator(self, instance_count):

        # configuration for running training on smdistributed Model Parallel
        mpi_options = {
            "enabled": True,
            "processes_per_host": 8,
        }
        smp_options = {
            "enabled": True,
            "parameters": {
                "microbatches": 4,
                "placement_strategy": "spread",
                "pipeline": "interleaved",
                "optimize": "speed",
                "partitions": 4,
                "ddp": True,
            },
        }

        distribution = {
            "smdistributed": {
                "modelparallel": smp_options
            },
            "mpi": mpi_options
        }

        name_extension = "trainer" if self.script == "run_glue.py" else "smtrainer"
        # creates estimator
        return HuggingFace(
            entry_point=self.script,
            source_dir=self.env.test_path,
            role=self.env.role,
            image_uri=self.env.image_uri,
            base_job_name=f"{self.env.base_job_name}-{instance_count}-smp-{name_extension}",
            instance_count=instance_count,
            instance_type=self.instance_type,
            debugger_hook_config=False,
            hyperparameters={
                **self.env.hyperparameters,
                "model_name_or_path": self.model_name_or_path,
                "max_steps": 500,
            },
            metric_definitions=self.env.metric_definitions,
            distribution=distribution,
            py_version="py36",
        )

def test_unsupported_BYOC(
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    byoc = (
        "1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:"
        "1.9.0-"
        "transformers4.10.2-gpu-"
        "py38-cu111-ubuntu20.04")
    with pytest.raises(ValueError):
        HuggingFace(
            image_uri=byoc,
            py_version="py38",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            pytorch_version=huggingface_training_compiler_pytorch_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()
Example #27
def _huggingface_estimator(
    sagemaker_session,
    framework_version,
    pytorch_version,
    tensorflow_version,
    py_version,
    instance_type=None,
    base_job_name=None,
    **kwargs,
):
    return HuggingFace(
        entry_point=SCRIPT_PATH,
        framework_version=framework_version,
        py_version=py_version,
        pytorch_version=pytorch_version,
        tensorflow_version=tensorflow_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=instance_type if instance_type else INSTANCE_TYPE,
        base_job_name=base_job_name,
        **kwargs,
    )
Example #28
def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
    if not is_sagemaker_available():
        raise ImportError(
            "Please install sagemaker to be able to launch training on Amazon SageMaker with `pip install accelerate[sagemaker]`"
        )
    from sagemaker.huggingface import HuggingFace

    # configure environment
    print("Configuring Amazon SageMaker environment")
    os.environ["AWS_DEFAULT_REGION"] = sagemaker_config.region

    # configure credentials
    if sagemaker_config.profile is not None:
        os.environ["AWS_PROFILE"] = sagemaker_config.profile
    elif args.aws_access_key_id is not None and args.aws_secret_access_key is not None:
        os.environ["AWS_ACCESS_KEY_ID"] = args.aws_access_key_id
        os.environ["AWS_SECRET_ACCESS_KEY"] = args.aws_secret_access_key
    else:
        raise EnvironmentError(
            "You need to provide an aws_access_key_id and aws_secret_access_key when not using aws_profile"
        )

    # extract needed arguments
    source_dir = os.path.dirname(args.training_script)
    if not source_dir:  # checks if string is empty
        source_dir = "."
    entry_point = os.path.basename(args.training_script)
    if not entry_point.endswith(".py"):
        raise ValueError(
            f'Your training script should be a python script and not "{entry_point}"'
        )

    print("Converting Arguments to Hyperparameters")
    hyperparameters = _convert_nargs_to_dict(args.training_script_args)

    environment = {
        "USE_FP16": args.fp16
    }  # Environment variables to be set for use during training job

    # configure distribution set up
    distribution = None  # TODO: not yet implemented

    # configure session
    print("Creating Estimator")
    huggingface_estimator = HuggingFace(
        entry_point=entry_point,
        source_dir=source_dir,
        role=sagemaker_config.iam_role_name,
        transformers_version="4.4",
        pytorch_version="1.6",
        py_version="py36",
        base_job_name=sagemaker_config.base_job_name,
        instance_count=sagemaker_config.num_machines,
        instance_type=sagemaker_config.ec2_instance_type,
        debugger_hook_config=False,
        distribution=distribution,
        hyperparameters=hyperparameters,
        environment=environment,
    )

    huggingface_estimator.fit()
    print(
        f"You can find your model data at: {huggingface_estimator.model_data}")
# instance configurations
# instance_type = "ml.p3.16xlarge"
instance_type = "ml.p3.2xlarge"
# instance_type = "ml.p3dn.24xlarge"
instance_count = 1
# volume_size=200

image_uri = "570106654206.dkr.ecr.us-east-1.amazonaws.com/keras-smddp-private-preview:tf-2-4-1-hf-keras-05-27-06-37-16-a10645a1"

huggingface_estimator = HuggingFace(
    # distributed script,
    entry_point="train.py",
    # single_node script,
    # entry_point="singe_node_train.py",
    source_dir="./scripts",
    instance_type=instance_type,
    role=role,
    sagemaker_session=sess,
    instance_count=instance_count,
    image_uri=image_uri,
    # transformers_version="4.5.0",
    # tensorflow_version="2.4.1",
    py_version="py37",
    distribution=distribution,
    hyperparameters=hyperparameters,
    base_job_name="hf-tf-bert-" + str(instance_count) + "node-" +
    instance_type.replace(".", "-"),
    debugger_hook_config=False,  # currently needed
)
huggingface_estimator.fit()
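
The notebook-style excerpt above assumes sess, role, distribution, and hyperparameters were defined earlier in the notebook. A minimal sketch of plausible definitions, with the hyperparameter values assumed for illustration:

# Assumed notebook setup for the script above; hyperparameter values are illustrative.
import sagemaker

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
hyperparameters = {"epochs": 1, "per_device_train_batch_size": 16}
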
def test_distilbert_base(docker_image, processor, instance_type,
                         sagemaker_local_session, py_version):
    from datasets import load_dataset
    from transformers import AutoTokenizer

    # tokenizer used in preprocessing
    tokenizer_name = 'distilbert-base-uncased'

    # dataset used
    dataset_name = 'imdb'

    # s3 key prefix for the data
    s3_prefix = 'samples/datasets/imdb'
    # load dataset
    dataset = load_dataset(dataset_name)

    # download tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # tokenizer helper function
    def tokenize(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True)

    # load dataset
    train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])
    test_dataset = test_dataset.shuffle().select(
        range(100))  # shrink the test set to 100 examples to keep the test fast

    # tokenize dataset
    train_dataset = train_dataset.map(tokenize,
                                      batched=True,
                                      batch_size=len(train_dataset))
    test_dataset = test_dataset.map(tokenize,
                                    batched=True,
                                    batch_size=len(test_dataset))

    # set format for pytorch
    train_dataset.rename_column_("label", "labels")
    train_dataset.set_format('torch',
                             columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset.rename_column_("label", "labels")
    test_dataset.set_format('torch',
                            columns=['input_ids', 'attention_mask', 'labels'])

    # hyperparameters, which are passed into the training job
    hyperparameters = {
        'max_steps': 5,
        'train_batch_size': 4,
        'model_name': 'distilbert-base-uncased'
    }

    s3 = S3FileSystem()

    # save train_dataset to s3
    training_input_path = f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/train'
    train_dataset.save_to_disk(training_input_path, fs=s3)

    # save test_dataset to s3
    test_input_path = f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/test'
    test_dataset.save_to_disk(test_input_path, fs=s3)

    estimator = HuggingFace(entry_point=distrilbert_script,
                            instance_type='local_gpu',
                            sagemaker_session=sagemaker_local_session,
                            image_uri=docker_image,
                            instance_count=1,
                            role=ROLE,
                            py_version=py_version,
                            hyperparameters=hyperparameters)

    estimator.fit({
        'train':
        f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/train',
        'test':
        f's3://{sagemaker_local_session.default_bucket()}/{s3_prefix}/test'
    })