Example #1
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    image = 'pytorch:9000'
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level,
                      image_name=image,
                      base_job_name='job',
                      source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'new_name'
    pytorch.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics
Example #2
def train_in_sagemaker(role, data_channels: dict, server_source_dir: str,
                       aws_account_id: str, aws_region: str, device: str,
                       debug: bool, hyperparameters: dict):
    instance_type, image_version = __get_instance_info(device=device,
                                                       debug=debug,
                                                       mode="training")

    # create estimator
    image_url_training = "{}.dkr.ecr.{}.amazonaws.com/youyakuman:{}".format(
        aws_account_id, aws_region, image_version)
    print("image_url : {}".format(image_url_training))
    estimator = PyTorch(entry_point="youyakuman_train_and_deploy.py",
                        source_dir=server_source_dir,
                        role=role,
                        framework_version='1.5.0',
                        train_instance_count=1,
                        train_instance_type=instance_type,
                        hyperparameters=hyperparameters,
                        image_name=image_url_training)

    # start to train
    date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    job_name = "youyakuman-{}-{}".format(device, date_str)
    print("job_name is {}".format(job_name))
    estimator.fit(data_channels, job_name=job_name)

    return estimator, job_name
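For context, a minimal usage sketch of the function above; every value below is a hypothetical placeholder, not something defined in this listing:

estimator, job_name = train_in_sagemaker(
    role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder ARN
    data_channels={"training": "s3://my-bucket/train"},   # placeholder channel
    server_source_dir="server/",
    aws_account_id="111122223333",
    aws_region="us-east-1",
    device="gpu",
    debug=False,
    hyperparameters={"epochs": 1},
)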
Example #3
def test_smdataparallel_pt_mnist(
    sagemaker_session,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    job_name = sagemaker.utils.unique_name_from_base(
        "pt-sm-distributed-dataparallel")
    estimator = PyTorch(
        entry_point="mnist_pt.py",
        role="SageMakerRole",
        source_dir=smdataparallel_dir,
        instance_count=2,
        instance_type="ml.p3.16xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        distribution={"smdistributed": {
            "dataparallel": {
                "enabled": True
            }
        }},
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit({"training": _upload_training_data(estimator)},
                      job_name=job_name)
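The _upload_training_data helper is not shown in this listing. A minimal sketch of what it plausibly does, assuming a local MNIST directory and the standard Session.upload_data API:

def _upload_training_data(estimator):
    # Hypothetical helper: push the local training data to the session's
    # default bucket and return the resulting S3 URI.
    return estimator.sagemaker_session.upload_data(
        path="data/mnist",  # assumed local data directory
        key_prefix="pytorch/smdataparallel-mnist",
    )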
Example #4
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    image = "pytorch:9000"
    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        image_name=image,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "new_name"
    pytorch.fit(inputs="s3://mybucket/train", job_name="new_name")
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
Example #5
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    enable_cloudwatch_metrics = "true"
    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
    )

    pytorch.fit(inputs="s3://mybucket/train", job_name="new_name")

    new_role = "role"
    model_server_workers = 2
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model = pytorch.create_model(
        role=new_role,
        model_server_workers=model_server_workers,
        vpc_config_override=vpc_config,
        entry_point=SERVING_SCRIPT_FILE,
    )

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
    assert model.entry_point == SERVING_SCRIPT_FILE
Example #6
def test_attach_wrong_framework(sagemaker_session):
    rjd = {'AlgorithmSpecification':
           {'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4'},
           'HyperParameters':
               {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
                'checkpoint_path': '"s3://other/1508872349"',
                'sagemaker_program': '"iris-dnn-classifier.py"',
                'sagemaker_enable_cloudwatch_metrics': 'false',
                'sagemaker_container_log_level': '"logging.INFO"',
                'training_steps': '100',
                'sagemaker_region': '"us-west-2"'},
           'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
           'ResourceConfig':
               {'VolumeSizeInGB': 30,
                'InstanceCount': 1,
                'InstanceType': 'ml.c4.xlarge'},
           'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
           'TrainingJobName': 'neo',
           'TrainingJobStatus': 'Completed',
           'OutputDataConfig': {'KmsKeyId': '',
                                'S3OutputPath': 's3://place/output/neo'},
           'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', return_value=rjd)

    with pytest.raises(ValueError) as error:
        PyTorch.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
Example #7
def test_training_smdebug(sagemaker_session, framework_version, ecr_image,
                          instance_type):
    hyperparameters = {
        'random_seed': True,
        'num_steps': 50,
        'smdebug_path': '/tmp/ml/output/tensors',
        'epochs': 1,
        'data_dir': training_dir,
    }

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point=smdebug_mnist_script,
            role='SageMakerRole',
            instance_count=1,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
            hyperparameters=hyperparameters,
        )
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        pytorch.fit(
            {'training': training_input},
            job_name=utils.unique_name_from_base('test-pt-smdebug-training'))
Example #8
def test_dist_operations_fastai_gpu(sagemaker_session, framework_version,
                                    ecr_image):
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    if Version(image_framework_version) == Version("1.9"):
        pytest.skip("Fast ai is not supported on PyTorch v1.9 ")

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point='train_cifar.py',
            source_dir=os.path.join(fastai_path, 'cifar'),
            role='SageMakerRole',
            instance_count=1,
            instance_type=MULTI_GPU_INSTANCE,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
        )
        pytorch.sagemaker_session.default_bucket()
        training_input = pytorch.sagemaker_session.upload_data(
            path=os.path.join(fastai_path, 'cifar_tiny', 'training'),
            key_prefix='pytorch/distributed_operations')
        pytorch.fit({'training': training_input},
                    job_name=utils.unique_name_from_base('test-pt-fastai'))

    model_s3_url = pytorch.create_model().model_data
    _assert_s3_file_exists(sagemaker_session.boto_region_name, model_s3_url)
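_assert_s3_file_exists is another helper that is not reproduced here. A plausible sketch using boto3, where Object.load() issues a HEAD request and raises if the key is missing:

def _assert_s3_file_exists(region, s3_url):
    from urllib.parse import urlparse

    import boto3

    parsed = urlparse(s3_url)
    # HeadObject raises botocore.exceptions.ClientError when the file is absent.
    boto3.resource("s3", region_name=region).Object(
        parsed.netloc, parsed.path.lstrip("/")).load()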
Example #9
def test_create_model(sagemaker_session, pytorch_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=pytorch_version,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "new_name"
    pytorch.fit(inputs="s3://mybucket/train", job_name="new_name")
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == pytorch_version
    assert model.py_version == pytorch.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
Example #10
def test_attach_wrong_framework(sagemaker_session):
    rjd = {'AlgorithmSpecification':
           {'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4'},
           'HyperParameters':
               {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
                'checkpoint_path': '"s3://other/1508872349"',
                'sagemaker_program': '"iris-dnn-classifier.py"',
                'sagemaker_enable_cloudwatch_metrics': 'false',
                'sagemaker_container_log_level': '"logging.INFO"',
                'training_steps': '100',
                'sagemaker_region': '"us-west-2"'},
           'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
           'ResourceConfig':
               {'VolumeSizeInGB': 30,
                'InstanceCount': 1,
                'InstanceType': 'ml.c4.xlarge'},
           'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
           'TrainingJobName': 'neo',
           'TrainingJobStatus': 'Completed',
           'OutputDataConfig': {'KmsKeyId': '',
                                'S3OutputPath': 's3://place/output/neo'},
           'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', return_value=rjd)

    with pytest.raises(ValueError) as error:
        PyTorch.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
Example #11
def test_create_model_with_custom_image(name_from_base, sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    image = "pytorch:9000"
    base_job_name = "job"

    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        image_uri=image,
        base_job_name=base_job_name,
        source_dir=source_dir,
    )

    pytorch.fit(inputs="s3://mybucket/train", job_name="new_name")

    model_name = "model_name"
    name_from_base.return_value = model_name
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image_uri == image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == model_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir

    name_from_base.assert_called_with(base_job_name)
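Note the extra name_from_base argument: this test only passes when sagemaker.utils.name_from_base is mocked, presumably with a decorator along these lines (the exact patch target is an assumption):

from unittest.mock import patch

@patch("sagemaker.utils.name_from_base")
def test_create_model_with_custom_image(name_from_base, sagemaker_session):
    ...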
Example #12
def test_smdataparallel_throughput(n_virginia_sagemaker_session, framework_version, n_virginia_ecr_image, instance_types, tmpdir):
    with timeout(minutes=DEFAULT_TIMEOUT):
        validate_or_skip_smdataparallel_efa(n_virginia_ecr_image)
        hyperparameters = {
            "size": 64,
            "num_tensors": 20,
            "iterations": 100,
            "warmup": 10,
            "bucket_size": 25,
            "info": "PT-{}-N{}".format(instance_types, 2)
        }
        distribution = {'smdistributed': {'dataparallel': {'enabled': True}}}
        pytorch = PyTorch(
            entry_point='smdataparallel_throughput.py',
            role='SageMakerRole',
            instance_count=2,
            instance_type=instance_types,
            source_dir=throughput_path,
            sagemaker_session=n_virginia_sagemaker_session,
            image_uri=n_virginia_ecr_image,
            framework_version=framework_version,
            hyperparameters=hyperparameters,
            distribution=distribution
        )
        pytorch.fit()
Example #13
def test_attach_wrong_framework(sagemaker_session):
    rjd = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4",
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        PyTorch.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
Example #14
def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_version, sagemaker_session, tmpdir):
    """
    Tests SM Distributed DataParallel and ModelParallel single-node via script mode
    This test has been added for SM DataParallelism and ModelParallelism tests for re:invent.
    TODO: Consider reworking these tests after re:Invent releases are done
    """
    can_run_modelparallel = can_run_smmodelparallel(ecr_image)
    can_run_dataparallel = can_run_smdataparallel(ecr_image)
    if can_run_dataparallel and can_run_modelparallel:
        entry_point = 'smdataparallel_smmodelparallel_mnist_script_mode.sh'
    elif can_run_dataparallel:
        entry_point = 'smdataparallel_mnist_script_mode.sh'
    elif can_run_modelparallel:
        entry_point = 'smmodelparallel_mnist_script_mode.sh'
    else:
        pytest.skip("Both modelparallel and dataparallel dont support this image, nothing to run")

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=entry_point,
                          role='SageMakerRole',
                          image_uri=ecr_image,
                          source_dir=mnist_path,
                          instance_count=1,
                          instance_type=instance_types,
                          sagemaker_session=sagemaker_session)

        pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name, pytorch)

        pytorch.fit()
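_disable_sm_profiler is not defined in this listing. A plausible sketch, assuming it simply turns the SageMaker profiler off in regions where the service does not offer it:

def _disable_sm_profiler(region, estimator):
    # Hypothetical helper: the profiler is unavailable in some regions,
    # so disable it on the estimator before training there.
    if region in ("cn-north-1", "cn-northwest-1"):  # assumed region list
        estimator.disable_profiler = True
    return estimator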
Example #15
def test_horovod_training(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "train.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
            "epochs": 1,
        },
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()
Example #16
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level,
                      base_job_name='job',
                      source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    pytorch.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = pytorch.create_model(role=new_role,
                                 model_server_workers=model_server_workers,
                                 vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
Example #17
def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type,
                            dist_backend):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=mnist_script,
                          role='SageMakerRole',
                          train_instance_count=2,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={
                              'backend': dist_backend,
                              'epochs': 1
                          })
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})

    with timeout_and_delete_endpoint(estimator=pytorch, minutes=30):
        predictor = pytorch.deploy(initial_instance_count=1,
                                   instance_type=instance_type)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
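timeout_and_delete_endpoint is a test utility that is not reproduced here. A minimal sketch of the contract it implies, assuming the endpoint name can be recovered from the estimator's latest training job:

from contextlib import contextmanager

@contextmanager
def timeout_and_delete_endpoint(estimator, minutes):
    # Hypothetical helper: bound the block with a timeout and make sure the
    # deployed endpoint is deleted even when an assertion fails.
    with timeout(minutes=minutes):
        try:
            yield
        finally:
            estimator.sagemaker_session.delete_endpoint(
                estimator.latest_training_job.name)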
Example #18
def test_smmodelparallel_mnist_multigpu_multinode(ecr_image, instance_type,
                                                  py_version,
                                                  sagemaker_session, tmpdir):
    """
    Tests pt mnist command via script mode
    """
    instance_type = "ml.p3.16xlarge"
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if not (Version(image_framework_version)
            in SpecifierSet(">=1.6,<1.8")) or image_cuda_version != "cu110":
        pytest.skip(
            "Model Parallelism only supports CUDA 11 on PyTorch 1.6 and PyTorch 1.7"
        )

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point='smmodelparallel_pt_mnist_multinode.sh',
                          role='SageMakerRole',
                          image_uri=ecr_image,
                          source_dir=mnist_path,
                          instance_count=2,
                          instance_type=instance_type,
                          sagemaker_session=sagemaker_session)

        pytorch.fit()
Example #19
def main():
    sagemaker_session = sagemaker.Session()
    stepfunctions.set_stream_logger(level=logging.INFO)

    bucket = 's3://pixiv-image-backet'

    sagemaker_execution_role = 'arn:aws:iam::829044821271:role/service-role/AmazonSageMaker-ExecutionRole-20200412T194702'
    workflow_execution_role = 'arn:aws:iam::829044821271:role/StepFunctionsWorkflowExecutionRole'

    estimator1 = PyTorch(entry_point='train.py',
                         source_dir='projection_discriminator',
                         role=sagemaker_execution_role,
                         framework_version='1.4.0',
                         train_instance_count=2,
                         train_instance_type='ml.m5.2xlarge',
                         hyperparameters={
                             'train_epoch': 1,
                         })

    estimator2 = PyTorch(entry_point='train.py',
                         source_dir='wgan_gp',
                         role=sagemaker_execution_role,
                         framework_version='1.4.0',
                         train_instance_count=2,
                         train_instance_type='ml.m5.2xlarge',
                         hyperparameters={
                             'train_epoch': 1,
                         })

    training_step1 = steps.TrainingStep(state_id='Train Step1',
                                        estimator=estimator1,
                                        data={
                                            'training': bucket,
                                        },
                                        job_name='PD-Train-{0}'.format(
                                            uuid.uuid4()))

    training_step2 = steps.TrainingStep(state_id='Train Step2',
                                        estimator=estimator2,
                                        data={
                                            'training': bucket,
                                        },
                                        job_name='PD-Train-{0}'.format(
                                            uuid.uuid4()))

    parallel_state = steps.Parallel(state_id='Parallel')

    parallel_state.add_branch(training_step1)
    parallel_state.add_branch(training_step2)

    workflow_definition = steps.Chain([parallel_state])

    workflow = Workflow(
        name='MyTraining-{0}'.format(uuid.uuid4()),
        definition=workflow_definition,
        role=workflow_execution_role,
    )

    workflow.create()
    workflow.execute()
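workflow.execute() also returns an execution handle; a small sketch of how main() could block on the result instead of returning immediately, assuming the Step Functions Data Science SDK's Execution.get_output:

execution = workflow.execute()
# get_output(wait=True) polls until the state machine reaches a terminal
# state, then returns its output.
print(execution.get_output(wait=True))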
Example #20
def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image,
                                              py_version, sagemaker_session,
                                              tmpdir):
    """
    Tests SM Distributed DataParallel and ModelParallel single-node via script mode
    This test has been added for SM DataParallelism and ModelParallelism tests for re:invent.
    TODO: Consider reworking these tests after re:Invent releases are done
    """
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if not (Version(image_framework_version)
            in SpecifierSet(">=1.6,<1.8")) or image_cuda_version != "cu110":
        pytest.skip(
            "Model Parallelism only supports CUDA 11 on PyTorch 1.6 and PyTorch 1.7"
        )
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point='smdataparallel_smmodelparallel_mnist_script_mode.sh',
            role='SageMakerRole',
            image_uri=ecr_image,
            source_dir=mnist_path,
            instance_count=1,
            instance_type=instance_types,
            sagemaker_session=sagemaker_session)

        pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name,
                                       pytorch)

        pytorch.fit()
Example #21
def main():
    testloader = download_training_data()

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # For local training a dummy role will be sufficient
    role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

    print('Starting model training')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.'
    )
    cifar10_estimator = PyTorch(entry_point='cifar10_pytorch.py',
                                source_dir='./code',
                                role=role,
                                framework_version='1.7.1',
                                py_version='py3',
                                instance_count=1,
                                instance_type='local',
                                hyperparameters={
                                    'epochs': 1,
                                })

    cifar10_estimator.fit('file://./data/')

    print('Deploying local mode endpoint')
    predictor = cifar10_estimator.deploy(initial_instance_count=1,
                                         instance_type='local')

    do_inference_on_local_endpoint(predictor, testloader)

    predictor.delete_endpoint()
    predictor.delete_model()
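do_inference_on_local_endpoint and download_training_data are defined elsewhere in this sample. A minimal sketch of the inference helper, assuming a torchvision-style test loader and a predictor that accepts numpy batches:

def do_inference_on_local_endpoint(predictor, testloader):
    import torch

    # Hypothetical helper: score one batch against the local endpoint
    # and report the accuracy.
    images, labels = next(iter(testloader))
    outputs = torch.tensor(predictor.predict(images.numpy()))
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == labels).sum().item() / labels.size(0)
    print('Accuracy on one test batch: {:.1%}'.format(accuracy))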
Example #22
def _test_dist_operations(sagemaker_session,
                          framework_version,
                          ecr_image,
                          instance_type,
                          dist_backend,
                          instance_count=3):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point=dist_operations_path,
            role='SageMakerRole',
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
            hyperparameters={'backend': dist_backend},
        )

        pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name,
                                       pytorch)

        pytorch.sagemaker_session.default_bucket()
        fake_input = pytorch.sagemaker_session.upload_data(
            path=dist_operations_path,
            key_prefix='pytorch/distributed_operations')
        pytorch.fit({'required_argument': fake_input})
Example #23
def test_train_image_default(sagemaker_session):
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE)

    assert _get_full_cpu_image_uri(
        defaults.PYTORCH_VERSION,
        defaults.PYTHON_VERSION) in pytorch.train_image()
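_get_full_cpu_image_uri mirrors how the SDK builds default framework image URIs. A plausible sketch, reusing the registry account that appears in test_pytorch later in this listing:

def _get_full_cpu_image_uri(version, py_version):
    # Assumed to match the SDK's default image naming for us-west-2.
    return ('520713654638.dkr.ecr.us-west-2.amazonaws.com/'
            'sagemaker-pytorch:{}-cpu-{}'.format(version, py_version))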
Example #24
def _test_dgl_training(sagemaker_session, ecr_image, instance_type):
    dgl = PyTorch(entry_point=DGL_SCRIPT_PATH,
                  role='SageMakerRole',
                  train_instance_count=1,
                  train_instance_type=instance_type,
                  sagemaker_session=sagemaker_session,
                  image_name=ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        job_name = utils.unique_name_from_base('test-pytorch-dgl-image')
        dgl.fit(job_name=job_name)
Example #25
def _test_dgl_LT_09x_training(ecr_image, sagemaker_session, instance_type):
    dgl = PyTorch(
        entry_point=DGL_LT_09x_SCRIPT_PATH,
        role="SageMakerRole",
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        job_name = utils.unique_name_from_base("test-pytorch-dgl-image")
        dgl.fit(job_name=job_name)
Example #26
def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, dist_backend):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=mnist_script,
                          role='SageMakerRole',
                          train_instance_count=2,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={'backend': dist_backend, 'epochs': 1})
        training_input = pytorch.sagemaker_session.upload_data(path=training_dir,
                                                               key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})
Example #27
def _test_dist_operations(sagemaker_session, ecr_image, instance_type, dist_backend, train_instance_count=3):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=dist_operations_path,
                          role='SageMakerRole',
                          train_instance_count=train_instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={'backend': dist_backend})
        pytorch.sagemaker_session.default_bucket()
        fake_input = pytorch.sagemaker_session.upload_data(path=dist_operations_path,
                                                           key_prefix='pytorch/distributed_operations')
        pytorch.fit({'required_argument': fake_input})
Example #28
def test_mnist_gpu(sagemaker_session, ecr_image, py_version, dist_gpu_backend):
    pytorch = PyTorch(entry_point=mnist_script,
                      role='SageMakerRole',
                      train_instance_count=2,
                      image_name=ecr_image,
                      train_instance_type=MULTI_GPU_INSTANCE,
                      sagemaker_session=sagemaker_session,
                      hyperparameters={'backend': dist_gpu_backend})

    training_input = sagemaker_session.upload_data(
        path=os.path.join(data_dir, 'training'),
        key_prefix='pytorch/mnist')
    pytorch.fit({'training': training_input})
Example #29
def test_pt_s3_plugin_sm_cpu(sagemaker_session, framework_version, ecr_image):
    validate_or_skip_s3_plugin(ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point="main.py",
                          source_dir=resnet18_path,
                          image_uri=ecr_image,
                          role='SageMakerRole',
                          instance_count=1,
                          instance_type=CPU_INSTANCE,
                          sagemaker_session=sagemaker_session,
                          framework_version=framework_version)
        job_name = utils.unique_name_from_base('test-pytorch-s3-plugin-cpu')
        pytorch.fit(job_name=job_name)
Example #30
def test_horovod_simple(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
    default_bucket = sagemaker_session.default_bucket()
    output_path = "s3://" + os.path.join(default_bucket, "pytorch/horovod")

    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "simple.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
        },
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()

    bucket, key_prefix = estimator.model_data.replace("s3://",
                                                      "").split("/", 1)
    sagemaker_session.download_data(path=str(tmpdir),
                                    bucket=bucket,
                                    key_prefix=key_prefix)

    with tarfile.open(os.path.join(str(tmpdir), "model.tar.gz")) as tar:
        tar.extractall(tmpdir)

    size = instances * processes

    for rank in range(size):
        local_rank = rank % processes
        # The simple.py script should create a JSON file with this name
        filename = "local-rank-%s-rank-%s.json" % (local_rank, rank)

        with open(os.path.join(str(tmpdir), filename)) as file:
            actual = json.load(file)
        expected = {"local-rank": local_rank, "rank": rank, "size": size}

        assert actual == expected
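The assertions above pin down what resources/horovod/simple.py has to produce. A minimal sketch of such a script, assuming Horovod's PyTorch bindings and the standard /opt/ml/model output directory (which is what ends up in model.tar.gz):

# hypothetical resources/horovod/simple.py
import json
import os

import horovod.torch as hvd

hvd.init()
filename = "local-rank-%s-rank-%s.json" % (hvd.local_rank(), hvd.rank())
with open(os.path.join("/opt/ml/model", filename), "w") as f:
    json.dump({"local-rank": hvd.local_rank(),
               "rank": hvd.rank(),
               "size": hvd.size()}, f)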
Example #31
def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image,
                                                  instance_type, py_version,
                                                  n_virginia_sagemaker_session,
                                                  tmpdir, test_script,
                                                  num_processes):
    """
    Tests pt mnist command via script mode
    """
    instance_type = "ml.p3.16xlarge"
    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point=test_script,
            role='SageMakerRole',
            image_uri=n_virginia_ecr_image,
            source_dir=mnist_path,
            instance_count=2,
            instance_type=instance_type,
            sagemaker_session=n_virginia_sagemaker_session,
            hyperparameters={
                "assert-losses": 1,
                "amp": 1,
                "ddp": 1,
                "data-dir": "data/training",
                "epochs": 5
            },
            distribution={
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True,
                        "parameters": {
                            "partitions": 2,
                            "microbatches": 4,
                            "optimize": "speed",
                            "pipeline": "interleaved",
                            "ddp": True,
                        },
                    }
                },
                "mpi": {
                    "enabled":
                    True,
                    "processes_per_host":
                    num_processes,
                    "custom_mpi_options":
                    "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                },
            },
        )
        pytorch.fit(
            job_name=utils.unique_name_from_base('test-pt-smdmp-multinode'))
Example #32
def test_attach_custom_image(sagemaker_session):
    training_image = 'pytorch:latest'
    returned_job_description = {'AlgorithmSpecification':
                                {'TrainingInputMode': 'File',
                                 'TrainingImage': training_image},
                                'HyperParameters':
                                    {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
                                     'sagemaker_program': '"iris-dnn-classifier.py"',
                                     'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
                                     'sagemaker_enable_cloudwatch_metrics': 'false',
                                     'sagemaker_container_log_level': '"logging.INFO"',
                                     'sagemaker_job_name': '"neo"',
                                     'training_steps': '100',
                                     'sagemaker_region': '"us-west-2"'},
                                'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
                                'ResourceConfig':
                                    {'VolumeSizeInGB': 30,
                                     'InstanceCount': 1,
                                     'InstanceType': 'ml.c4.xlarge'},
                                'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
                                'TrainingJobName': 'neo',
                                'TrainingJobStatus': 'Completed',
                                'OutputDataConfig': {'KmsKeyId': '',
                                                     'S3OutputPath': 's3://place/output/neo'},
                                'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job',
                                                                    return_value=returned_job_description)

    estimator = PyTorch.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image
Example #33
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    pytorch = PyTorch(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level, base_job_name='job', source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    pytorch.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    model = pytorch.create_model(role=new_role, model_server_workers=model_server_workers)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
Example #34
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    image = 'pytorch:9000'
    pytorch = PyTorch(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level, image_name=image,
                      base_job_name='job', source_dir=source_dir)

    job_name = 'new_name'
    pytorch.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
Example #35
def test_pytorch(strftime, sagemaker_session, pytorch_version):
    pytorch = PyTorch(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                      framework_version=pytorch_version, py_version=PYTHON_VERSION)

    inputs = 's3://mybucket/train'

    pytorch.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(pytorch_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = pytorch.create_model()

    expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-gpu-{}'
    assert {'Environment':
            {'SAGEMAKER_SUBMIT_DIRECTORY':
             's3://mybucket/sagemaker-pytorch-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
             'SAGEMAKER_PROGRAM': 'dummy_script.py',
             'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
             'SAGEMAKER_REGION': 'us-west-2',
             'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'},
            'Image': expected_image_base.format(pytorch_version, PYTHON_VERSION),
            'ModelDataUrl': 's3://m/m.tar.gz'} == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']
    predictor = pytorch.deploy(1, GPU)
    assert isinstance(predictor, PyTorchPredictor)
Example #36
def test_attach(sagemaker_session, pytorch_version):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-cpu-{}'.format(pytorch_version,
                                                                                            PYTHON_VERSION)
    returned_job_description = {'AlgorithmSpecification':
                                {'TrainingInputMode': 'File',
                                 'TrainingImage': training_image},
                                'HyperParameters':
                                    {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
                                     'sagemaker_program': '"iris-dnn-classifier.py"',
                                     'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
                                     'sagemaker_enable_cloudwatch_metrics': 'false',
                                     'sagemaker_container_log_level': '"logging.INFO"',
                                     'sagemaker_job_name': '"neo"',
                                     'training_steps': '100',
                                     'sagemaker_region': '"us-west-2"'},
                                'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
                                'ResourceConfig':
                                    {'VolumeSizeInGB': 30,
                                     'InstanceCount': 1,
                                     'InstanceType': 'ml.c4.xlarge'},
                                'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
                                'TrainingJobName': 'neo',
                                'TrainingJobStatus': 'Completed',
                                'OutputDataConfig': {'KmsKeyId': '',
                                                     'S3OutputPath': 's3://place/output/neo'},
                                'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job',
                                                                    return_value=returned_job_description)

    estimator = PyTorch.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == PYTHON_VERSION
    assert estimator.framework_version == pytorch_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'
Example #37
def test_train_image_default(sagemaker_session):
    pytorch = PyTorch(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    assert _get_full_cpu_image_uri(defaults.PYTORCH_VERSION, defaults.PYTHON_VERSION) in pytorch.train_image()