def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    image = 'pytorch:9000'
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level,
                      image_name=image,
                      base_job_name='job',
                      source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'new_name'
    pytorch.fit(inputs='s3://mybucket/train', job_name='new_name')
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics
def train_in_sagemaker(role, data_channels: dict, server_source_dir: str,
                       aws_account_id: str, aws_region: str, device: str,
                       debug: bool, hyperparameters: dict):
    instance_type, image_version = __get_instance_info(device=device, debug=debug, mode="training")

    # create estimator
    image_url_training = "{}.dkr.ecr.{}.amazonaws.com/youyakuman:{}".format(
        aws_account_id, aws_region, image_version)
    print("image_url : {}".format(image_url_training))
    estimator = PyTorch(entry_point="youyakuman_train_and_deploy.py",
                        source_dir=server_source_dir,
                        role=role,
                        framework_version='1.5.0',
                        train_instance_count=1,
                        train_instance_type=instance_type,
                        hyperparameters=hyperparameters,
                        image_name=image_url_training)

    # start training
    date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    job_name = "youyakuman-{}-{}".format(device, date_str)
    print("job_name is {}".format(job_name))
    estimator.fit(data_channels, job_name=job_name)
    return estimator, job_name
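
# The snippet above calls a private helper __get_instance_info() that is not shown in
# this listing. A minimal, hypothetical sketch of what such a helper might return; the
# instance types and image tags below are assumptions for illustration, not the
# original values:
def __get_instance_info(device: str, debug: bool, mode: str):
    if debug:
        # local mode avoids launching a managed instance while debugging
        return "local", "{}-{}-debug".format(mode, device)
    instance_type = "ml.p3.2xlarge" if device == "gpu" else "ml.m5.xlarge"
    return instance_type, "{}-{}".format(mode, device)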
def test_smdataparallel_pt_mnist(
    sagemaker_session,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    job_name = sagemaker.utils.unique_name_from_base("pt-sm-distributed-dataparallel")

    estimator = PyTorch(
        entry_point="mnist_pt.py",
        role="SageMakerRole",
        source_dir=smdataparallel_dir,
        instance_count=2,
        instance_type="ml.p3.16xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit({"training": _upload_training_data(estimator)}, job_name=job_name)
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    image = "pytorch:9000"
    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        image_name=image,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "new_name"
    pytorch.fit(inputs="s3://mybucket/train", job_name="new_name")
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    enable_cloudwatch_metrics = "true"
    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
    )

    pytorch.fit(inputs="s3://mybucket/train", job_name="new_name")

    new_role = "role"
    model_server_workers = 2
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    model = pytorch.create_model(
        role=new_role,
        model_server_workers=model_server_workers,
        vpc_config_override=vpc_config,
        entry_point=SERVING_SCRIPT_FILE,
    )

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
    assert model.entry_point == SERVING_SCRIPT_FILE
def test_attach_wrong_framework(sagemaker_session):
    rjd = {'AlgorithmSpecification':
           {'TrainingInputMode': 'File',
            'TrainingImage': '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4'},
           'HyperParameters':
           {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"'},
           'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
           'ResourceConfig':
           {'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'},
           'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
           'TrainingJobName': 'neo',
           'TrainingJobStatus': 'Completed',
           'OutputDataConfig': {'KmsKeyId': '',
                                'S3OutputPath': 's3://place/output/neo'},
           'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job',
                                                                    return_value=rjd)

    with pytest.raises(ValueError) as error:
        PyTorch.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
def test_training_smdebug(sagemaker_session, framework_version, ecr_image, instance_type):
    hyperparameters = {
        'random_seed': True,
        'num_steps': 50,
        'smdebug_path': '/tmp/ml/output/tensors',
        'epochs': 1,
        'data_dir': training_dir,
    }
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point=smdebug_mnist_script,
            role='SageMakerRole',
            instance_count=1,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
            hyperparameters=hyperparameters,
        )
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input},
                    job_name=utils.unique_name_from_base('test-pt-smdebug-training'))
def test_dist_operations_fastai_gpu(sagemaker_session, framework_version, ecr_image):
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    if Version(image_framework_version) == Version("1.9"):
        pytest.skip("fastai is not supported on PyTorch v1.9")

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point='train_cifar.py',
            source_dir=os.path.join(fastai_path, 'cifar'),
            role='SageMakerRole',
            instance_count=1,
            instance_type=MULTI_GPU_INSTANCE,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
        )
        pytorch.sagemaker_session.default_bucket()
        training_input = pytorch.sagemaker_session.upload_data(
            path=os.path.join(fastai_path, 'cifar_tiny', 'training'),
            key_prefix='pytorch/distributed_operations')
        pytorch.fit({'training': training_input},
                    job_name=utils.unique_name_from_base('test-pt-fastai'))

    model_s3_url = pytorch.create_model().model_data
    _assert_s3_file_exists(sagemaker_session.boto_region_name, model_s3_url)
def test_create_model(sagemaker_session, pytorch_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=pytorch_version,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "new_name"
    pytorch.fit(inputs="s3://mybucket/train", job_name="new_name")
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == pytorch_version
    assert model.py_version == pytorch.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
def test_create_model_with_custom_image(name_from_base, sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    image = "pytorch:9000"
    base_job_name = "job"
    pytorch = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        container_log_level=container_log_level,
        image_uri=image,
        base_job_name=base_job_name,
        source_dir=source_dir,
    )

    pytorch.fit(inputs="s3://mybucket/train", job_name="new_name")

    model_name = "model_name"
    name_from_base.return_value = model_name
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image_uri == image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == model_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir

    name_from_base.assert_called_with(base_job_name)
def test_smdataparallel_throughput(n_virginia_sagemaker_session, framework_version,
                                   n_virginia_ecr_image, instance_types, tmpdir):
    with timeout(minutes=DEFAULT_TIMEOUT):
        validate_or_skip_smdataparallel_efa(n_virginia_ecr_image)
        hyperparameters = {
            "size": 64,
            "num_tensors": 20,
            "iterations": 100,
            "warmup": 10,
            "bucket_size": 25,
            "info": "PT-{}-N{}".format(instance_types, 2)
        }
        distribution = {'smdistributed': {'dataparallel': {'enabled': True}}}
        pytorch = PyTorch(
            entry_point='smdataparallel_throughput.py',
            role='SageMakerRole',
            instance_count=2,
            instance_type=instance_types,
            source_dir=throughput_path,
            sagemaker_session=n_virginia_sagemaker_session,
            image_uri=n_virginia_ecr_image,
            framework_version=framework_version,
            hyperparameters=hyperparameters,
            distribution=distribution
        )
        pytorch.fit()
def test_attach_wrong_framework(sagemaker_session):
    rjd = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0.4",
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        PyTorch.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_version, sagemaker_session, tmpdir):
    """
    Tests SM Distributed DataParallel and ModelParallel single-node via script mode
    This test has been added for SM DataParallelism and ModelParallelism tests for re:Invent.
    TODO: Consider reworking these tests after re:Invent releases are done
    """
    can_run_modelparallel = can_run_smmodelparallel(ecr_image)
    can_run_dataparallel = can_run_smdataparallel(ecr_image)
    if can_run_dataparallel and can_run_modelparallel:
        entry_point = 'smdataparallel_smmodelparallel_mnist_script_mode.sh'
    elif can_run_dataparallel:
        entry_point = 'smdataparallel_mnist_script_mode.sh'
    elif can_run_modelparallel:
        entry_point = 'smmodelparallel_mnist_script_mode.sh'
    else:
        pytest.skip("Neither modelparallel nor dataparallel supports this image, nothing to run")

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=entry_point,
                          role='SageMakerRole',
                          image_uri=ecr_image,
                          source_dir=mnist_path,
                          instance_count=1,
                          instance_type=instance_types,
                          sagemaker_session=sagemaker_session)

        pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name, pytorch)

        pytorch.fit()
def test_horovod_training(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "train.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
            "epochs": 1,
        },
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level,
                      base_job_name='job',
                      source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)
    pytorch.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = pytorch.create_model(role=new_role,
                                 model_server_workers=model_server_workers,
                                 vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, dist_backend):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=mnist_script,
                          role='SageMakerRole',
                          train_instance_count=2,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={'backend': dist_backend, 'epochs': 1})
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})

    with timeout_and_delete_endpoint(estimator=pytorch, minutes=30):
        predictor = pytorch.deploy(initial_instance_count=1, instance_type=instance_type)

        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
def test_smmodelparallel_mnist_multigpu_multinode(ecr_image, instance_type, py_version, sagemaker_session, tmpdir):
    """
    Tests pt mnist command via script mode
    """
    instance_type = "ml.p3.16xlarge"
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if not (Version(image_framework_version) in SpecifierSet(">=1.6,<1.8")) or image_cuda_version != "cu110":
        pytest.skip("Model Parallelism only supports CUDA 11 on PyTorch 1.6 and PyTorch 1.7")

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point='smmodelparallel_pt_mnist_multinode.sh',
                          role='SageMakerRole',
                          image_uri=ecr_image,
                          source_dir=mnist_path,
                          instance_count=2,
                          instance_type=instance_type,
                          sagemaker_session=sagemaker_session)
        pytorch.fit()
def main():
    sagemaker_session = sagemaker.Session()
    stepfunctions.set_stream_logger(level=logging.INFO)

    bucket = 's3://pixiv-image-backet'
    sagemaker_execution_role = 'arn:aws:iam::829044821271:role/service-role/AmazonSageMaker-ExecutionRole-20200412T194702'
    workflow_execution_role = 'arn:aws:iam::829044821271:role/StepFunctionsWorkflowExecutionRole'

    estimator1 = PyTorch(entry_point='train.py',
                         source_dir='projection_discriminator',
                         role=sagemaker_execution_role,
                         framework_version='1.4.0',
                         train_instance_count=2,
                         train_instance_type='ml.m5.2xlarge',
                         hyperparameters={
                             'train_epoch': 1,
                         })

    estimator2 = PyTorch(entry_point='train.py',
                         source_dir='wgan_gp',
                         role=sagemaker_execution_role,
                         framework_version='1.4.0',
                         train_instance_count=2,
                         train_instance_type='ml.m5.2xlarge',
                         hyperparameters={
                             'train_epoch': 1,
                         })

    training_step1 = steps.TrainingStep(state_id='Train Step1',
                                        estimator=estimator1,
                                        data={
                                            'training': bucket,
                                        },
                                        job_name='PD-Train-{0}'.format(uuid.uuid4()))

    training_step2 = steps.TrainingStep(state_id='Train Step2',
                                        estimator=estimator2,
                                        data={
                                            'training': bucket,
                                        },
                                        job_name='PD-Train-{0}'.format(uuid.uuid4()))

    parallel_state = steps.Parallel(state_id='Parallel')
    parallel_state.add_branch(training_step1)
    parallel_state.add_branch(training_step2)

    workflow_definition = steps.Chain([parallel_state])

    workflow = Workflow(
        name='MyTraining-{0}'.format(uuid.uuid4()),
        definition=workflow_definition,
        role=workflow_execution_role,
    )

    workflow.create()
    workflow.execute()
def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_version, sagemaker_session, tmpdir):
    """
    Tests SM Distributed DataParallel and ModelParallel single-node via script mode
    This test has been added for SM DataParallelism and ModelParallelism tests for re:Invent.
    TODO: Consider reworking these tests after re:Invent releases are done
    """
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if not (Version(image_framework_version) in SpecifierSet(">=1.6,<1.8")) or image_cuda_version != "cu110":
        pytest.skip("Model Parallelism only supports CUDA 11 on PyTorch 1.6 and PyTorch 1.7")

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point='smdataparallel_smmodelparallel_mnist_script_mode.sh',
            role='SageMakerRole',
            image_uri=ecr_image,
            source_dir=mnist_path,
            instance_count=1,
            instance_type=instance_types,
            sagemaker_session=sagemaker_session)

        pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name, pytorch)

        pytorch.fit()
def main():
    testloader = download_training_data()

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # For local training a dummy role will be sufficient
    role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

    print('Starting model training')
    print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')
    cifar10_estimator = PyTorch(entry_point='cifar10_pytorch.py',
                                source_dir='./code',
                                role=role,
                                framework_version='1.7.1',
                                py_version='py3',
                                instance_count=1,
                                instance_type='local',
                                hyperparameters={
                                    'epochs': 1,
                                })

    cifar10_estimator.fit('file://./data/')

    print('Deploying local mode endpoint')
    predictor = cifar10_estimator.deploy(initial_instance_count=1, instance_type='local')

    do_inference_on_local_endpoint(predictor, testloader)

    predictor.delete_endpoint(predictor.endpoint)
    predictor.delete_model()
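
# download_training_data() and do_inference_on_local_endpoint() are helpers that are not
# shown in this listing. A minimal, hypothetical sketch of the inference helper, assuming
# a CIFAR-10 classifier behind the local endpoint that accepts a numpy batch and returns
# per-class scores; the batch handling here is illustrative, not the original code:
import numpy as np

def do_inference_on_local_endpoint(predictor, testloader):
    images, labels = next(iter(testloader))           # one batch from the test set
    outputs = predictor.predict(images.numpy())       # invoke the local-mode endpoint
    predicted = np.argmax(np.array(outputs), axis=1)  # highest score per sample
    accuracy = (predicted == labels.numpy()).mean()
    print('Batch accuracy on local endpoint: {:.2%}'.format(accuracy))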
def _test_dist_operations(sagemaker_session, framework_version, ecr_image, instance_type,
                          dist_backend, instance_count=3):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point=dist_operations_path,
            role='SageMakerRole',
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
            hyperparameters={'backend': dist_backend},
        )
        pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name, pytorch)

        pytorch.sagemaker_session.default_bucket()
        fake_input = pytorch.sagemaker_session.upload_data(
            path=dist_operations_path, key_prefix='pytorch/distributed_operations')
        pytorch.fit({'required_argument': fake_input})
def test_train_image_default(sagemaker_session):
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE)

    assert _get_full_cpu_image_uri(defaults.PYTORCH_VERSION, defaults.PYTHON_VERSION) in pytorch.train_image()
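
# _get_full_cpu_image_uri() is a test helper that is not included in this listing. A
# minimal sketch, assuming it builds the same ECR URI pattern that test_pytorch (further
# below) asserts against; the account, region, and repository name are taken from that
# assertion and may differ in the original helper:
IMAGE_URI_FORMAT_STRING = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-{}-{}'

def _get_full_cpu_image_uri(version, py_version):
    # e.g. _get_full_cpu_image_uri('0.4.0', 'py3') ->
    # '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:0.4.0-cpu-py3'
    return IMAGE_URI_FORMAT_STRING.format(version, 'cpu', py_version)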
def _test_dgl_training(sagemaker_session, ecr_image, instance_type):
    dgl = PyTorch(entry_point=DGL_SCRIPT_PATH,
                  role='SageMakerRole',
                  train_instance_count=1,
                  train_instance_type=instance_type,
                  sagemaker_session=sagemaker_session,
                  image_name=ecr_image)

    with timeout(minutes=DEFAULT_TIMEOUT):
        job_name = utils.unique_name_from_base('test-pytorch-dgl-image')
        dgl.fit(job_name=job_name)
def _test_dgl_LT_09x_training(ecr_image, sagemaker_session, instance_type):
    dgl = PyTorch(
        entry_point=DGL_LT_09x_SCRIPT_PATH,
        role="SageMakerRole",
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        job_name = utils.unique_name_from_base("test-pytorch-dgl-image")
        dgl.fit(job_name=job_name)
def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, dist_backend):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=mnist_script,
                          role='SageMakerRole',
                          train_instance_count=2,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={'backend': dist_backend, 'epochs': 1})
        training_input = pytorch.sagemaker_session.upload_data(path=training_dir,
                                                               key_prefix='pytorch/mnist')
        pytorch.fit({'training': training_input})
def _test_dist_operations(sagemaker_session, ecr_image, instance_type, dist_backend,
                          train_instance_count=3):
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point=dist_operations_path,
                          role='SageMakerRole',
                          train_instance_count=train_instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          hyperparameters={'backend': dist_backend})
        pytorch.sagemaker_session.default_bucket()
        fake_input = pytorch.sagemaker_session.upload_data(path=dist_operations_path,
                                                           key_prefix='pytorch/distributed_operations')
        pytorch.fit({'required_argument': fake_input})
def test_mnist_gpu(sagemaker_session, ecr_image, py_version, dist_gpu_backend):
    pytorch = PyTorch(entry_point=mnist_script,
                      role='SageMakerRole',
                      train_instance_count=2,
                      image_name=ecr_image,
                      train_instance_type=MULTI_GPU_INSTANCE,
                      sagemaker_session=sagemaker_session,
                      hyperparameters={'backend': dist_gpu_backend})

    training_input = sagemaker_session.upload_data(path=os.path.join(data_dir, 'training'),
                                                   key_prefix='pytorch/mnist')
    pytorch.fit({'training': training_input})
def test_pt_s3_plugin_sm_cpu(sagemaker_session, framework_version, ecr_image):
    validate_or_skip_s3_plugin(ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point="main.py",
                          source_dir=resnet18_path,
                          image_uri=ecr_image,
                          role='SageMakerRole',
                          instance_count=1,
                          instance_type=CPU_INSTANCE,
                          sagemaker_session=sagemaker_session,
                          framework_version=framework_version)
        job_name = utils.unique_name_from_base('test-pytorch-s3-plugin-cpu')
        pytorch.fit(job_name=job_name)
def test_horovod_simple(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
    default_bucket = sagemaker_session.default_bucket()
    output_path = "s3://" + os.path.join(default_bucket, "pytorch/horovod")
    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "simple.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
        },
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()

    bucket, key_prefix = estimator.model_data.replace("s3://", "").split("/", 1)
    sagemaker_session.download_data(path=str(tmpdir), bucket=bucket, key_prefix=key_prefix)

    with tarfile.open(os.path.join(str(tmpdir), "model.tar.gz")) as tar:
        tar.extractall(tmpdir)

    size = instances * processes
    for rank in range(size):
        local_rank = rank % processes
        # The simple.py script should create a JSON file with this name
        filename = "local-rank-%s-rank-%s.json" % (local_rank, rank)
        with open(os.path.join(str(tmpdir), filename)) as file:
            actual = json.load(file)
            expected = {"local-rank": local_rank, "rank": rank, "size": size}
            assert actual == expected
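
# The simple.py entry point referenced above is not included in this listing. A minimal,
# hypothetical sketch of a script that would satisfy the assertions in test_horovod_simple:
# each MPI process writes one JSON file describing its rank into the model directory so
# the files end up inside model.tar.gz. The env-var fallback path is an assumption.
import json
import os

import horovod.torch as hvd

if __name__ == "__main__":
    hvd.init()  # rank/local_rank/size come from the MPI launcher started by SageMaker
    record = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()}
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")  # packaged into model.tar.gz
    filename = "local-rank-%s-rank-%s.json" % (hvd.local_rank(), hvd.rank())
    with open(os.path.join(model_dir, filename), "w") as f:
        json.dump(record, f)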
def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance_type, py_version,
                                                  n_virginia_sagemaker_session, tmpdir, test_script, num_processes):
    """
    Tests pt mnist command via script mode
    """
    instance_type = "ml.p3.16xlarge"
    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point=test_script,
            role='SageMakerRole',
            image_uri=n_virginia_ecr_image,
            source_dir=mnist_path,
            instance_count=2,
            instance_type=instance_type,
            sagemaker_session=n_virginia_sagemaker_session,
            hyperparameters={
                "assert-losses": 1,
                "amp": 1,
                "ddp": 1,
                "data-dir": "data/training",
                "epochs": 5
            },
            distribution={
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True,
                        "parameters": {
                            "partitions": 2,
                            "microbatches": 4,
                            "optimize": "speed",
                            "pipeline": "interleaved",
                            "ddp": True,
                        },
                    }
                },
                "mpi": {
                    "enabled": True,
                    "processes_per_host": num_processes,
                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                },
            },
        )
        pytorch.fit(job_name=utils.unique_name_from_base('test-pt-smdmp-multinode'))
def test_attach_custom_image(sagemaker_session):
    training_image = 'pytorch:latest'
    returned_job_description = {'AlgorithmSpecification':
                                {'TrainingInputMode': 'File',
                                 'TrainingImage': training_image},
                                'HyperParameters':
                                {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
                                 'sagemaker_program': '"iris-dnn-classifier.py"',
                                 'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
                                 'sagemaker_enable_cloudwatch_metrics': 'false',
                                 'sagemaker_container_log_level': '"logging.INFO"',
                                 'sagemaker_job_name': '"neo"',
                                 'training_steps': '100',
                                 'sagemaker_region': '"us-west-2"'},
                                'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
                                'ResourceConfig':
                                {'VolumeSizeInGB': 30,
                                 'InstanceCount': 1,
                                 'InstanceType': 'ml.c4.xlarge'},
                                'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
                                'TrainingJobName': 'neo',
                                'TrainingJobStatus': 'Completed',
                                'OutputDataConfig': {'KmsKeyId': '',
                                                     'S3OutputPath': 's3://place/output/neo'},
                                'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job',
                                                                    return_value=returned_job_description)

    estimator = PyTorch.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level,
                      base_job_name='job',
                      source_dir=source_dir,
                      enable_cloudwatch_metrics=enable_cloudwatch_metrics)
    pytorch.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    model = pytorch.create_model(role=new_role, model_server_workers=model_server_workers)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    image = 'pytorch:9000'
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      container_log_level=container_log_level,
                      image_name=image,
                      base_job_name='job',
                      source_dir=source_dir)

    job_name = 'new_name'
    pytorch.fit(inputs='s3://mybucket/train', job_name='new_name')
    model = pytorch.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
def test_pytorch(strftime, sagemaker_session, pytorch_version):
    pytorch = PyTorch(entry_point=SCRIPT_PATH,
                      role=ROLE,
                      sagemaker_session=sagemaker_session,
                      train_instance_count=INSTANCE_COUNT,
                      train_instance_type=INSTANCE_TYPE,
                      framework_version=pytorch_version,
                      py_version=PYTHON_VERSION)

    inputs = 's3://mybucket/train'

    pytorch.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ['train', 'logs_for_job']
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(pytorch_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = pytorch.create_model()

    expected_image_base = '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-gpu-{}'
    assert {'Environment':
            {'SAGEMAKER_SUBMIT_DIRECTORY':
             's3://mybucket/sagemaker-pytorch-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
             'SAGEMAKER_PROGRAM': 'dummy_script.py',
             'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
             'SAGEMAKER_REGION': 'us-west-2',
             'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'},
            'Image': expected_image_base.format(pytorch_version, PYTHON_VERSION),
            'ModelDataUrl': 's3://m/m.tar.gz'} == model.prepare_container_def(GPU)

    assert 'cpu' in model.prepare_container_def(CPU)['Image']

    predictor = pytorch.deploy(1, GPU)
    assert isinstance(predictor, PyTorchPredictor)
def test_attach(sagemaker_session, pytorch_version):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-cpu-{}'.format(
        pytorch_version, PYTHON_VERSION)
    returned_job_description = {'AlgorithmSpecification':
                                {'TrainingInputMode': 'File',
                                 'TrainingImage': training_image},
                                'HyperParameters':
                                {'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
                                 'sagemaker_program': '"iris-dnn-classifier.py"',
                                 'sagemaker_s3_uri_training': '"sagemaker-3/integ-test-data/tf_iris"',
                                 'sagemaker_enable_cloudwatch_metrics': 'false',
                                 'sagemaker_container_log_level': '"logging.INFO"',
                                 'sagemaker_job_name': '"neo"',
                                 'training_steps': '100',
                                 'sagemaker_region': '"us-west-2"'},
                                'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
                                'ResourceConfig':
                                {'VolumeSizeInGB': 30,
                                 'InstanceCount': 1,
                                 'InstanceType': 'ml.c4.xlarge'},
                                'StoppingCondition': {'MaxRuntimeInSeconds': 24 * 60 * 60},
                                'TrainingJobName': 'neo',
                                'TrainingJobStatus': 'Completed',
                                'OutputDataConfig': {'KmsKeyId': '',
                                                     'S3OutputPath': 's3://place/output/neo'},
                                'TrainingJobOutput': {'S3TrainingJobOutput': 's3://here/output.tar.gz'}}
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job',
                                                                    return_value=returned_job_description)

    estimator = PyTorch.attach(training_job_name='neo', sagemaker_session=sagemaker_session)
    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == PYTHON_VERSION
    assert estimator.framework_version == pytorch_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'