def test_create_model_with_optional_params(sagemaker_session):
    """create_model should honor role, worker-count, and VPC overrides."""
    log_level = '"logging.INFO"'
    src_dir = 's3://mybucket/source'
    cloudwatch_flag = 'true'
    estimator = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=log_level,
        base_job_name='job',
        source_dir=src_dir,
        enable_cloudwatch_metrics=cloudwatch_flag,
    )
    estimator.fit(inputs='s3://mybucket/train', job_name='new_name')

    # Override every optional knob and check each one lands on the model.
    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = estimator.create_model(
        role=new_role,
        model_server_workers=model_server_workers,
        vpc_config_override=vpc_config,
    )

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
def test_dist_operations_fastai_gpu(sagemaker_session, framework_version, ecr_image):
    """Train the fastai CIFAR example on a multi-GPU instance and verify the model artifact lands in S3."""
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    if Version(image_framework_version) == Version("1.9"):
        pytest.skip("Fast ai is not supported on PyTorch v1.9 ")

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = PyTorch(
            entry_point='train_cifar.py',
            source_dir=os.path.join(fastai_path, 'cifar'),
            role='SageMakerRole',
            instance_count=1,
            instance_type=MULTI_GPU_INSTANCE,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
        )
        # Ensure the default bucket exists before uploading the fixture data.
        estimator.sagemaker_session.default_bucket()
        training_input = estimator.sagemaker_session.upload_data(
            path=os.path.join(fastai_path, 'cifar_tiny', 'training'),
            key_prefix='pytorch/distributed_operations',
        )
        estimator.fit(
            {'training': training_input},
            job_name=utils.unique_name_from_base('test-pt-fastai'),
        )
        model_s3_url = estimator.create_model().model_data
        _assert_s3_file_exists(sagemaker_session.boto_region_name, model_s3_url)
def train_in_sagemaker(role, data_channels: dict, server_source_dir: str,
                       aws_account_id: str, aws_region: str, device: str,
                       debug: bool, hyperparameters: dict):
    """Launch a SageMaker training job using the custom youyakuman image.

    Returns the (estimator, job_name) pair so callers can deploy the model
    or look up the finished job afterwards.
    """
    instance_type, image_version = __get_instance_info(
        device=device, debug=debug, mode="training")

    # Build the ECR URL of the custom training container for this account/region.
    image_url_training = "{}.dkr.ecr.{}.amazonaws.com/youyakuman:{}".format(
        aws_account_id, aws_region, image_version)
    print("image_url : {}".format(image_url_training))

    estimator = PyTorch(
        entry_point="youyakuman_train_and_deploy.py",
        source_dir=server_source_dir,
        role=role,
        framework_version='1.5.0',
        train_instance_count=1,
        train_instance_type=instance_type,
        hyperparameters=hyperparameters,
        image_name=image_url_training,
    )

    # Job names must be unique, so suffix with a second-resolution timestamp.
    date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    job_name = "youyakuman-{}-{}".format(device, date_str)
    print("job_name is {}".format(job_name))
    estimator.fit(data_channels, job_name=job_name)
    return estimator, job_name
def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, dist_backend):
    """Run distributed MNIST training on two instances, then deploy and smoke-test inference."""
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = PyTorch(
            entry_point=mnist_script,
            role='SageMakerRole',
            train_instance_count=2,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            hyperparameters={'backend': dist_backend, 'epochs': 1},
        )
        training_input = estimator.sagemaker_session.upload_data(
            path=training_dir, key_prefix='pytorch/mnist')
        estimator.fit({'training': training_input})

    # Deploy the trained model and check the prediction shape on random input.
    with timeout_and_delete_endpoint(estimator=estimator, minutes=30):
        predictor = estimator.deploy(initial_instance_count=1, instance_type=instance_type)
        batch_size = 100
        data = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        output = predictor.predict(data)
        # MNIST has 10 classes, so expect one logit row per sample.
        assert output.shape == (batch_size, 10)
def test_create_model_with_custom_image(sagemaker_session):
    """create_model should carry the custom image and all estimator settings onto the model."""
    log_level = '"logging.INFO"'
    src_dir = 's3://mybucket/source'
    cloudwatch_flag = 'true'
    custom_image = 'pytorch:9000'
    estimator = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=log_level,
        image_name=custom_image,
        base_job_name='job',
        source_dir=src_dir,
        enable_cloudwatch_metrics=cloudwatch_flag,
    )
    job_name = 'new_name'
    estimator.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = estimator.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == custom_image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == log_level
    assert model.source_dir == src_dir
    assert model.enable_cloudwatch_metrics == cloudwatch_flag
def test_horovod_training(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
    """Run the Horovod MPI training script end-to-end on SageMaker."""
    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "train.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        framework_version=framework_version,
        hyperparameters={
            # sagemaker_mpi_* keys switch the container into MPI launch mode.
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
            "epochs": 1,
        },
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()
def _test_dgl_LT_09x_training(ecr_image, sagemaker_session, instance_type):
    """Run the DGL (< 0.9.x) training script on a single instance as a smoke test."""
    estimator = PyTorch(
        entry_point=DGL_LT_09x_SCRIPT_PATH,
        role="SageMakerRole",
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        # No input channels needed; the script is self-contained.
        unique_job = utils.unique_name_from_base("test-pytorch-dgl-image")
        estimator.fit(job_name=unique_job)
def test_mnist_gpu(sagemaker_session, ecr_image, py_version, dist_gpu_backend):
    """Train MNIST across two multi-GPU instances with the given distributed backend."""
    estimator = PyTorch(
        entry_point=mnist_script,
        role='SageMakerRole',
        train_instance_count=2,
        image_name=ecr_image,
        train_instance_type=MULTI_GPU_INSTANCE,
        sagemaker_session=sagemaker_session,
        hyperparameters={'backend': dist_gpu_backend},
    )
    training_input = sagemaker_session.upload_data(
        path=os.path.join(data_dir, 'training'),
        key_prefix='pytorch/mnist',
    )
    estimator.fit({'training': training_input})
def _test_dist_operations(sagemaker_session, ecr_image, instance_type, dist_backend,
                          train_instance_count=3):
    """Exercise collective ops of the given distributed backend across instances."""
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = PyTorch(
            entry_point=dist_operations_path,
            role='SageMakerRole',
            train_instance_count=train_instance_count,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            hyperparameters={'backend': dist_backend},
        )
        # Ensure the default bucket exists before uploading.
        estimator.sagemaker_session.default_bucket()
        # The script itself is uploaded only to satisfy the required channel;
        # its content is not consumed as training data.
        fake_input = estimator.sagemaker_session.upload_data(
            path=dist_operations_path,
            key_prefix='pytorch/distributed_operations',
        )
        estimator.fit({'required_argument': fake_input})
def test_pytorch(strftime, sagemaker_session, pytorch_version):
    """End-to-end unit test: fit, inspect the train request, create a model, and deploy."""
    estimator = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=pytorch_version,
        py_version=PYTHON_VERSION,
    )

    inputs = 's3://mybucket/train'
    estimator.fit(inputs=inputs)

    # fit() should issue exactly a train call followed by a log poll.
    session_calls = [call[0] for call in sagemaker_session.method_calls]
    assert session_calls == ['train', 'logs_for_job']
    boto_calls = [call[0] for call in sagemaker_session.boto_session.method_calls]
    assert boto_calls == ['resource']

    # The train request must match the expected template with our S3 input.
    expected_train_args = _create_train_job(pytorch_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs
    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = estimator.create_model()
    expected_image_base = (
        '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-gpu-{}')
    expected_container_def = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY':
                's3://mybucket/sagemaker-pytorch-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
        },
        'Image': expected_image_base.format(pytorch_version, PYTHON_VERSION),
        'ModelDataUrl': 's3://m/m.tar.gz',
    }
    assert expected_container_def == model.prepare_container_def(GPU)
    assert 'cpu' in model.prepare_container_def(CPU)['Image']

    predictor = estimator.deploy(1, GPU)
    assert isinstance(predictor, PyTorchPredictor)
def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance_type, py_version, n_virginia_sagemaker_session, tmpdir, test_script, num_processes):
    """
    Tests pt mnist command via script mode
    """
    # NOTE(review): the instance_type parameter is deliberately overridden here,
    # pinning the test to p3.16xlarge regardless of the fixture — confirm intended.
    instance_type = "ml.p3.16xlarge"
    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator = PyTorch(
            entry_point=test_script,
            role='SageMakerRole',
            image_uri=n_virginia_ecr_image,
            source_dir=mnist_path,
            instance_count=2,
            instance_type=instance_type,
            sagemaker_session=n_virginia_sagemaker_session,
            hyperparameters={
                "assert-losses": 1,
                "amp": 1,
                "ddp": 1,
                "data-dir": "data/training",
                "epochs": 5,
            },
            distribution={
                # Model parallelism over 2 partitions with interleaved pipelining.
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True,
                        "parameters": {
                            "partitions": 2,
                            "microbatches": 4,
                            "optimize": "speed",
                            "pipeline": "interleaved",
                            "ddp": True,
                        },
                    }
                },
                "mpi": {
                    "enabled": True,
                    "processes_per_host": num_processes,
                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                },
            },
        )
        estimator.fit(job_name=utils.unique_name_from_base('test-pt-smdmp-multinode'))
def test_horovod_simple(
    instances,
    processes,
    train_instance_type,
    sagemaker_session,
    image_uri,
    framework_version,
    tmpdir,
):
    """Run the simple Horovod script and verify every rank wrote its expected JSON record."""
    default_bucket = sagemaker_session.default_bucket()
    output_path = "s3://" + os.path.join(default_bucket, "pytorch/horovod")

    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "horovod", "simple.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_num_of_processes_per_host": processes,
        },
    )
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()

    # Pull down the model artifact and unpack the per-rank output files.
    bucket, key_prefix = estimator.model_data.replace("s3://", "").split("/", 1)
    sagemaker_session.download_data(path=str(tmpdir), bucket=bucket, key_prefix=key_prefix)
    with tarfile.open(os.path.join(str(tmpdir), "model.tar.gz")) as tar:
        tar.extractall(tmpdir)

    # Each of instances*processes ranks should have produced one JSON file.
    size = instances * processes
    for rank in range(size):
        local_rank = rank % processes
        # The simple.py script should create a JSON file with this name
        filename = "local-rank-%s-rank-%s.json" % (local_rank, rank)
        with open(os.path.join(str(tmpdir), filename)) as file:
            actual = json.load(file)
        expected = {"local-rank": local_rank, "rank": rank, "size": size}
        assert actual == expected
def test_create_model_with_optional_params(sagemaker_session):
    """create_model should honor the role and worker-count overrides."""
    log_level = '"logging.INFO"'
    src_dir = 's3://mybucket/source'
    cloudwatch_flag = 'true'
    estimator = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=log_level,
        base_job_name='job',
        source_dir=src_dir,
        enable_cloudwatch_metrics=cloudwatch_flag,
    )
    estimator.fit(inputs='s3://mybucket/train', job_name='new_name')

    new_role = 'role'
    model_server_workers = 2
    model = estimator.create_model(
        role=new_role, model_server_workers=model_server_workers)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
def test_create_model_with_custom_image(sagemaker_session):
    """create_model should carry the custom image and estimator settings onto the model."""
    log_level = '"logging.INFO"'
    src_dir = 's3://mybucket/source'
    custom_image = 'pytorch:9000'
    estimator = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        container_log_level=log_level,
        image_name=custom_image,
        base_job_name='job',
        source_dir=src_dir,
    )
    job_name = 'new_name'
    estimator.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = estimator.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.image == custom_image
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == log_level
    assert model.source_dir == src_dir
def test_pytorch(strftime, sagemaker_session, pytorch_version):
    """End-to-end unit test: fit, verify the train request, create a model, and deploy."""
    estimator = PyTorch(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=pytorch_version,
        py_version=PYTHON_VERSION,
    )

    inputs = 's3://mybucket/train'
    estimator.fit(inputs=inputs)

    # fit() should issue exactly a train call followed by a log poll.
    session_calls = [call[0] for call in sagemaker_session.method_calls]
    assert session_calls == ['train', 'logs_for_job']
    boto_calls = [call[0] for call in sagemaker_session.boto_session.method_calls]
    assert boto_calls == ['resource']

    # The train request must match the expected template with our S3 input.
    expected_train_args = _create_train_job(pytorch_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs
    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = estimator.create_model()
    expected_image_base = (
        '520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:{}-gpu-{}')
    expected_container_def = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY':
                's3://mybucket/sagemaker-pytorch-{}/source/sourcedir.tar.gz'.format(TIMESTAMP),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
        },
        'Image': expected_image_base.format(pytorch_version, PYTHON_VERSION),
        'ModelDataUrl': 's3://m/m.tar.gz',
    }
    assert expected_container_def == model.prepare_container_def(GPU)
    assert 'cpu' in model.prepare_container_def(CPU)['Image']

    predictor = estimator.deploy(1, GPU)
    assert isinstance(predictor, PyTorchPredictor)