def test_s3_plugin(sagemaker_session, ecr_image, instance_type, region, framework_version):
    """Stress the TensorFlow S3 plugin with frequent checkpoint/model writes.

    Trains the MNIST estimator script with aggressive checkpointing and
    mid-training model export, then asserts the model artifact and the
    expected checkpoint files exist in S3.
    """
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           hyperparameters={
                               # Saving a checkpoint after every 10 steps to hammer the S3 plugin
                               'save-checkpoint-steps': 10,
                               # Reducing throttling for checkpoint and model saving
                               'throttle-secs': 1,
                               # Without the patch training jobs would fail around 100th to
                               # 150th step
                               'max-steps': 200,
                               # Large batch size would result in a larger checkpoint file
                               'batch-size': 1024,
                               # This makes the training job exporting model during training.
                               # Stale model garbage collection will also be performed.
                               'export-model-during-training': True
                           },
                           train_instance_count=1,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True)
    estimator.fit('s3://sagemaker-sample-data-{}/tensorflow/mnist'.format(region),
                  job_name=unique_name_from_base('test-tf-sm-s3-mnist'))
    # Training ran max-steps=200, so the final checkpoint index should be 200.
    _assert_s3_file_exists(region, estimator.model_data)
    _assert_checkpoint_exists(region, estimator.model_dir, 200)
def test_distributed_training_horovod(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version):
    """Run a two-instance Horovod MNIST job via MPI and verify the output.

    After training, every file reported by the model data source must be
    the single packaged artifact ``model.tar.gz``.
    """
    mpi_flags = '-verbose -x orte_base_help_aggregate=0'
    # SageMaker launches Horovod through these MPI-specific hyperparameters.
    mpi_settings = {
        'sagemaker_mpi_enabled': True,
        'sagemaker_mpi_custom_mpi_options': mpi_flags,
        'sagemaker_mpi_num_of_processes_per_host': 1,
    }
    entry_script = os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py')
    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_type=instance_type,
        train_instance_count=2,
        image_name=ecr_image,
        framework_version=framework_version,
        py_version='py3',
        script_mode=True,
        hyperparameters=mpi_settings,
        sagemaker_session=sagemaker_session,
    )
    estimator.fit(job_name=unique_name_from_base('test-tf-horovod'))

    data_source = sagemaker.local.data.get_data_source_instance(
        estimator.model_data, sagemaker_session)
    file_names = [os.path.basename(f) for f in data_source.get_file_list()]
    assert all(name == 'model.tar.gz' for name in file_names)
def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version):
    """Run a small hyperparameter tuning job (2 trials) over the MNIST script.

    The tuner optimizes the 'accuracy' metric scraped from the training
    logs while varying the number of epochs.
    """
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    entry_script = os.path.join(resource_path, 'mnist', 'mnist.py')

    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_type=instance_type,
        train_instance_count=1,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        framework_version=framework_version,
        script_mode=True,
    )

    objective = 'accuracy'
    # Metric value is scraped from the job's log output with this regex.
    metrics = [{'Name': objective, 'Regex': 'accuracy = ([0-9\\.]+)'}]
    ranges = {'epochs': IntegerParameter(1, 2)}
    tuner = HyperparameterTuner(
        estimator,
        objective,
        ranges,
        metrics,
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=20):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, 'mnist', 'data'),
            key_prefix='scriptmode/mnist')
        job_name = unique_name_from_base('test-tf-sm-tuning', max_length=32)
        tuner.fit(inputs, job_name=job_name)
        tuner.wait()
def run_test(sagemaker_session, ecr_image, instance_type, framework_version, test_data,
             record_wrapper_type=None):
    """Launch a Pipe-mode training job against the given S3 test data.

    Args:
        sagemaker_session: SageMaker session used to run the job.
        ecr_image: Training image URI.
        instance_type: EC2 instance type for training.
        framework_version: TensorFlow framework version string.
        test_data: S3 URI of the input dataset streamed via Pipe mode.
        record_wrapper_type: Optional record wrapping (e.g. RecordIO) for
            the input channel; None means no wrapping.
    """
    source_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources', 'pipemode')
    script = os.path.join(source_path, 'pipemode.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True,
                           input_mode='Pipe',
                           hyperparameters={'dimension': DIMENSION})
    # Renamed from `input` to avoid shadowing the builtin of the same name.
    train_input = s3_input(s3_data=test_data,
                           distribution='FullyReplicated',
                           record_wrapping=record_wrapper_type,
                           input_mode='Pipe')
    with timeout(minutes=20):
        estimator.fit(
            {'elizabeth': train_input},
            job_name=unique_name_from_base('test-sagemaker-pipemode'))
def test_distributed_mnist_no_ps(sagemaker_session, ecr_image, instance_type, framework_version):
    """Train MNIST on two instances without a parameter server and check
    that the resulting model artifact exists in S3."""
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    entry_script = os.path.join(resource_path, 'mnist', 'mnist.py')

    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_count=2,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        framework_version=framework_version,
        script_mode=True,
    )

    dataset_uri = estimator.sagemaker_session.upload_data(
        path=os.path.join(resource_path, 'mnist', 'data'),
        key_prefix='scriptmode/mnist')
    estimator.fit(dataset_uri, job_name=unique_name_from_base('test-tf-sm-distributed-mnist'))

    _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
def test_smdebug(sagemaker_session, ecr_image, instance_type, framework_version):
    """Train the smdebug-instrumented MNIST script and verify the model
    artifact is written to S3.

    The smdebug_path hyperparameter directs debugger tensor output to
    the container's local output directory.
    """
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    entry_script = os.path.join(resource_path, 'mnist', 'mnist_smdebug.py')

    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_type=instance_type,
        train_instance_count=1,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        framework_version=framework_version,
        script_mode=True,
        hyperparameters={'smdebug_path': '/opt/ml/output/tensors'},
    )

    dataset_uri = estimator.sagemaker_session.upload_data(
        path=os.path.join(resource_path, 'mnist', 'data'),
        key_prefix='scriptmode/mnist_smdebug')
    estimator.fit(dataset_uri, job_name=unique_name_from_base('test-sagemaker-mnist-smdebug'))

    _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
def test_model_dir_with_training_job_name(sagemaker_session, ecr_image, instance_type, framework_version):
    """Verify model_dir handling inside a tuning job.

    Runs a one-trial HyperparameterTuner; the entry script itself asserts
    that the model_dir passed to it is derived from the training job name,
    so the test passes iff the trial completes successfully.
    """
    # Use separate '..' segments for consistency with the other tests in
    # this file (previously a single '../..' segment).
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'tuning_model_dir', 'entry.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           sagemaker_session=sagemaker_session)
    tuner = HyperparameterTuner(estimator=estimator,
                                objective_metric_name='accuracy',
                                hyperparameter_ranges={'arbitrary_value': IntegerParameter(0, 1)},
                                metric_definitions=[{'Name': 'accuracy', 'Regex': 'accuracy=([01])'}],
                                max_jobs=1,
                                max_parallel_jobs=1)
    # User script has logic to check for the correct model_dir
    tuner.fit(job_name=unique_name_from_base('test-tf-model-dir', max_length=32))
    tuner.wait()