def test_distributed_training_horovod(sagemaker_session,
                                      instance_type,
                                      image_uri,
                                      tmpdir,
                                      framework_version):
    """Run the Horovod MNIST script with MPI enabled on two instances.

    The estimator is configured through the ``sagemaker_mpi_*``
    hyperparameters (one process per host, custom MPI options). After the
    training job finishes, every file exposed by the data source built from
    ``estimator.model_data`` must be named ``model.tar.gz``.

    ``tmpdir`` is not used by the body; it is kept so the fixture signature
    stays unchanged.
    """
    mpi_options = '-verbose -x orte_base_help_aggregate=0'
    estimator = TensorFlow(
        entry_point=os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py'),
        role='SageMakerRole',
        train_instance_type=instance_type,
        train_instance_count=2,
        image_name=image_uri,
        framework_version=framework_version,
        py_version='py3',
        script_mode=True,
        hyperparameters={
            'sagemaker_mpi_enabled': True,
            'sagemaker_mpi_custom_mpi_options': mpi_options,
            'sagemaker_mpi_num_of_processes_per_host': 1,
        },
        sagemaker_session=sagemaker_session)

    estimator.fit(job_name=unique_name_from_base('test-tf-horovod'))

    model_data_source = sagemaker.local.data.get_data_source_instance(
        estimator.model_data, sagemaker_session)

    # Materialize the listing once so it can be both checked for emptiness
    # and iterated, whatever iterable type get_file_list() returns.
    filenames = list(model_data_source.get_file_list())

    # Without this guard the loop below would pass vacuously when the job
    # produced no artifact at all.
    assert filenames, 'no model artifact found at {}'.format(estimator.model_data)

    for filename in filenames:
        assert os.path.basename(filename) == 'model.tar.gz'
def test_mnist(sagemaker_session, image_uri, instance_type, framework_version):
    """Train ``mnist/mnist.py`` on a single instance in script mode.

    The MNIST data directory is uploaded under the ``scriptmode/mnist`` key
    prefix, the estimator is fitted on it, and the resulting
    ``estimator.model_data`` location is checked with
    ``_assert_s3_file_exists``.
    """
    resources_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    entry_script = os.path.join(resources_dir, 'mnist', 'mnist.py')
    data_dir = os.path.join(resources_dir, 'mnist', 'data')

    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        image_name=image_uri,
        framework_version=framework_version,
        script_mode=True,
        sagemaker_session=sagemaker_session,
    )

    # Upload through the estimator's own session, as the original does.
    training_inputs = estimator.sagemaker_session.upload_data(
        path=data_dir,
        key_prefix='scriptmode/mnist',
    )

    job_name = unique_name_from_base('test-sagemaker-mnist')
    estimator.fit(training_inputs, job_name=job_name)

    region = sagemaker_session.boto_region_name
    _assert_s3_file_exists(region, estimator.model_data)
def test_tuning(sagemaker_session, image_uri, instance_type, framework_version):
    """Run a two-job hyperparameter tuning sweep over ``epochs`` for MNIST.

    The tuner optimizes the ``accuracy`` metric, which is extracted from the
    job output with a regex. Data upload, tuning-job submission and the wait
    for completion all happen inside a 20-minute ``timeout`` context.
    """
    resources_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    entry_script = os.path.join(resources_dir, 'mnist', 'mnist.py')

    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        image_name=image_uri,
        framework_version=framework_version,
        script_mode=True,
        sagemaker_session=sagemaker_session,
    )

    objective_metric_name = 'accuracy'
    accuracy_metric = {
        'Name': objective_metric_name,
        'Regex': 'accuracy = ([0-9\\.]+)',
    }
    epoch_ranges = {'epochs': IntegerParameter(1, 2)}

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name=objective_metric_name,
        hyperparameter_ranges=epoch_ranges,
        metric_definitions=[accuracy_metric],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=20):
        data_dir = os.path.join(resources_dir, 'mnist', 'data')
        training_inputs = estimator.sagemaker_session.upload_data(
            path=data_dir,
            key_prefix='scriptmode/mnist',
        )

        # The generated tuning job name is capped at 32 characters.
        job_name = unique_name_from_base('test-tf-sm-tuning', max_length=32)
        tuner.fit(training_inputs, job_name=job_name)
        tuner.wait()
def test_model_dir_with_training_job_name(sagemaker_session,
                                          image_uri,
                                          instance_type,
                                          framework_version):
    """Launch a one-job tuning run using ``tuning_model_dir/entry.py``.

    The test itself makes no assertions on the result: the entry script is
    responsible for checking that it receives the correct ``model_dir``.
    """
    resources_dir = os.path.join(os.path.dirname(__file__), '../..', 'resources')
    entry_script = os.path.join(resources_dir, 'tuning_model_dir', 'entry.py')

    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_count=1,
        train_instance_type=instance_type,
        image_name=image_uri,
        framework_version=framework_version,
        py_version='py3',
        sagemaker_session=sagemaker_session,
    )

    accuracy_metric = {'Name': 'accuracy', 'Regex': 'accuracy=([01])'}
    value_ranges = {'arbitrary_value': IntegerParameter(0, 1)}

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name='accuracy',
        hyperparameter_ranges=value_ranges,
        metric_definitions=[accuracy_metric],
        max_jobs=1,
        max_parallel_jobs=1,
    )

    # User script has logic to check for the correct model_dir
    job_name = unique_name_from_base('test-tf-model-dir', max_length=32)
    tuner.fit(job_name=job_name)
    tuner.wait()
def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type, framework_version):
    """Train ``mnist/mnist_estimator.py`` on two instances with parameter servers.

    Parameter-server mode is switched on through the
    ``sagemaker_parameter_server_enabled`` hyperparameter. After fitting on
    the ``data-distributed`` set, the test checks for a checkpoint in
    ``estimator.model_dir`` and for the model artifact at
    ``estimator.model_data``.
    """
    resources_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    entry_script = os.path.join(resources_dir, 'mnist', 'mnist_estimator.py')
    data_dir = os.path.join(resources_dir, 'mnist', 'data-distributed')

    ps_hyperparameters = {'sagemaker_parameter_server_enabled': True}
    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        hyperparameters=ps_hyperparameters,
        train_instance_type=instance_type,
        train_instance_count=2,
        image_name=image_uri,
        framework_version=framework_version,
        script_mode=True,
        sagemaker_session=sagemaker_session,
    )

    training_inputs = estimator.sagemaker_session.upload_data(
        path=data_dir,
        key_prefix='scriptmode/mnist-distributed',
    )

    job_name = unique_name_from_base('test-tf-sm-distributed-mnist')
    estimator.fit(training_inputs, job_name=job_name)

    _assert_checkpoint_exists(sagemaker_session.boto_region_name,
                              estimator.model_dir,
                              0)
    _assert_s3_file_exists(sagemaker_session.boto_region_name,
                           estimator.model_data)