def test_mnist_distributed(
    sagemaker_session,
    instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Fit the MNIST script on two instances with PARAMETER_SERVER_DISTRIBUTION.

    The local MNIST data directory is uploaded through the estimator's session,
    the estimator is fitted under the default training timeout, and the graph
    and step-0 checkpoint file names are then passed to
    ``assert_s3_files_exist`` together with ``model_dir``.
    """
    tf_estimator = TensorFlow(
        entry_point=SCRIPT,
        role=ROLE,
        instance_count=2,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        distribution=PARAMETER_SERVER_DISTRIBUTION,
        disable_profiler=True,
    )

    local_data_dir = os.path.join(MNIST_RESOURCE_PATH, "data")
    training_data = tf_estimator.sagemaker_session.upload_data(
        path=local_data_dir,
        key_prefix="scriptmode/distributed_mnist",
    )

    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        distributed_job_name = unique_name_from_base("test-tf-sm-distributed")
        tf_estimator.fit(inputs=training_data, job_name=distributed_job_name)

    expected_artifacts = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, tf_estimator.model_dir, expected_artifacts)
def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Fit the MNIST script with its input read from an EFS file system.

    Role, subnet, security groups and the EFS id all come from the
    ``efs_fsx_setup`` fixture mapping. After fitting, the graph and step-0
    checkpoint file names are passed to ``assert_s3_files_exist`` together
    with ``model_dir``.
    """
    tf_estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        py_version=PY_VERSION,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )

    efs_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_efs_id"],
        file_system_type="EFS",
        directory_path=EFS_DIR_PATH,
        content_type="application/json",
    )

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        efs_job_name = unique_name_from_base("test-mnist-efs")
        tf_estimator.fit(inputs=efs_input, job_name=efs_job_name)

    expected_artifacts = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, tf_estimator.model_dir, expected_artifacts)
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Fit KMeans on a record set read from an FSx for Lustre file system.

    Networking and role settings come from the ``efs_fsx_setup`` fixture
    mapping. After fitting, ``model.tar.gz`` is checked (through
    ``assert_s3_files_exist``) against the prefix obtained by stripping the
    last path segment from ``kmeans.model_data``.
    """
    # The whole scenario, including setup and the final check, runs under the timeout.
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator = KMeans(
            role=efs_fsx_setup["role_name"],
            instance_count=INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=K,
            sagemaker_session=sagemaker_session,
            subnets=[efs_fsx_setup["subnet_id"]],
            security_group_ids=efs_fsx_setup["security_group_ids"],
        )

        fsx_records = FileSystemRecordSet(
            file_system_id=efs_fsx_setup["file_system_fsx_id"],
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        estimator.fit(fsx_records, job_name=unique_name_from_base("kmeans-fsx"))

        # Drop the trailing file name so only the containing prefix is checked.
        model_prefix, _artifact_name = estimator.model_data.rsplit("/", 1)
        assert_s3_files_exist(sagemaker_session, model_prefix, ["model.tar.gz"])
def test_mnist(sagemaker_session, instance_type):
    """Fit the MNIST script on a single instance and check artifacts and metrics.

    After fitting, the graph and step-0 checkpoint file names are passed to
    ``assert_s3_files_exist`` together with ``model_dir``, and the training
    job analytics dataframe is required to be non-empty.
    """
    step_rate_metric = {"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}
    tf_estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        py_version=tests.integ.PYTHON_VERSION,
        metric_definitions=[step_rate_metric],
    )

    local_data_dir = os.path.join(MNIST_RESOURCE_PATH, "data")
    training_data = tf_estimator.sagemaker_session.upload_data(
        path=local_data_dir,
        key_prefix="scriptmode/mnist",
    )

    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_job_name = unique_name_from_base("test-tf-sm-mnist")
        tf_estimator.fit(inputs=training_data, job_name=mnist_job_name)

    expected_artifacts = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, tf_estimator.model_dir, expected_artifacts)

    metrics_frame = tf_estimator.training_job_analytics.dataframe()
    assert metrics_frame.size > 0
def test_mnist_with_checkpoint_config(
    sagemaker_session,
    instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Fit the MNIST script with checkpoint settings and environment variables.

    After fitting, the test checks the expected model artifacts and then
    compares the ``CheckpointConfig`` and ``Environment`` entries of the
    described training job with the values passed to the estimator.
    """
    checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format(
        sagemaker_session.default_bucket(), sagemaker_timestamp()
    )
    checkpoint_local_path = "/test/checkpoint/path"
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
        environment=ENV_INPUT,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
    )

    training_job_name = unique_name_from_base("test-tf-sm-mnist")
    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=training_job_name)
    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )

    # remove dataframe assertion to unblock PR build
    # TODO: add independent integration test for `training_job_analytics`

    expected_training_checkpoint_config = {
        "S3Uri": checkpoint_s3_uri,
        "LocalPath": checkpoint_local_path,
    }

    # Describe the job once and read both sections from the same response,
    # instead of issuing a separate describe call per assertion.
    training_job_description = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name
    )
    actual_training_checkpoint_config = training_job_description["CheckpointConfig"]
    actual_training_environment_variable_config = training_job_description["Environment"]

    assert actual_training_checkpoint_config == expected_training_checkpoint_config
    assert actual_training_environment_variable_config == ENV_INPUT
def test_mnist_with_checkpoint_config(sagemaker_session, instance_type, tf_full_version, py_version):
    """Fit the MNIST script with checkpoint settings and verify they were applied.

    After fitting, the test checks the expected model artifacts, requires a
    non-empty training job analytics dataframe, and compares the
    ``CheckpointConfig`` of the described training job with the S3 URI and
    local path given to the estimator.
    """
    # NOTE(review): an earlier definition in this file has the same name; the
    # later definition replaces the earlier one at import time - confirm that
    # only one of them is meant to be kept.
    bucket_name = sagemaker_session.default_bucket()
    checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format(bucket_name, sagemaker_timestamp())
    checkpoint_local_path = "/test/checkpoint/path"

    step_rate_metric = {"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}
    tf_estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=tf_full_version,
        py_version=py_version,
        metric_definitions=[step_rate_metric],
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
    )

    local_data_dir = os.path.join(MNIST_RESOURCE_PATH, "data")
    training_data = tf_estimator.sagemaker_session.upload_data(
        path=local_data_dir,
        key_prefix="scriptmode/mnist",
    )

    training_job_name = unique_name_from_base("test-tf-sm-mnist")
    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        tf_estimator.fit(inputs=training_data, job_name=training_job_name)

    expected_artifacts = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, tf_estimator.model_dir, expected_artifacts)

    metrics_frame = tf_estimator.training_job_analytics.dataframe()
    assert metrics_frame.size > 0

    expected_checkpoint_config = {
        "S3Uri": checkpoint_s3_uri,
        "LocalPath": checkpoint_local_path,
    }
    job_description = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name
    )
    assert job_description["CheckpointConfig"] == expected_checkpoint_config
def test_mnist_lustre(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Fit the MNIST script with its input read from an FSx for Lustre file system.

    Role, subnet, security groups and the FSx id all come from the
    ``efs_fsx_setup`` fixture mapping. After fitting, the graph and step-0
    checkpoint file names are passed to ``assert_s3_files_exist`` together
    with ``model_dir``.
    """
    tf_estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )

    lustre_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_fsx_id"],
        file_system_type="FSxLustre",
        directory_path=FSX_DIR_PATH,
    )

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        lustre_job_name = unique_name_from_base("test-mnist-lustre")
        tf_estimator.fit(inputs=lustre_input, job_name=lustre_job_name)

    expected_artifacts = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, tf_estimator.model_dir, expected_artifacts)