def __init__(
    self,
    file_system_id,
    file_system_type,
    directory_path,
    num_records,
    feature_dim,
    file_system_access_mode="ro",
    channel="train",
):
    """Build a ``FileSystemRecordSet`` bound to a file-system data source.

    Args:
        file_system_id (str): Amazon file system ID (prefixed with 'fs-').
        file_system_type (str): Type of the backing file system; one of
            'EFS' or 'FSxLustre'.
        directory_path (str): Absolute or normalized path to the root
            directory (mount point) within the file system. See
            https://docs.aws.amazon.com/efs/latest/ug/mounting-fs.html and
            https://docs.aws.amazon.com/efs/latest/ug/wt1-test.html
        num_records (int): Number of records in the set.
        feature_dim (int): Dimensionality of the "values" arrays in each
            Record's features (and label, when Records are labeled).
        file_system_access_mode (str): Read/write permissions, 'ro' or 'rw'.
            Defaults to 'ro'.
        channel (str): The SageMaker Training Job channel this RecordSet
            should be bound to.
    """
    # Wrap the raw file-system parameters in a FileSystemInput for the job.
    self.file_system_input = FileSystemInput(
        file_system_id, file_system_type, directory_path, file_system_access_mode
    )
    self.num_records = num_records
    self.feature_dim = feature_dim
    self.channel = channel
def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Train the MNIST script from an EFS input channel and verify that the
    expected checkpoint artifacts are written to S3."""
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        py_version=PY_VERSION,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )

    # EFS channel carrying the JSON-encoded training data.
    fs_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_efs_id"],
        file_system_type="EFS",
        directory_path=EFS_DIR_PATH,
        content_type="application/json",
    )
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(
            inputs=fs_input, job_name=unique_name_from_base("test-mnist-efs")
        )

    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
Example #3
0
def test_file_system_input_content_type():
    """An explicit content type must surface as ContentType in the config."""
    kwargs = {
        "file_system_id": "fs-0a48d2a1",
        "file_system_type": "FSxLustre",
        "directory_path": "tensorflow",
        "file_system_access_mode": "rw",
        "content_type": "application/json",
    }
    fs_input = FileSystemInput(**kwargs)
    assert fs_input.config == {
        "DataSource": {
            "FileSystemDataSource": {
                "FileSystemId": kwargs["file_system_id"],
                "FileSystemType": kwargs["file_system_type"],
                "DirectoryPath": kwargs["directory_path"],
                "FileSystemAccessMode": "rw",
            }
        },
        "ContentType": kwargs["content_type"],
    }
Example #4
0
def launch_sagemaker_job(
    hyperparameters: Dict[str, Any],
    job_name: str,
    source_dir: str,
    entry_point: str,
    instance_type: str,
    instance_count: int,
    role: str,
    image_name: str,
    fsx_id: str,
    fsx_mount_name: str,
    subnet_ids: List[str],
    security_group_ids: List[str],
) -> None:
    """Launch a Horovod-distributed SageMaker TensorFlow job reading from FSx."""
    assert fsx_mount_name[0] != "/", "fsx_mount_name should not start with a '/'"
    # One Horovod process per GPU on the chosen instance type; an
    # unsupported type raises KeyError here, before anything is launched.
    gpus_by_instance = {
        "ml.p3dn.24xlarge": 8,
        "ml.p3.16xlarge": 8,
        "ml.g4dn.12xlarge": 4,
    }
    mpi_config = {
        "mpi": {
            "enabled": True,
            "processes_per_host": gpus_by_instance[instance_type],
            "custom_mpi_options": "-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none",
        }
    }
    # FSx for Lustre input channel, mounted read/write at its mount name.
    fsx_channel = FileSystemInput(
        file_system_id=fsx_id,
        file_system_type="FSxLustre",
        directory_path=f"/{fsx_mount_name}",
        file_system_access_mode="rw",
    )
    # Build the estimator and launch the job.
    estimator = TensorFlow(
        base_job_name=job_name,
        entry_point=entry_point,
        source_dir=source_dir,
        role=role,
        framework_version="2.1.0",
        py_version="py3",
        hyperparameters=hyperparameters,
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        distributions=mpi_config,
        image_name=image_name,
        subnets=subnet_ids,
        security_group_ids=security_group_ids,
        enable_sagemaker_metrics=True,
        train_max_run=2419200,  # 28 days, in seconds
    )
    estimator.fit(fsx_channel)
Example #5
0
def test_format_string_uri_file_system_input():
    """A FileSystemInput passed to _format_string_uri_input comes back unchanged."""
    fs_input = FileSystemInput(
        file_system_id="fs-fd85e556",
        file_system_type="EFS",
        directory_path="ipinsights",
    )
    assert _Job._format_string_uri_input(fs_input) == fs_input
Example #6
0
def test_file_system_input_type_invalid():
    """Constructing a FileSystemInput with an unknown type raises ValueError."""
    with pytest.raises(ValueError) as excinfo:
        FileSystemInput(
            file_system_id="fs-0a48d2a1",
            file_system_type="ABC",
            directory_path="tensorflow",
        )
    message = "Unrecognized file system type: ABC. Valid values: FSxLustre, EFS."
    assert str(excinfo.value) == message
def test_tuning_tf_lustre(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Run a hyperparameter tuning job reading training data from FSx Lustre
    and check that a best training job is reported."""
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )

    objective = "accuracy"
    tuner = HyperparameterTuner(
        estimator,
        objective,
        {"epochs": IntegerParameter(1, 2)},
        [{"Name": objective, "Regex": "accuracy = ([0-9\\.]+)"}],
        max_jobs=MAX_JOBS,
        max_parallel_jobs=MAX_PARALLEL_JOBS,
    )

    lustre_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_fsx_id"],
        file_system_type="FSxLustre",
        directory_path=FSX_DIR_PATH,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        job_name = unique_name_from_base(
            "test-tuning-tf-script-mode-lustre", max_length=32
        )
        tuner.fit(lustre_input, job_name=job_name)
        time.sleep(15)  # give the tuning job a moment to register before waiting
        tuner.wait()
    best = tuner.best_training_job()
    assert best
Example #8
0
def test_file_system_input_mode_invalid():
    """An unrecognized access mode raises a ValueError with a clear message."""
    with pytest.raises(ValueError) as excinfo:
        FileSystemInput(
            file_system_id="fs-0a48d2a1",
            file_system_type="EFS",
            directory_path="tensorflow",
            file_system_access_mode="p",
        )
    message = "Unrecognized file system access mode: p. Valid values: ro, rw."
    assert str(excinfo.value) == message
Example #9
0
def launch_sagemaker_job(
    job_name: str,
    source_dir: str,
    entry_point: str,
    instance_type: str,
    instance_count: int,
    hyperparameters: Dict[str, Any],
) -> None:
    """Launch a Horovod-distributed SageMaker TensorFlow job reading from FSx.

    Relies on module-level constants (FSX_ID, ROLE, IMAGE_NAME, SUBNETS,
    SECURITY_GROUP_IDS) for the account-specific configuration.
    """
    # One Horovod process per GPU for the supported instance types.
    gpus_by_instance = {
        "ml.p3dn.24xlarge": 8,
        "ml.p3.16xlarge": 8,
        "ml.g4dn.12xlarge": 4,
    }
    mpi_settings = {
        "mpi": {
            "enabled": True,
            "processes_per_host": gpus_by_instance[instance_type],
            "custom_mpi_options": "-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none",
        }
    }
    # Read/write FSx Lustre channel mounted at /fsx.
    lustre_input = FileSystemInput(
        file_system_id=FSX_ID,
        file_system_type="FSxLustre",
        directory_path="/fsx",
        file_system_access_mode="rw",
    )
    # Build the estimator and launch the job.
    estimator = TensorFlow(
        base_job_name=job_name,
        entry_point=entry_point,
        source_dir=source_dir,
        role=ROLE,
        framework_version="2.1.0",
        py_version="py3",
        hyperparameters=hyperparameters,
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        distributions=mpi_settings,
        image_name=IMAGE_NAME,
        subnets=SUBNETS,
        security_group_ids=SECURITY_GROUP_IDS,
        enable_sagemaker_metrics=True,
    )
    estimator.fit(lustre_input)
Example #10
0
def test_file_system_input_default_access_mode():
    """Omitting file_system_access_mode must default the config to read-only."""
    fs_input = FileSystemInput(
        file_system_id="fs-0a48d2a1",
        file_system_type="EFS",
        directory_path="tensorflow",
    )
    assert fs_input.config == {
        "DataSource": {
            "FileSystemDataSource": {
                "FileSystemId": "fs-0a48d2a1",
                "FileSystemType": "EFS",
                "DirectoryPath": "tensorflow",
                "FileSystemAccessMode": "ro",
            }
        }
    }
def test_mnist_efs(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Train the MNIST script from an EFS channel and verify that checkpoint
    file patterns appear under the model directory in S3."""
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )

    # EFS channel carrying the JSON-encoded training data.
    fs_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_efs_id"],
        file_system_type="EFS",
        directory_path=EFS_DIR_PATH,
        content_type="application/json",
    )
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(
            inputs=fs_input, job_name=unique_name_from_base("test-mnist-efs")
        )

    assert_s3_file_patterns_exist(
        sagemaker_session,
        estimator.model_dir,
        [r"model\.ckpt-\d+\.index", r"checkpoint"],
    )
Example #12
0
def test_file_system_input_all_arguments():
    """All explicit constructor arguments should be reflected in the config."""
    fs_input = FileSystemInput(
        file_system_id="fs-0a48d2a1",
        file_system_type="FSxLustre",
        directory_path="tensorflow",
        file_system_access_mode="rw",
    )
    assert fs_input.config == {
        "DataSource": {
            "FileSystemDataSource": {
                "FileSystemId": "fs-0a48d2a1",
                "FileSystemType": "FSxLustre",
                "DirectoryPath": "tensorflow",
                "FileSystemAccessMode": "rw",
            }
        }
    }
def test_mnist_lustre(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Train MNIST from an FSx Lustre channel and verify checkpoint files in S3."""
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )

    lustre_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_fsx_id"],
        file_system_type="FSxLustre",
        directory_path=FSX_DIR_PATH,
    )

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(
            inputs=lustre_input, job_name=unique_name_from_base("test-mnist-lustre")
        )
    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
Example #14
0
region = get_str("echo $(aws configure get region)")
image = str(sys.argv[1])
sess = sage.Session()
image_name=f"{account}.dkr.ecr.{region}.amazonaws.com/{image}"
sagemaker_iam_role = str(sys.argv[2])
num_gpus = 8
num_nodes = 4
instance_type = 'ml.p3.16xlarge'
custom_mpi_cmds = []

job_name = "maskrcnn-{}x{}-{}".format(num_nodes, num_gpus, image)

output_path = 's3://mrcnn-sagemaker/sagemaker_training_release'

lustre_input = FileSystemInput(file_system_id='fs-03f556d03c3c590a2',
                               file_system_type='FSxLustre',
                               directory_path='/fsx',
                               file_system_access_mode='ro')

hyperparams = {"sagemaker_use_mpi": "True",
               "sagemaker_process_slots_per_host": num_gpus,
               "num_gpus":num_gpus,
               "num_nodes": num_nodes,
               "custom_mpi_cmds": custom_mpi_cmds}

estimator = Estimator(image_name, role=sagemaker_iam_role, output_path=output_path,
                      train_instance_count=num_nodes,
                      train_instance_type=instance_type,
                      sagemaker_session=sess,
                      train_volume_size=200,
                      base_job_name=job_name,
                      subnets=['subnet-21ac2f2e'],
Example #15
0
def handler(event, context):
    """Lambda entry point: launch a SageMaker PyTorch training job for a
    stored training configuration, reading data from FSx Lustre.

    Args:
        event (dict): Must contain 'trainId' (str) and 'useSpot' (str);
            any value other than the string "false" (case-insensitive)
            enables spot instances.
        context: Lambda context object (unused).

    Returns:
        dict: {'statusCode': 200, 'body': {'trainingJobName': <job name>}}
    """
    trainId = event['trainId']
    # Same semantics as the original three-statement parse: only the literal
    # string "false" (any case) disables spot.
    useSpot = event['useSpot'].lower() != 'false'
    uniqueId = su.uuid()
    trainingConfigurationClient = bioims.client('training-configuration')
    trainInfo = trainingConfigurationClient.getTraining(trainId)
    embeddingName = trainInfo['embeddingName']
    embeddingInfo = trainingConfigurationClient.getEmbeddingInfo(embeddingName)
    # Pull the training script out of S3 so SageMaker can package it.
    trainScriptBucket = embeddingInfo['modelTrainingScriptBucket']
    trainScriptKey = embeddingInfo['modelTrainingScriptKey']
    localTrainingScript = '/tmp/bioims-training-script.py'
    getS3TextObjectWriteToPath(trainScriptBucket, trainScriptKey,
                               localTrainingScript)
    trainListArtifactKey = bp.getTrainImageListArtifactPath(trainId)
    sagemaker_session = sagemaker.Session()
    sagemaker_bucket = sagemaker_session.default_bucket()
    sagemaker_role = sagemaker.get_execution_role()
    framework_version = '1.6.0'  # PyTorch framework version (was misnamed py_version)
    instance_type = embeddingInfo['trainingInstanceType']
    trainingHyperparameters = embeddingInfo['trainingHyperparameters']
    fsxInfo = getFsxInfo()
    print(fsxInfo)
    directory_path = '/' + fsxInfo['mountName']
    sgIds = [fsxInfo['securityGroup']]
    jobName = 'bioims-' + trainId + '-' + uniqueId
    checkpoint_s3_uri = "s3://" + sagemaker_bucket + "/checkpoints/" + jobName

    file_system_input = FileSystemInput(file_system_id=fsxInfo['fsxId'],
                                        file_system_type='FSxLustre',
                                        directory_path=directory_path,
                                        file_system_access_mode='ro')

    trainingHyperparameters['train_list_file'] = trainListArtifactKey

    # Single estimator definition; only the spot-related arguments differ.
    # This replaces two previously duplicated 15-line constructor blocks
    # that could silently drift apart.
    estimator_kwargs = dict(
        entry_point=localTrainingScript,
        role=sagemaker_role,
        framework_version=framework_version,
        instance_count=1,
        instance_type=instance_type,
        py_version='py36',
        image_name=
        '763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04',
        subnets=fsxInfo['subnetIds'],
        security_group_ids=sgIds,
        hyperparameters=trainingHyperparameters,
        train_use_spot_instances=useSpot,
        checkpoint_s3_uri=checkpoint_s3_uri,
        debugger_hook_config=False,
    )
    if useSpot:
        # Spot training requires caps on total wait and run time.
        estimator_kwargs['train_max_wait'] = 100000
        estimator_kwargs['train_max_run'] = 100000
    estimator = PyTorch(**estimator_kwargs)

    # Record the job name before launch so the configuration store is
    # consistent even if the (asynchronous) fit call fails later.
    trainingConfigurationClient.updateTraining(trainId, 'sagemakerJobName',
                                               jobName)

    # wait=False: return immediately; callers poll status by job name.
    estimator.fit(file_system_input, wait=False, job_name=jobName)

    responseInfo = {'trainingJobName': jobName}

    response = {'statusCode': 200, 'body': responseInfo}

    return response