示例#1
0
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Tune a KMeans estimator fed from FSx for Lustre record sets.

    The estimator is configured with the subnet, security groups and role
    exposed by the ``efs_fsx_setup`` fixture. A tuning job is launched over a
    default-channel and a ``"test"``-channel record set, and the test passes
    when the tuner reports a (truthy) best training job.
    """
    fixture_subnets = [efs_fsx_setup.subnet_id]
    fixture_security_groups = efs_fsx_setup.security_group_ids
    execution_role = efs_fsx_setup.role_name

    estimator = KMeans(
        role=execution_role,
        train_instance_count=TRAIN_INSTANCE_COUNT,
        train_instance_type=cpu_instance_type,
        k=K,
        sagemaker_session=sagemaker_session,
        subnets=fixture_subnets,
        security_group_ids=fixture_security_groups,
    )

    search_space = dict(
        extra_center_factor=IntegerParameter(4, 10),
        mini_batch_size=IntegerParameter(10, 100),
        epochs=IntegerParameter(1, 2),
        init_method=CategoricalParameter(["kmeans++", "random"]),
    )

    # Everything from tuner construction onwards runs under the timeout guard.
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name=OBJECTIVE_METRIC_NAME,
            hyperparameter_ranges=search_space,
            objective_type="Minimize",
            max_jobs=MAX_JOBS,
            max_parallel_jobs=MAX_PARALLEL_JOBS,
        )

        # Both record sets point at the same FSx directory; only the channel
        # differs (the first one relies on the constructor's default channel).
        shared_record_args = dict(
            file_system_id=efs_fsx_setup.file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )
        training_set = FileSystemRecordSet(**shared_record_args)
        testing_set = FileSystemRecordSet(channel="test", **shared_record_args)

        tuning_job_name = unique_name_from_base("tune-kmeans-fsx")
        tuner.fit([training_set, testing_set], job_name=tuning_job_name)
        tuner.wait()

        assert tuner.best_training_job()
def test_file_system_record_set_fsx_customized_parameters():
    """Check a FileSystemRecordSet built with explicit access mode and channel.

    An FSxLustre record set created with ``file_system_access_mode="rw"`` and
    ``channel="test"`` must expose those values, together with the record
    count and feature dimension it was given.
    """
    record_count = 1
    dimension = 1

    record_set = FileSystemRecordSet(
        file_system_id="fs-0a48d2a1",
        file_system_type="FSxLustre",
        directory_path="ipinsights",
        num_records=record_count,
        feature_dim=dimension,
        file_system_access_mode="rw",
        channel="test",
    )

    expected_data_source = {
        "DirectoryPath": "ipinsights",
        "FileSystemId": "fs-0a48d2a1",
        "FileSystemType": "FSxLustre",
        "FileSystemAccessMode": "rw",
    }
    expected_config = {"DataSource": {"FileSystemDataSource": expected_data_source}}

    assert record_set.file_system_input.config == expected_config
    assert record_set.num_records == record_count
    assert record_set.feature_dim == dimension
    assert record_set.channel == "test"
def test_file_system_record_set_efs_default_parameters():
    """Check the defaults of a FileSystemRecordSet built without optional args.

    With no access mode or channel supplied, the test expects the input config
    to carry ``"ro"`` as access mode and the channel to be ``"train"``.
    """
    record_count = 1
    dimension = 1

    record_set = FileSystemRecordSet(
        file_system_id="fs-0a48d2a1",
        file_system_type="EFS",
        directory_path="ipinsights",
        num_records=record_count,
        feature_dim=dimension,
    )

    expected_data_source = {
        "DirectoryPath": "ipinsights",
        "FileSystemId": "fs-0a48d2a1",
        "FileSystemType": "EFS",
        "FileSystemAccessMode": "ro",
    }
    expected_config = {"DataSource": {"FileSystemDataSource": expected_data_source}}

    assert record_set.file_system_input.config == expected_config
    assert record_set.num_records == record_count
    assert record_set.feature_dim == dimension
    assert record_set.channel == "train"
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Train KMeans from an FSx for Lustre record set and check its artifact.

    The fixture ``efs_fsx_setup`` is read as a mapping for the role, subnet,
    security groups and FSx file system id. After ``fit`` returns, the prefix
    of ``model_data`` is passed to ``assert_s3_files_exist`` together with the
    file name ``model.tar.gz``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        execution_role = efs_fsx_setup["role_name"]
        fixture_subnets = [efs_fsx_setup["subnet_id"]]
        fixture_security_groups = efs_fsx_setup["security_group_ids"]

        estimator = KMeans(
            role=execution_role,
            instance_count=INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=K,
            sagemaker_session=sagemaker_session,
            subnets=fixture_subnets,
            security_group_ids=fixture_security_groups,
        )

        training_set = FileSystemRecordSet(
            file_system_id=efs_fsx_setup["file_system_fsx_id"],
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        training_job_name = unique_name_from_base("kmeans-fsx")
        estimator.fit(training_set, job_name=training_job_name)

        # Split off the last path component; the two-target unpacking also
        # requires that model_data contains at least one "/".
        artifact_prefix, _artifact_name = estimator.model_data.rsplit("/", 1)
        assert_s3_files_exist(sagemaker_session, artifact_prefix, ["model.tar.gz"])
def test_file_system_record_set_data_channel():
    """Check that ``data_channel`` maps the channel name to the file system input.

    The record set's ``file_system_input`` is replaced by a ``Mock`` so the
    returned mapping can be compared against that exact object under the
    ``"train"`` key.
    """
    record_set = FileSystemRecordSet(
        file_system_id="fs-0a48d2a1",
        file_system_type="EFS",
        directory_path="ipinsights",
        num_records=1,
        feature_dim=1,
    )

    stub_input = Mock()
    record_set.file_system_input = stub_input

    channel_map = record_set.data_channel()

    assert channel_map == {"train": stub_input}
示例#6
0
def test_format_record_set_list_input():
    """Check ``_Job._format_record_set_list_input`` on a list of record sets.

    Two EFS record sets (default channel and ``"validation"``) are formatted;
    the result must hold a ``FileSystemInput`` under both ``"train"`` and
    ``"validation"``.
    """
    common_args = dict(
        file_system_id="fs-fd85e556",
        file_system_type="EFS",
        directory_path="ipinsights",
        feature_dim=1,
    )
    training_set = FileSystemRecordSet(num_records=100, **common_args)
    validation_set = FileSystemRecordSet(
        num_records=20, channel="validation", **common_args
    )

    formatted = _Job._format_record_set_list_input([training_set, validation_set])

    assert isinstance(formatted["train"], FileSystemInput)
    assert isinstance(formatted["validation"], FileSystemInput)
示例#7
0
def test_format_inputs_to_input_config_file_system_record_set():
    """Check ``_Job._format_inputs_to_input_config`` on a single record set.

    The first returned channel must describe the same directory, file system
    id and type the record set was built with, and carry ``"ro"`` as access
    mode.
    """
    fs_id = "fs-0a48d2a1"
    fs_type = "EFS"
    fs_directory = "ipinsights"

    record_set = FileSystemRecordSet(
        file_system_id=fs_id,
        file_system_type=fs_type,
        directory_path=fs_directory,
        num_records=1,
        feature_dim=1,
    )

    channels = _Job._format_inputs_to_input_config(record_set)
    fs_source = channels[0]["DataSource"]["FileSystemDataSource"]

    assert fs_source["DirectoryPath"] == fs_directory
    assert fs_source["FileSystemId"] == fs_id
    assert fs_source["FileSystemType"] == fs_type
    assert fs_source["FileSystemAccessMode"] == "ro"