Example No. 1
def test_mxnet_with_rules_and_debugger_hook_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(base_config=rule_configs.all_zero(),
                           rule_parameters={"tensor_regex": ".*"}),
            Rule.sagemaker(rule_configs.loss_not_decreasing()),
        ]
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors"
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()

        for index, rule in enumerate(rules):
            config = job_description["DebugRuleConfigurations"][index]
            assert config["RuleConfigurationName"] == rule.name
            assert config["RuleEvaluatorImage"] == rule.image_uri
            assert config["VolumeSizeInGB"] == 0
            assert (config["RuleParameters"]["rule_to_invoke"] ==
                    rule.rule_parameters["rule_to_invoke"])
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert (job_description["DebugRuleEvaluationStatuses"] ==
                mx.latest_training_job.rule_job_summary())

        _wait_and_assert_that_no_rule_jobs_errored(
            training_job=mx.latest_training_job)
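
Snippets on this page omit their imports. For a test like this one they would typically be along the following lines; the SageMaker SDK modules are public, while timeout, DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, and _wait_and_assert_that_no_rule_jobs_errored are helpers assumed to come from the SDK's integration-test utilities:

import os
import uuid

from sagemaker.debugger import DebuggerHookConfig, Rule, rule_configs
from sagemaker.mxnet import MXNet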
Example No. 2
def _get_custom_rule(session):
    script_path = os.path.join(DATA_DIR, "mxnet_mnist", "my_custom_rule.py")

    return Rule.custom(
        name="test-custom-rule",
        source=script_path,
        rule_to_invoke="CustomGradientRule",
        instance_type="ml.m5.xlarge",
        volume_size_in_gb=30,
        image_uri=CUSTOM_RULE_REPO_WITH_PLACEHOLDERS.format(
            CUSTOM_RULE_CONTAINERS_ACCOUNTS_MAP[session.boto_region_name],
            session.boto_region_name),
    )
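
The rule_to_invoke class is expected to live in the script passed as source. A minimal sketch of such a class, assuming the smdebug Rule base class (the real my_custom_rule.py is not shown on this page):

# my_custom_rule.py -- minimal sketch; gradient-checking logic omitted
from smdebug.rules.rule import Rule


class CustomGradientRule(Rule):
    def __init__(self, base_trial):
        super().__init__(base_trial)

    def invoke_at_step(self, step):
        # Return True to fire the rule at this step; this sketch never fires.
        return False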
Example No. 3
def pca_estimator_with_debug_hook():
    s3_output_location = 's3://sagemaker/models'

    hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/output/debug',
        hook_parameters={
            "save_interval": "1"
        },
        collection_configs=[
            CollectionConfig("hyperparameters"),
            CollectionConfig("metrics")
        ]
    )

    rules = [
        Rule.sagemaker(
            rule_configs.confusion(),
            rule_parameters={
                "category_no": "15",
                "min_diag": "0.7",
                "max_off_diag": "0.3",
                "start_step": "17",
                "end_step": "19",
            },
        )
    ]

    pca = sagemaker.estimator.Estimator(
        PCA_IMAGE,
        role=EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        output_path=s3_output_location,
        debugger_hook_config=hook_config,
        rules=rules
    )

    pca.set_hyperparameters(
        feature_dim=50000,
        num_components=10,
        subtract_mean=True,
        algorithm_mode='randomized',
        mini_batch_size=200
    )

    pca.sagemaker_session = MagicMock()
    pca.sagemaker_session.boto_region_name = 'us-east-1'
    pca.sagemaker_session._default_bucket = 'sagemaker'

    return pca
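
Because the session is mocked, the hook configuration this estimator would submit can be inspected without launching a job. A sketch using the SDK's private _to_request_dict helper; the expected shape below follows the CreateTrainingJob DebugHookConfig field names:

pca = pca_estimator_with_debug_hook()
print(pca.debugger_hook_config._to_request_dict())
# Roughly:
# {
#     "S3OutputPath": "s3://sagemaker/output/debug",
#     "HookParameters": {"save_interval": "1"},
#     "CollectionConfigurations": [
#         {"CollectionName": "hyperparameters"},
#         {"CollectionName": "metrics"},
#     ],
# }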
Example No. 4
def create_model(image: str, hyperparameters: dict, instance_type: str,
                 output_path: str, region_name: str, role: str, s3_train: str,
                 s3_validation: str, job_name: str):
    if image == 'xgboost':
        container = get_image_uri(region_name, image, '0.90-2')
    else:
        container = get_image_uri(region_name, image)
    save_interval = '1'
    model = sagemaker.estimator.Estimator(
        container,
        role=role,
        train_instance_count=1,
        train_instance_type=instance_type,
        train_use_spot_instances=True,
        train_max_run=300,
        train_max_wait=600,
        output_path=output_path,
        debugger_hook_config=DebuggerHookConfig(
            # bucket and prefix are not parameters of this function; they are
            # assumed to be defined at module level in the original source
            s3_output_path=f's3://{bucket}/{prefix}/debug',
            collection_configs=[
                CollectionConfig(name='metrics',
                                 parameters={'save_interval': save_interval}),
                CollectionConfig(name='feature_importance',
                                 parameters={'save_interval': save_interval}),
                CollectionConfig(name='full_shap',
                                 parameters={'save_interval': save_interval}),
                CollectionConfig(name='average_shap',
                                 parameters={'save_interval': save_interval})
            ]),
        rules=[
            Rule.sagemaker(rule_configs.class_imbalance(),
                           rule_parameters={'collection_names': 'metrics'})
        ])
    model.set_hyperparameters(**hyperparameters)
    data_channel = {
        'train': s3_input(s3_train, content_type='text/csv'),
        'validation': s3_input(s3_validation, content_type='text/csv')
    }
    model.fit(data_channel, job_name=job_name)
    return model
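
A call to this helper might look like the following sketch; the bucket, prefix, and role values are placeholders, not part of the original:

# Hypothetical invocation; bucket, prefix, and role must already be defined.
model = create_model(
    image='xgboost',
    hyperparameters={'max_depth': '5', 'objective': 'binary:logistic', 'num_round': '10'},
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    region_name='us-east-1',
    role=role,
    s3_train=f's3://{bucket}/{prefix}/data/train.csv',
    s3_validation=f's3://{bucket}/{prefix}/data/validation.csv',
    job_name='xgboost-debugger-demo',
)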
Example No. 5
def test_training_job_with_debugger_and_profiler(
    sagemaker_session,
    pipeline_name,
    role,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.xlarge")

    rules = [
        Rule.sagemaker(rule_configs.vanishing_gradient()),
        Rule.sagemaker(base_config=rule_configs.all_zero(),
                       rule_parameters={"tensor_regex": ".*"}),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ]
    debugger_hook_config = DebuggerHookConfig(
        s3_output_path=
        f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors")

    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    script_path = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    pytorch_estimator = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        rules=rules,
        debugger_hook_config=debugger_hook_config,
    )

    step_train = TrainingStep(
        name="pytorch-train",
        estimator=pytorch_estimator,
        inputs=inputs,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[step_train],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        execution = pipeline.start()
        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=10, max_attempts=60)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()

        assert len(execution_steps) == 1
        assert execution_steps[0].get("FailureReason", "") == ""
        assert execution_steps[0]["StepName"] == "pytorch-train"
        assert execution_steps[0]["StepStatus"] == "Succeeded"

        training_job_arn = execution_steps[0]["Metadata"]["TrainingJob"]["Arn"]
        job_description = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=training_job_arn.split("/")[1])

        for index, rule in enumerate(rules):
            config = job_description["DebugRuleConfigurations"][index]
            assert config["RuleConfigurationName"] == rule.name
            assert config["RuleEvaluatorImage"] == rule.image_uri
            assert config["VolumeSizeInGB"] == 0
            assert (config["RuleParameters"]["rule_to_invoke"] ==
                    rule.rule_parameters["rule_to_invoke"])
        assert job_description[
            "DebugHookConfig"] == debugger_hook_config._to_request_dict()

        assert job_description["ProfilingStatus"] == "Enabled"
        assert job_description["ProfilerConfig"][
            "ProfilingIntervalInMilliseconds"] == 500
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
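
The same job description also carries per-rule evaluation results. A short sketch of reading them, using key names from the DescribeTrainingJob response:

for status in job_description.get("DebugRuleEvaluationStatuses", []):
    # Each entry reports one rule evaluation job, e.g. "InProgress" or "NoIssuesFound".
    print(status["RuleConfigurationName"], status["RuleEvaluationStatus"])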
Example No. 6
debug_hook_config = DebuggerHookConfig(
    s3_output_path=debug_output_path,
    hook_parameters={"save_interval": "1"},
    collection_configs=[
        CollectionConfig("hyperparameters"),
        CollectionConfig("metrics"),
        CollectionConfig("predictions"),
        CollectionConfig("labels"),
        CollectionConfig("feature_importance")
    ])

debug_rules = [
    Rule.sagemaker(rule_configs.confusion(),
                   rule_parameters={
                       "category_no": "15",
                       "min_diag": "0.7",
                       "max_off_diag": "0.3",
                       "start_step": "17",
                       "end_step": "19"
                   })
]

hyperparameters = {
    "max_depth": "10",
    "eta": "0.2",
    "gamma": "1",
    "min_child_weight": "6",
    "silent": "0",
    "objective": "multi:softmax",
    "num_class": "15",
    "num_round": "1"  # TEMP: Hack to make faster
}
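
This snippet only builds the configuration objects. Wiring them into an estimator would look roughly like the sketch below; container, role, and s3_output_location are placeholders, and the parameter names follow SDK v2:

# Hypothetical wiring; container, role, and s3_output_location are placeholders.
xgb = sagemaker.estimator.Estimator(
    container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=s3_output_location,
    debugger_hook_config=debug_hook_config,
    rules=debug_rules,
)
xgb.set_hyperparameters(**hyperparameters)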
Example No. 7
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(base_config=rule_configs.all_zero(),
                           rule_parameters={"tensor_regex": ".*"}),
            ProfilerRule.sagemaker(rule_configs.ProfilerReport(),
                                   name="CustomProfilerReportRule"),
        ]
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors",
        )
        profiler_config = ProfilerConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description[
            "ProfilerConfig"] == profiler_config._to_request_dict()
        assert job_description[
            "DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration[
            "RuleConfigurationName"] == "CustomProfilerReportRule"
        assert profiler_rule_configuration["RuleEvaluatorImage"] == mx.rules[
            0].image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        for index, rule in enumerate(mx.debugger_rules):
            config = job_description["DebugRuleConfigurations"][index]
            assert config["RuleConfigurationName"] == rule.name
            assert config["RuleEvaluatorImage"] == rule.image_uri

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            training_job_name)

        mx.update_profiler(disable_framework_metrics=True)
        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"]["ProfilingParameters"] == {}
Example No. 8
# bucket, prefix, region, and role are assumed to be defined earlier in the notebook
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'xgboost_model')

# print the container image's Uniform Resource Identifier (URI)
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

# train the model by using the xgboost model estimator
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,  # SDK v2 parameter name (train_volume_size is the v1 name)
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())])

# set the hyperparameters for the model
xgb_model.set_hyperparameters(max_depth=5,
                              eta=0.2,
                              gamma=4,
                              min_child_weight=6,
                              subsample=0.7,
                              objective="binary:logistic",
                              num_round=1000)

# configure the input data flow
from sagemaker.session import TrainingInput

train_input = TrainingInput("s3://{}/{}/{}".format(bucket, prefix,
                                                   "data/train.csv"),
                            content_type="csv")  # content_type assumed; the snippet is cut off here