# Example 1
def test_mxnet_with_debugger_hook_config(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Run an MXNet MNIST training job with a DebuggerHookConfig and verify it sticks.

    Asserts that the created training job's description echoes back the exact
    hook config that was requested, and that no rule jobs errored.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Debug tensors go to a unique S3 prefix per test run.
        hook_config = DebuggerHookConfig(
            s3_output_path=os.path.join(
                "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()), "tensors"
            )
        )

        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=os.path.join(mnist_dir, "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=hook_config,
        )

        # Upload both data channels under matching integ-test prefixes.
        channels = {
            channel: estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix="integ-test-data/mxnet_mnist/" + channel,
            )
            for channel in ("train", "test")
        }

        estimator.fit(channels)

        description = estimator.latest_training_job.describe()
        assert description["DebugHookConfig"] == hook_config._to_request_dict()

        _wait_and_assert_that_no_rule_jobs_errored(training_job=estimator.latest_training_job)
# Example 2
def test_mxnet_with_rules_and_debugger_hook_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Train MXNet MNIST with Debugger rules plus a hook config and verify both.

    Checks that each requested rule appears (in order) in the training job
    description, that the hook config round-trips exactly, and that the rule
    evaluation statuses match the job summary.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(),
                rule_parameters={"tensor_regex": ".*"},
            ),
            Rule.sagemaker(rule_configs.loss_not_decreasing()),
        ]
        # Debug tensors go to a unique S3 prefix per test run.
        hook_config = DebuggerHookConfig(
            s3_output_path=os.path.join(
                "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()), "tensors"
            )
        )

        mnist_dir = os.path.join(DATA_DIR, "mxnet_mnist")

        estimator = MXNet(
            entry_point=os.path.join(mnist_dir, "mnist_gluon.py"),
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=hook_config,
        )

        # Upload both data channels under matching integ-test prefixes.
        channels = {
            channel: estimator.sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, channel),
                key_prefix="integ-test-data/mxnet_mnist/" + channel,
            )
            for channel in ("train", "test")
        }

        estimator.fit(channels)

        description = estimator.latest_training_job.describe()

        # Every requested rule must be reflected, in order, in the description.
        for index, rule in enumerate(rules):
            rule_config = description["DebugRuleConfigurations"][index]
            assert rule_config["RuleConfigurationName"] == rule.name
            assert rule_config["RuleEvaluatorImage"] == rule.image_uri
            assert rule_config["VolumeSizeInGB"] == 0
            assert (
                rule_config["RuleParameters"]["rule_to_invoke"]
                == rule.rule_parameters["rule_to_invoke"]
            )
        assert description["DebugHookConfig"] == hook_config._to_request_dict()
        assert (
            description["DebugRuleEvaluationStatuses"]
            == estimator.latest_training_job.rule_job_summary()
        )

        _wait_and_assert_that_no_rule_jobs_errored(training_job=estimator.latest_training_job)
def tensorflow_estimator():
    """Build a TensorFlow estimator fixture with a Debugger hook and a mocked session.

    Returns a fully configured ``TensorFlow`` estimator that never contacts AWS:
    its ``sagemaker_session`` is replaced with a ``MagicMock``.
    """
    estimator = TensorFlow(
        entry_point='tf_train.py',
        role=EXECUTION_ROLE,
        framework_version='1.13',
        instance_count=1,
        instance_type='ml.p2.xlarge',
        output_path='s3://sagemaker/models',
        source_dir='s3://sagemaker/source',
        image_uri=TENSORFLOW_IMAGE,
        model_dir=False,
        hyperparameters={
            'training_steps': 1000,
            'evaluation_steps': 100,
            'checkpoint_path': 's3://sagemaker/models/sagemaker-tensorflow/checkpoints',
        },
    )

    estimator.debugger_hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/models/debug'
    )

    # Mock out the session so the fixture performs no AWS calls.
    estimator.sagemaker_session = MagicMock()
    estimator.sagemaker_session.boto_region_name = 'us-east-1'
    estimator.sagemaker_session._default_bucket = 'sagemaker'

    return estimator
# Example 4
def sklearn_preprocessor():
    """Build an SKLearn preprocessor fixture with a Debugger hook and a mocked session."""
    # Mocked session: the fixture must not touch AWS.
    mock_session = MagicMock()
    mock_session.boto_region_name = 'us-east-1'

    preprocessor = SKLearn(
        entry_point='sklearn_abalone_featurizer.py',
        role=SAGEMAKER_EXECUTION_ROLE,
        train_instance_type="ml.c4.xlarge",
        source_dir='s3://sagemaker/source',
        sagemaker_session=mock_session,
    )

    preprocessor.debugger_hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/source/debug'
    )

    return preprocessor
def pca_estimator_with_debug_hook():
    """Build a PCA estimator fixture wired with a Debugger hook config and a confusion rule.

    The returned estimator's session is a ``MagicMock`` so no AWS calls occur.
    """
    # Save every step for the hyperparameters and metrics collections.
    hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/output/debug',
        hook_parameters={"save_interval": "1"},
        collection_configs=[
            CollectionConfig("hyperparameters"),
            CollectionConfig("metrics"),
        ],
    )

    # Built-in confusion-matrix rule evaluated between steps 17 and 19.
    confusion_rule = Rule.sagemaker(
        rule_configs.confusion(),
        rule_parameters={
            "category_no": "15",
            "min_diag": "0.7",
            "max_off_diag": "0.3",
            "start_step": "17",
            "end_step": "19",
        },
    )

    pca = sagemaker.estimator.Estimator(
        PCA_IMAGE,
        role=EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        output_path='s3://sagemaker/models',
        debugger_hook_config=hook_config,
        rules=[confusion_rule],
    )

    pca.set_hyperparameters(
        feature_dim=50000,
        num_components=10,
        subtract_mean=True,
        algorithm_mode='randomized',
        mini_batch_size=200,
    )

    # Mock out the session so the fixture performs no AWS calls.
    pca.sagemaker_session = MagicMock()
    pca.sagemaker_session.boto_region_name = 'us-east-1'
    pca.sagemaker_session._default_bucket = 'sagemaker'

    return pca
# Example 6
    def _validate_and_set_debugger_configs(self):
        """Disable Debugger Hook Config for ParameterServer (PS) as it is not supported in smdebug.

        Else, set default HookConfig
        """
        ps_enabled = self.distribution.get("parameter_server", {}).get("enabled", False)
        if ps_enabled:
            # PS training cannot run with Debugger: drop any user-supplied
            # configuration and log a notice.
            if self.debugger_hook_config is not None or self.debugger_rule_configs is not None:
                logger.info(
                    "Amazon SageMaker Debugger does not currently support "
                    "Parameter Server distribution"
                )
            self.debugger_hook_config = None
            self.debugger_rule_configs = None
        elif self.debugger_hook_config is None and fw._region_supports_debugger(
            self.sagemaker_session.boto_session.region_name
        ):
            # Default: emit debug tensors alongside the job's output artifacts.
            self.debugger_hook_config = DebuggerHookConfig(s3_output_path=self.output_path)
def create_model(image: str, hyperparameters: dict, instance_type: str,
                 output_path: str, region_name: str, role: str, s3_train: str,
                 s3_validation: str, job_name: str):
    """Build and fit a spot-instance SageMaker estimator with Debugger collections.

    Resolves the container image, enables the metrics/SHAP Debugger collections
    with a one-step save interval plus a class-imbalance rule, then launches
    training on the given train/validation CSV channels and returns the estimator.
    """
    # xgboost needs an explicit repo version; other algorithms use the default.
    if image == 'xgboost':
        container = get_image_uri(region_name, image, '0.90-2')
    else:
        container = get_image_uri(region_name, image)

    save_interval = '1'
    # One CollectionConfig per Debugger collection, all saved every step.
    debug_collections = [
        CollectionConfig(name=collection_name,
                         parameters={'save_interval': save_interval})
        for collection_name in ('metrics', 'feature_importance',
                                'full_shap', 'average_shap')
    ]

    model = sagemaker.estimator.Estimator(
        container,
        role=role,
        train_instance_count=1,
        train_instance_type=instance_type,
        train_use_spot_instances=True,
        train_max_run=300,
        train_max_wait=600,
        output_path=output_path,
        debugger_hook_config=DebuggerHookConfig(
            s3_output_path=f's3://{bucket}/{prefix}/debug',
            collection_configs=debug_collections,
        ),
        rules=[
            Rule.sagemaker(rule_configs.class_imbalance(),
                           rule_parameters={'collection_names': 'metrics'})
        ],
    )
    model.set_hyperparameters(**hyperparameters)

    channels = {
        'train': s3_input(s3_train, content_type='text/csv'),
        'validation': s3_input(s3_validation, content_type='text/csv'),
    }
    model.fit(channels, job_name=job_name)
    return model
# Example 8
    def _validate_and_set_debugger_configs(self):
        """
        Disable Debugger Hook Config for PS and Horovod as they are not
        supported in smdebug 0.4.13, the current latest version of smdebug

        Else, set default HookConfig
        """
        ps_enabled = self.distributions.get("parameter_server", {}).get("enabled", False)
        mpi_enabled = self.distributions.get("mpi", {}).get("enabled", False)
        if ps_enabled or mpi_enabled:
            # PS/MPI training cannot run with Debugger: drop any user-supplied
            # configuration and log a notice.
            if self.debugger_hook_config is not None or self.debugger_rule_configs is not None:
                logger.info(
                    "Amazon SageMaker Debugger does not currently support "
                    "Parameter Server and MPI distributions"
                )
            self.debugger_hook_config = None
            self.debugger_rule_configs = None
        elif self.debugger_hook_config is None:
            # Default: emit debug tensors alongside the job's output artifacts.
            self.debugger_hook_config = DebuggerHookConfig(s3_output_path=self.output_path)
# Example 9
def linear_learner_estimator():
    """Build a Linear Learner estimator fixture with a Debugger hook and a mocked session."""
    # Mocked session: the fixture must not touch AWS.
    mock_session = MagicMock()
    mock_session.boto_region_name = 'us-east-1'

    estimator = sagemaker.estimator.Estimator(
        LINEAR_LEARNER_IMAGE,
        SAGEMAKER_EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        train_volume_size=20,
        train_max_run=3600,
        input_mode='File',
        output_path='s3://sagemaker/models',
        sagemaker_session=mock_session,
    )

    estimator.debugger_hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/models/debug'
    )

    estimator.set_hyperparameters(
        feature_dim=10, predictor_type='regressor', mini_batch_size=32
    )

    return estimator
# Example 10
def test_training_job_with_debugger_and_profiler(
    sagemaker_session,
    pipeline_name,
    role,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    """Run a Pipelines TrainingStep with Debugger rules/hook and verify profiling.

    Creates a one-step pipeline around a PyTorch MNIST estimator, executes it,
    then asserts that the resulting training job carries the requested rule and
    hook configurations and that default profiling (500 ms interval) is enabled.
    The pipeline is always deleted in the ``finally`` block.
    """
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    rules = [
        Rule.sagemaker(rule_configs.vanishing_gradient()),
        Rule.sagemaker(
            base_config=rule_configs.all_zero(),
            rule_parameters={"tensor_regex": ".*"},
        ),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ]
    # Debug tensors go to a unique S3 prefix per test run.
    hook_config = DebuggerHookConfig(
        s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors"
    )

    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    training_data = TrainingInput(
        s3_data=sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "training"),
            key_prefix="integ-test-data/pytorch_mnist/training",
        )
    )

    estimator = PyTorch(
        entry_point=os.path.join(mnist_dir, "mnist.py"),
        role="SageMakerRole",
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        rules=rules,
        debugger_hook_config=hook_config,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[
            TrainingStep(name="pytorch-train", estimator=estimator, inputs=training_data)
        ],
        sagemaker_session=sagemaker_session,
    )

    try:
        create_arn = pipeline.create(role)["PipelineArn"]

        execution = pipeline.start()
        assert execution.describe()["PipelineArn"] == create_arn

        # Tolerate a waiter timeout; the step assertions below decide pass/fail.
        try:
            execution.wait(delay=10, max_attempts=60)
        except WaiterError:
            pass
        steps = execution.list_steps()

        assert len(steps) == 1
        assert steps[0].get("FailureReason", "") == ""
        assert steps[0]["StepName"] == "pytorch-train"
        assert steps[0]["StepStatus"] == "Succeeded"

        # The job name is the last path element of the training job ARN.
        job_arn = steps[0]["Metadata"]["TrainingJob"]["Arn"]
        job_description = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=job_arn.split("/")[1]
        )

        # Every requested rule must be reflected, in order, in the description.
        for index, rule in enumerate(rules):
            config = job_description["DebugRuleConfigurations"][index]
            assert config["RuleConfigurationName"] == rule.name
            assert config["RuleEvaluatorImage"] == rule.image_uri
            assert config["VolumeSizeInGB"] == 0
            assert (
                config["RuleParameters"]["rule_to_invoke"]
                == rule.rule_parameters["rule_to_invoke"]
            )
        assert job_description["DebugHookConfig"] == hook_config._to_request_dict()

        # Profiling is on by default with a 500 ms system-metrics interval.
        assert job_description["ProfilingStatus"] == "Enabled"
        assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
    finally:
        # Best-effort cleanup; deletion failures must not mask test results.
        try:
            pipeline.delete()
        except Exception:
            pass
# Example 11
# S3 locations for training data, model artifacts, debug tensors, and code.
input_train_path = f"s3://{bucket_name}/{prefix}/data/train"
input_validation_path = f"s3://{bucket_name}/{prefix}/data/val"
model_output_path = f"s3://{bucket_name}/{prefix}/model"
debug_output_path = f"s3://{bucket_name}/{prefix}/model/debug"
model_code_location = f"s3://{bucket_name}/{prefix}/code"
entry_point = 'train_xgboost.py'
source_dir = 'workflow/training/'

# TODO: Upload source files here given we are not calling fit

# Save every step for each of the listed built-in Debugger collections.
debug_hook_config = DebuggerHookConfig(
    s3_output_path=debug_output_path,
    hook_parameters={"save_interval": "1"},
    collection_configs=[
        CollectionConfig(collection_name)
        for collection_name in ("hyperparameters", "metrics", "predictions",
                                "labels", "feature_importance")
    ],
)

# Built-in confusion-matrix rule evaluated between steps 17 and 19.
debug_rules = [
    Rule.sagemaker(
        rule_configs.confusion(),
        rule_parameters={
            "category_no": "15",
            "min_diag": "0.7",
            "max_off_diag": "0.3",
            "start_step": "17",
            "end_step": "19",
        },
    )
]
# Example 12
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    """Start MXNet training with Debugger rules, a hook config, and full profiling,
    then disable framework metrics and verify ProfilingParameters becomes empty.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Two Debugger rules plus a custom-named profiler-report rule.
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(base_config=rule_configs.all_zero(),
                           rule_parameters={"tensor_regex": ".*"}),
            ProfilerRule.sagemaker(rule_configs.ProfilerReport(),
                                   name="CustomProfilerReportRule"),
        ]
        # Unique S3 prefixes per run for debug tensors and system metrics.
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors",
        )
        profiler_config = ProfilerConfig(
            s3_output_path=
            f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test")

        training_job_name = unique_name_from_base(
            "test-profiler-mxnet-training")
        # wait=False: the job must keep running so update_profiler below
        # can modify it in-flight.
        mx.fit(
            inputs={
                "train": train_input,
                "test": test_input
            },
            job_name=training_job_name,
            wait=False,
        )

        # The job description must echo back the requested configs exactly.
        job_description = mx.latest_training_job.describe()
        assert job_description[
            "ProfilerConfig"] == profiler_config._to_request_dict()
        assert job_description[
            "DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get(
            "ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration[
            "RuleConfigurationName"] == "CustomProfilerReportRule"
        # NOTE(review): mx.rules[0] is the vanishing_gradient rule — this
        # comparison presumably relies on all rule evaluator images being
        # identical; confirm against the SDK's image resolution.
        assert profiler_rule_configuration["RuleEvaluatorImage"] == mx.rules[
            0].image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        # Only non-profiler rules appear under DebugRuleConfigurations.
        for index, rule in enumerate(mx.debugger_rules):
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleConfigurationName"] == rule.name)
            assert (job_description["DebugRuleConfigurations"][index]
                    ["RuleEvaluatorImage"] == rule.image_uri)

        # The job must reach an updatable state before update_profiler is called.
        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client,
                                            training_job_name)

        # Disabling framework metrics should clear ProfilingParameters entirely.
        mx.update_profiler(disable_framework_metrics=True)
        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"]["ProfilingParameters"] == {}
# Example 13
# TODO: change me
BUCKET_NAME = "MY_BUCKET"  # placeholder; replace with a real bucket before running
REPO_NAME = "REPO_NAME"

# Root S3 prefix for all artifacts produced by this training setup.
s3_output_location = f"s3://{BUCKET_NAME}/sagemaker/{REPO_NAME}"

# Mirror TensorBoard event files from the container path to S3.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=f"{s3_output_location}/tensorboard",
    container_local_output_path="/opt/ml/output/tensorboard",
)

# Capture the weights, gradients, and biases collections for Debugger.
hook_config = DebuggerHookConfig(
    s3_output_path=s3_output_location,
    collection_configs=[
        CollectionConfig("weights"),
        CollectionConfig("gradients"),
        CollectionConfig("biases")
    ],
)

sess = sagemaker.Session(default_bucket=BUCKET_NAME)
role = os.environ["SAGEMAKER_ROLE"]  # required — raises KeyError if unset
# Tag images by CI branch; fall back to "latest" outside CircleCI.
tag = os.environ.get("CIRCLE_BRANCH") or "latest"
account_url = os.environ["AWS_ECR_ACCOUNT_URL"]  # required — raises KeyError if unset
tf_estimator = Estimator(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.large",
    base_job_name=tag,
    sagemaker_session=sess,