def test_mxnet_with_rules_and_debugger_hook_config(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(),
                rule_parameters={"tensor_regex": ".*"},
            ),
            Rule.sagemaker(rule_configs.loss_not_decreasing()),
        ]
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=os.path.join(
                "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4()), "tensors"
            )
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        mx.fit({"train": train_input, "test": test_input})

        job_description = mx.latest_training_job.describe()

        for index, rule in enumerate(rules):
            assert (
                job_description["DebugRuleConfigurations"][index]["RuleConfigurationName"]
                == rule.name
            )
            assert (
                job_description["DebugRuleConfigurations"][index]["RuleEvaluatorImage"]
                == rule.image_uri
            )
            assert job_description["DebugRuleConfigurations"][index]["VolumeSizeInGB"] == 0
            assert (
                job_description["DebugRuleConfigurations"][index]["RuleParameters"][
                    "rule_to_invoke"
                ]
                == rule.rule_parameters["rule_to_invoke"]
            )
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        assert (
            job_description["DebugRuleEvaluationStatuses"]
            == mx.latest_training_job.rule_job_summary()
        )

        _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
def _get_custom_rule(session):
    script_path = os.path.join(DATA_DIR, "mxnet_mnist", "my_custom_rule.py")

    return Rule.custom(
        name="test-custom-rule",
        source=script_path,
        rule_to_invoke="CustomGradientRule",
        instance_type="ml.m5.xlarge",
        volume_size_in_gb=30,
        image_uri=CUSTOM_RULE_REPO_WITH_PLACEHOLDERS.format(
            CUSTOM_RULE_CONTAINERS_ACCOUNTS_MAP[session.boto_region_name],
            session.boto_region_name,
        ),
    )
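# A minimal sketch (not from the original source) of how the custom rule above would
# typically be attached to a training job. The helper name and the MXNet arguments
# (entry point, versions, instance settings) are assumed placeholders.
def _train_with_custom_rule(sagemaker_session, script_path, framework_version, py_version):
    custom_rule = _get_custom_rule(sagemaker_session)
    mx = MXNet(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=framework_version,
        py_version=py_version,
        instance_count=1,
        instance_type="ml.m5.xlarge",
        sagemaker_session=sagemaker_session,
        # Custom rules ride in the same list as built-in Rule.sagemaker(...) rules.
        rules=[custom_rule],
    )
    return mx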
def pca_estimator_with_debug_hook():
    s3_output_location = 's3://sagemaker/models'

    hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/output/debug',
        hook_parameters={"save_interval": "1"},
        collection_configs=[
            CollectionConfig("hyperparameters"),
            CollectionConfig("metrics"),
        ],
    )

    rules = [
        Rule.sagemaker(
            rule_configs.confusion(),
            rule_parameters={
                "category_no": "15",
                "min_diag": "0.7",
                "max_off_diag": "0.3",
                "start_step": "17",
                "end_step": "19",
            },
        )
    ]

    pca = sagemaker.estimator.Estimator(
        PCA_IMAGE,
        role=EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type='ml.c4.xlarge',
        output_path=s3_output_location,
        debugger_hook_config=hook_config,
        rules=rules,
    )

    pca.set_hyperparameters(
        feature_dim=50000,
        num_components=10,
        subtract_mean=True,
        algorithm_mode='randomized',
        mini_batch_size=200,
    )

    # Stub out the session so exercising the estimator makes no AWS calls.
    pca.sagemaker_session = MagicMock()
    pca.sagemaker_session.boto_region_name = 'us-east-1'
    pca.sagemaker_session._default_bucket = 'sagemaker'

    return pca
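# Hypothetical usage sketch (not from the source): exercising the fixture above to
# inspect the debug hook request payload without touching AWS. The assertion value
# mirrors the s3_output_path set in the fixture.
pca = pca_estimator_with_debug_hook()
hook_request = pca.debugger_hook_config._to_request_dict()
assert hook_request["S3OutputPath"] == 's3://sagemaker/output/debug'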
def create_model(image: str, hyperparameters: dict, instance_type: str, output_path: str,
                 region_name: str, role: str, s3_train: str, s3_validation: str,
                 job_name: str):
    # The built-in XGBoost image needs an explicit framework version.
    if image == 'xgboost':
        container = get_image_uri(region_name, image, '0.90-2')
    else:
        container = get_image_uri(region_name, image)

    save_interval = '1'

    model = sagemaker.estimator.Estimator(
        container,
        role=role,
        train_instance_count=1,
        train_instance_type=instance_type,
        train_use_spot_instances=True,
        train_max_run=300,
        train_max_wait=600,
        output_path=output_path,
        # NOTE: `bucket` and `prefix` are assumed to be module-level globals;
        # they are not parameters of this function.
        debugger_hook_config=DebuggerHookConfig(
            s3_output_path=f's3://{bucket}/{prefix}/debug',
            collection_configs=[
                CollectionConfig(name='metrics',
                                 parameters={'save_interval': save_interval}),
                CollectionConfig(name='feature_importance',
                                 parameters={'save_interval': save_interval}),
                CollectionConfig(name='full_shap',
                                 parameters={'save_interval': save_interval}),
                CollectionConfig(name='average_shap',
                                 parameters={'save_interval': save_interval}),
            ]),
        rules=[
            Rule.sagemaker(rule_configs.class_imbalance(),
                           rule_parameters={'collection_names': 'metrics'})
        ])

    model.set_hyperparameters(**hyperparameters)

    data_channel = {
        'train': s3_input(s3_train, content_type='text/csv'),
        'validation': s3_input(s3_validation, content_type='text/csv'),
    }
    model.fit(data_channel, job_name=job_name)

    return model
def test_training_job_with_debugger_and_profiler(
    sagemaker_session,
    pipeline_name,
    role,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    rules = [
        Rule.sagemaker(rule_configs.vanishing_gradient()),
        Rule.sagemaker(
            base_config=rule_configs.all_zero(),
            rule_parameters={"tensor_regex": ".*"},
        ),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ]
    debugger_hook_config = DebuggerHookConfig(
        s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors"
    )

    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    script_path = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    pytorch_estimator = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        rules=rules,
        debugger_hook_config=debugger_hook_config,
    )

    step_train = TrainingStep(
        name="pytorch-train",
        estimator=pytorch_estimator,
        inputs=inputs,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[step_train],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        execution = pipeline.start()
        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=10, max_attempts=60)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()

        assert len(execution_steps) == 1
        assert execution_steps[0].get("FailureReason", "") == ""
        assert execution_steps[0]["StepName"] == "pytorch-train"
        assert execution_steps[0]["StepStatus"] == "Succeeded"

        training_job_arn = execution_steps[0]["Metadata"]["TrainingJob"]["Arn"]
        job_description = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=training_job_arn.split("/")[1]
        )

        for index, rule in enumerate(rules):
            config = job_description["DebugRuleConfigurations"][index]
            assert config["RuleConfigurationName"] == rule.name
            assert config["RuleEvaluatorImage"] == rule.image_uri
            assert config["VolumeSizeInGB"] == 0
            assert (
                config["RuleParameters"]["rule_to_invoke"]
                == rule.rule_parameters["rule_to_invoke"]
            )
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        assert job_description["ProfilingStatus"] == "Enabled"
        assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
debug_hook_config = DebuggerHookConfig(
    s3_output_path=debug_output_path,
    hook_parameters={"save_interval": "1"},
    collection_configs=[
        CollectionConfig("hyperparameters"),
        CollectionConfig("metrics"),
        CollectionConfig("predictions"),
        CollectionConfig("labels"),
        CollectionConfig("feature_importance"),
    ],
)

debug_rules = [
    Rule.sagemaker(
        rule_configs.confusion(),
        rule_parameters={
            "category_no": "15",
            "min_diag": "0.7",
            "max_off_diag": "0.3",
            "start_step": "17",
            "end_step": "19",
        },
    )
]

hyperparameters = {
    "max_depth": "10",
    "eta": "0.2",
    "gamma": "1",
    "min_child_weight": "6",
    "silent": "0",
    "objective": "multi:softmax",
    "num_class": "15",
    "num_round": "1",  # TEMP: Hack to make faster
}
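# A minimal sketch (assumed, not from the original source) of how the configs above
# plug into a training job. `xgboost_container`, `role`, and `s3_output_location`
# are hypothetical placeholders defined elsewhere.
xgb = sagemaker.estimator.Estimator(
    xgboost_container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=s3_output_location,
    debugger_hook_config=debug_hook_config,
    rules=debug_rules,
)
xgb.set_hyperparameters(**hyperparameters)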
def test_mxnet_with_profiler_and_debugger_then_disable_framework_metrics(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        rules = [
            Rule.sagemaker(rule_configs.vanishing_gradient()),
            Rule.sagemaker(
                base_config=rule_configs.all_zero(),
                rule_parameters={"tensor_regex": ".*"},
            ),
            ProfilerRule.sagemaker(
                rule_configs.ProfilerReport(), name="CustomProfilerReportRule"
            ),
        ]
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/tensors",
        )
        profiler_config = ProfilerConfig(
            s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{str(uuid.uuid4())}/system",
            system_monitor_interval_millis=1000,
            framework_profile_params=FrameworkProfile(),
        )

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
        data_path = os.path.join(DATA_DIR, "mxnet_mnist")

        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            rules=rules,
            debugger_hook_config=debugger_hook_config,
            profiler_config=profiler_config,
        )

        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/mxnet_mnist/train",
        )
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/mxnet_mnist/test",
        )

        training_job_name = unique_name_from_base("test-profiler-mxnet-training")
        mx.fit(
            inputs={"train": train_input, "test": test_input},
            job_name=training_job_name,
            wait=False,
        )

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"] == profiler_config._to_request_dict()
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
        assert job_description.get("ProfilingStatus") == "Enabled"

        profiler_rule_configuration = job_description.get("ProfilerRuleConfigurations")[0]
        assert profiler_rule_configuration["RuleConfigurationName"] == "CustomProfilerReportRule"
        assert profiler_rule_configuration["RuleEvaluatorImage"] == mx.rules[0].image_uri
        assert profiler_rule_configuration["RuleParameters"] == {
            "rule_to_invoke": "ProfilerReport",
        }

        for index, rule in enumerate(mx.debugger_rules):
            assert (
                job_description["DebugRuleConfigurations"][index]["RuleConfigurationName"]
                == rule.name
            )
            assert (
                job_description["DebugRuleConfigurations"][index]["RuleEvaluatorImage"]
                == rule.image_uri
            )

        _wait_until_training_can_be_updated(sagemaker_session.sagemaker_client, training_job_name)

        mx.update_profiler(disable_framework_metrics=True)

        job_description = mx.latest_training_job.describe()
        assert job_description["ProfilerConfig"]["ProfilingParameters"] == {}
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'xgboost_model')

# print the container image's universal resource identifier (URI)
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

# train the model by using the xgboost model estimator
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())],
)

# set the hyperparameters for the model
xgb_model.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
    objective="binary:logistic",
    num_round=1000,
)

# configure the input data flow
from sagemaker.session import TrainingInput

train_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "data/train.csv"),
    content_type="csv",  # assumed value; the original snippet was truncated after the first argument
)
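# A hedged continuation (assumed, not in the original snippet): a validation channel
# and the fit() call that launches the training job with the XGBoost report rule.
# The "data/validation.csv" key is a hypothetical placeholder.
validation_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "data/validation.csv"), content_type="csv"
)
xgb_model.fit({"train": train_input, "validation": validation_input}, wait=True)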