def test_create_model_from_estimator(sagemaker_session, xgboost_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    xgboost = XGBoost(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_type=INSTANCE_TYPE,
        train_instance_count=1,
        framework_version=xgboost_version,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "new_name"
    xgboost.fit(inputs="s3://mybucket/train", job_name=job_name)
    model = xgboost.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == xgboost_version
    assert model.py_version == xgboost.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.vpc_config is None
def test_attach_custom_image(sagemaker_session):
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/my_custom_xgboost_image:latest"
    returned_job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description
    )

    with pytest.raises(TypeError) as error:
        XGBoost.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert "expected string" in str(error)
def test_training_with_network_isolation(
    sagemaker_session,
    xgboost_latest_version,
    xgboost_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        base_job_name = "test-network-isolation-xgboost"
        xgboost = XGBoost(
            entry_point=os.path.join(DATA_DIR, "xgboost_abalone", "abalone.py"),
            role=ROLE,
            instance_type=cpu_instance_type,
            instance_count=1,
            framework_version=xgboost_latest_version,
            py_version=xgboost_latest_py_version,
            base_job_name=base_job_name,
            sagemaker_session=sagemaker_session,
            enable_network_isolation=True,
        )

        train_input = xgboost.sagemaker_session.upload_data(
            path=os.path.join(DATA_DIR, "xgboost_abalone", "abalone"),
            key_prefix="integ-test-data/xgboost_abalone/abalone",
        )
        job_name = unique_name_from_base(base_job_name)
        xgboost.fit(inputs={"train": train_input}, job_name=job_name)

        assert sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=job_name
        )["EnableNetworkIsolation"]
def test_attach_wrong_framework(sagemaker_session):
    rjd = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py3-cpu:1.0.4",
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        XGBoost.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
def main():
    print('Starting model training.')
    print(
        'Note: if launching for the first time in local mode, container image '
        'download might take a few minutes to complete.'
    )

    hyperparameters = {
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.7",
        "objective": "reg:squarederror",
        "num_round": "50",
        "verbosity": "2",
    }

    xgb_script_mode_estimator = XGBoost(
        entry_point="./code/abalone.py",
        hyperparameters=hyperparameters,
        role=DUMMY_IAM_ROLE,
        instance_count=1,
        instance_type='local',
        framework_version="1.2-1",
    )

    train_input = TrainingInput("file://./data/train/abalone", content_type="text/libsvm")
    xgb_script_mode_estimator.fit({"train": train_input, "validation": train_input})
    print('Completed model training')

    model_data = xgb_script_mode_estimator.model_data
    print(model_data)

    xgb_inference_model = XGBoostModel(
        model_data=model_data,
        role=DUMMY_IAM_ROLE,
        entry_point="./code/inference.py",
        framework_version="1.2-1",
    )

    print('Deploying endpoint in local mode')
    predictor = xgb_inference_model.deploy(
        initial_instance_count=1,
        instance_type="local",
    )

    a_young_abalone = "6 1:3 2:0.37 3:0.29 4:0.095 5:0.249 6:0.1045 7:0.058 8:0.067"
    do_inference_on_local_endpoint(predictor, a_young_abalone)

    an_old_abalone = "15 1:1 2:0.655 3:0.53 4:0.175 5:1.2635 6:0.486 7:0.2635 8:0.415"
    do_inference_on_local_endpoint(predictor, an_old_abalone)

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    predictor.delete_endpoint(predictor.endpoint_name)
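# main() above calls do_inference_on_local_endpoint(), which is not shown in
# this snippet. A minimal sketch of what such a helper could look like; the
# serializer choice is an assumption (the payloads are libsvm-formatted rows),
# not the original helper's implementation:
from sagemaker.serializers import LibSVMSerializer


def do_inference_on_local_endpoint(predictor, payload):
    # Send one libsvm-formatted row to the (local) endpoint and print the result.
    predictor.serializer = LibSVMSerializer()
    prediction = predictor.predict(payload)
    print("Prediction: {}".format(prediction))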
def test_training_image_uri(sagemaker_session, xgboost_framework_version):
    xgboost = XGBoost(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        framework_version=xgboost_framework_version,
        sagemaker_session=sagemaker_session,
        instance_type=INSTANCE_TYPE,
        instance_count=1,
        py_version=PYTHON_VERSION,
    )
    assert _get_full_image_uri(xgboost_framework_version) in xgboost.training_image_uri()
def test_train_image_default(sagemaker_session):
    xgboost = XGBoost(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        framework_version=XGBOOST_LATEST_VERSION,
        sagemaker_session=sagemaker_session,
        train_instance_type=INSTANCE_TYPE,
        train_instance_count=1,
        py_version=PYTHON_VERSION,
    )
    assert _get_full_cpu_image_uri(XGBOOST_LATEST_VERSION) in xgboost.train_image()
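# The two tests above rely on _get_full_image_uri()/_get_full_cpu_image_uri(),
# which are defined elsewhere in the test module. A plausible sketch, assuming
# they build the same ECR URI that test_train_image and test_distributed_training
# hardcode below; treat the exact bodies as an assumption:
IMAGE_URI_FORMAT = "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:{}-{}-{}"


def _get_full_image_uri(version, processor="cpu", py_version=PYTHON_VERSION):
    # Full XGBoost container URI for a given framework version and processor.
    return IMAGE_URI_FORMAT.format(version, processor, py_version)


def _get_full_cpu_image_uri(version):
    return _get_full_image_uri(version, processor="cpu")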
def test_py2_xgboost_error(sagemaker_session, xgboost_framework_version):
    with pytest.raises(ValueError) as error1:
        XGBoost(
            entry_point=SCRIPT_PATH,
            role=ROLE,
            framework_version=xgboost_framework_version,
            sagemaker_session=sagemaker_session,
            instance_type=INSTANCE_TYPE,
            instance_count=1,
            py_version="py2",
        )
    with pytest.raises(ValueError) as error2:
        model = XGBoostModel(
            model_data=DATA_DIR,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            entry_point=SCRIPT_PATH,
            framework_version=xgboost_framework_version,
            py_version="py2",
        )
        model.serving_image_uri(REGION, INSTANCE_TYPE)

    error_message = "Unsupported Python version: py2."
    assert error_message in str(error1)
    assert error_message in str(error2)
def test_py2_xgboost_attribute_error(sagemaker_session):
    with pytest.raises(AttributeError) as error1:
        XGBoost(
            entry_point=SCRIPT_PATH,
            role=ROLE,
            framework_version=XGBOOST_LATEST_VERSION,
            sagemaker_session=sagemaker_session,
            train_instance_type=INSTANCE_TYPE,
            train_instance_count=1,
            py_version="py2",
        )
    with pytest.raises(AttributeError) as error2:
        XGBoostModel(
            model_data=DATA_DIR,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            entry_point=SCRIPT_PATH,
            framework_version=XGBOOST_LATEST_VERSION,
            py_version="py2",
        )

    error_message = "XGBoost container does not support Python 2, please use Python 3"
    assert error_message in str(error1)
    assert error_message in str(error2)
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    enable_cloudwatch_metrics = "true"
    xgboost = XGBoost(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        framework_version=XGBOOST_LATEST_VERSION,
        sagemaker_session=sagemaker_session,
        train_instance_type=INSTANCE_TYPE,
        train_instance_count=1,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=source_dir,
        enable_cloudwatch_metrics=enable_cloudwatch_metrics,
    )

    xgboost.fit(inputs="s3://mybucket/train", job_name="new_name")

    custom_image = "ubuntu:latest"
    new_role = "role"
    model_server_workers = 2
    vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]}
    new_source_dir = "s3://myotherbucket/source"
    dependencies = ["/directory/a", "/directory/b"]
    model_name = "model-name"
    model = xgboost.create_model(
        image=custom_image,
        role=new_role,
        model_server_workers=model_server_workers,
        vpc_config_override=vpc_config,
        entry_point=SERVING_SCRIPT_FILE,
        source_dir=new_source_dir,
        dependencies=dependencies,
        name=model_name,
    )

    assert model.image == custom_image
    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
    assert model.entry_point == SERVING_SCRIPT_FILE
    assert model.source_dir == new_source_dir
    assert model.dependencies == dependencies
    assert model.name == model_name
def test_attach(sagemaker_session, xgboost_version):
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:{}-cpu-{}".format(
        xgboost_version, PYTHON_VERSION
    )
    returned_job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description
    )

    estimator = XGBoost.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert estimator._current_job_name == "neo"
    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.py_version == PYTHON_VERSION
    assert estimator.framework_version == xgboost_version
    assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == "s3://place/output/neo"
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "iris-dnn-classifier.py"
def test_train_image(sagemaker_session, xgboost_version):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    xgboost = XGBoost(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_type=INSTANCE_TYPE,
        train_instance_count=1,
        framework_version=xgboost_version,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=source_dir,
    )

    train_image = xgboost.train_image()
    # The expected URI must track the parametrized version rather than a
    # hardcoded tag, matching the pattern used in test_attach.
    assert train_image == "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:{}-cpu-{}".format(
        xgboost_version, PYTHON_VERSION
    )
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "ubuntu:latest"
    xgboost = XGBoost(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        framework_version=XGBOOST_LATEST_VERSION,
        sagemaker_session=sagemaker_session,
        train_instance_type=INSTANCE_TYPE,
        train_instance_count=1,
        image_name=custom_image,
        container_log_level=container_log_level,
        py_version=PYTHON_VERSION,
        base_job_name="job",
        source_dir=source_dir,
    )

    xgboost.fit(inputs="s3://mybucket/train", job_name="new_name")
    model = xgboost.create_model()

    assert model.image == custom_image
def test_unsupported_xgboost_version_error(sagemaker_session):
    with pytest.raises(ValueError) as error1:
        XGBoost(
            entry_point=SCRIPT_PATH,
            role=ROLE,
            framework_version="1.1",
            sagemaker_session=sagemaker_session,
            instance_type=INSTANCE_TYPE,
            instance_count=1,
        )
    with pytest.raises(ValueError) as error2:
        XGBoost(
            entry_point=SCRIPT_PATH,
            role=ROLE,
            framework_version="1.1-1",
            sagemaker_session=sagemaker_session,
            instance_type=INSTANCE_TYPE,
            instance_count=1,
        )

    error_message = "XGBoost 1.1 is not supported"
    assert error_message in str(error1)
    assert error_message in str(error2)
@patch("time.strftime", return_value=TIMESTAMP)
def test_distributed_training(strftime, sagemaker_session, xgboost_framework_version):
    xgboost = XGBoost(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=DIST_INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        py_version=PYTHON_VERSION,
        framework_version=xgboost_framework_version,
    )

    inputs = "s3://mybucket/train"
    xgboost.fit(inputs=inputs)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(xgboost_framework_version, DIST_INSTANCE_COUNT)
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = xgboost.create_model()

    expected_image_base = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:{}-cpu-{}"
    )
    assert {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": "s3://mybucket/sagemaker-xgboost-{}/source/sourcedir.tar.gz".format(
                TIMESTAMP
            ),
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": expected_image_base.format(xgboost_framework_version, PYTHON_VERSION),
        "ModelDataUrl": "s3://m/m.tar.gz",
    } == model.prepare_container_def(CPU)
    assert "cpu" in model.prepare_container_def(CPU)["Image"]

    predictor = xgboost.deploy(1, CPU)
    assert isinstance(predictor, XGBoostPredictor)
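# _create_train_job() is another module-level helper not shown here; the
# distributed and GPU tests compare against the dict it returns. A trimmed,
# hypothetical sketch (field names mirror the keyword arguments of
# sagemaker.session.Session.train; JOB_NAME and BUCKET_NAME are assumed
# module constants, and the framework hyperparameters are elided):
def _create_train_job(version, instance_count=1, instance_type=INSTANCE_TYPE):
    return {
        "image_uri": _get_full_image_uri(version),
        "input_mode": "File",
        "input_config": [
            {
                "ChannelName": "training",
                "DataSource": {
                    "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                    }
                },
            }
        ],
        "role": ROLE,
        "job_name": JOB_NAME,
        "output_config": {"S3OutputPath": "s3://{}/".format(BUCKET_NAME)},
        "resource_config": {
            "InstanceType": instance_type,
            "InstanceCount": instance_count,
            "VolumeSizeInGB": 30,
        },
        "hyperparameters": {},  # elided: sagemaker_* framework hyperparameters
        "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
    }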
def _xgboost_estimator(
    sagemaker_session,
    framework_version=XGBOOST_LATEST_VERSION,
    train_instance_type=None,
    train_instance_count=1,
    base_job_name=None,
    **kwargs
):
    return XGBoost(
        entry_point=SCRIPT_PATH,
        framework_version=framework_version,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        train_instance_type=train_instance_type if train_instance_type else INSTANCE_TYPE,
        train_instance_count=train_instance_count,
        base_job_name=base_job_name,
        py_version=PYTHON_VERSION,
        **kwargs
    )
@patch("time.strftime", return_value=TIMESTAMP)
@patch("time.time", return_value=TIME)
def test_xgboost_gpu(time, strftime, sagemaker_session, xgboost_gpu_framework_version):
    xgboost = XGBoost(
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_type=GPU_INSTANCE_TYPE,
        instance_count=1,
        framework_version=xgboost_gpu_framework_version,
    )

    inputs = "s3://mybucket/train"
    xgboost.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]
    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(
        xgboost_gpu_framework_version, instance_type=GPU_INSTANCE_TYPE
    )
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = xgboost.create_model()

    assert {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": "s3://mybucket/sagemaker-xgboost-{}/source/sourcedir.tar.gz".format(
                TIMESTAMP
            ),
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": _get_full_image_uri(xgboost_gpu_framework_version),
        "ModelDataUrl": "s3://m/m.tar.gz",
    } == model.prepare_container_def(GPU_INSTANCE_TYPE)

    predictor = xgboost.deploy(1, GPU_INSTANCE_TYPE)
    assert isinstance(predictor, XGBoostPredictor)
def test_xgboost_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        xgboost = XGBoost(
            entry_point=os.path.join(DATA_DIR, "dummy_script.py"),
            framework_version=XGBOOST_LATEST_VERSION,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            train_instance_type=cpu_instance_type,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            base_job_name="XGBoost job",
            py_version=PYTHON_VERSION,
        )

        training_config = _build_airflow_workflow(
            estimator=xgboost, instance_type=cpu_instance_type
        )

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["HyperParameters"]["sagemaker_submit_directory"].strip('"'),
        )
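# _build_airflow_workflow() and _assert_that_s3_url_contains_data() are shared
# integ-test helpers that are not part of this snippet. A rough sketch of the
# first, assuming it simply wraps sagemaker.workflow.airflow.training_config;
# the real helper may also attach the config to an Airflow DAG:
from sagemaker.workflow.airflow import training_config as airflow_training_config


def _build_airflow_workflow(estimator, instance_type, inputs=None):
    # Produce the Airflow training-operator config dict for the estimator.
    return airflow_training_config(estimator=estimator, inputs=inputs)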
def get_pipeline(
        region,
        sagemaker_project_arn=None,
        role=None,
        default_bucket='',
        pipeline_name='end-to-end-ml-sagemaker-pipeline',
        model_package_group_name='end-to-end-ml-sm-model-package-group',
        base_job_prefix='endtoendmlsm') -> Pipeline:
    """
    Gets the SM Pipeline.

    :param region: The AWS region where the pipeline is created.
    :param sagemaker_project_arn: The ARN of the SageMaker project (optional).
    :param role: The execution role.
    :param default_bucket: The bucket where pipeline artifacts are stored.
    :param pipeline_name: The name of the pipeline.
    :param model_package_group_name: The model package group for the register step.
    :param base_job_prefix: The prefix where pipeline artifacts are stored.
    :return: A Pipeline instance.
    """
    bucket_name = default_bucket
    prefix = 'endtoendmlsm'
    sagemaker_session = get_session(region, bucket_name)

    # ---------------------
    # Processing parameters
    # ---------------------
    # The path to the raw data.
    raw_data_path = 's3://gianpo-public/endtoendml/data/raw/predmain_raw_data_header.csv'
    raw_data_path_param = ParameterString(name="raw_data_path", default_value=raw_data_path)
    # The output path to the training data.
    train_data_path = 's3://{0}/{1}/data/preprocessed/train/'.format(bucket_name, prefix)
    train_data_path_param = ParameterString(name="train_data_path", default_value=train_data_path)
    # The output path to the validation data.
    val_data_path = 's3://{0}/{1}/data/preprocessed/val/'.format(bucket_name, prefix)
    val_data_path_param = ParameterString(name="val_data_path", default_value=val_data_path)
    # The output path to the featurizer model.
    model_path = 's3://{0}/{1}/output/sklearn/'.format(bucket_name, prefix)
    model_path_param = ParameterString(name="model_path", default_value=model_path)
    # The instance type for the processing job.
    processing_instance_type_param = ParameterString(
        name="processing_instance_type", default_value='ml.m5.large')
    # The instance count for the processing job.
    processing_instance_count_param = ParameterInteger(
        name="processing_instance_count", default_value=1)
    # The train/test split ratio parameter.
    train_test_split_ratio_param = ParameterString(
        name="train_test_split_ratio", default_value='0.2')

    # -------------------
    # Training parameters
    # -------------------
    # XGB hyperparameters.
    max_depth_param = ParameterString(name="max_depth", default_value='3')
    eta_param = ParameterString(name="eta", default_value='0.1')
    gamma_param = ParameterString(name="gamma", default_value='0')
    min_child_weight_param = ParameterString(name="min_child_weight", default_value='1')
    objective_param = ParameterString(name="objective", default_value='binary:logistic')
    num_round_param = ParameterString(name="num_round", default_value='10')
    eval_metric_param = ParameterString(name="eval_metric", default_value='auc')
    # The instance type for the training job.
    training_instance_type_param = ParameterString(
        name="training_instance_type", default_value='ml.m5.xlarge')
    # The instance count for the training job.
    training_instance_count_param = ParameterInteger(
        name="training_instance_count", default_value=1)
    # The training output path for the model.
    output_path = 's3://{0}/{1}/output/'.format(bucket_name, prefix)
    output_path_param = ParameterString(name="output_path", default_value=output_path)

    # --------------------------
    # Register model parameters
    # --------------------------
    # The default instance type for deployment.
    deploy_instance_type_param = ParameterString(
        name="deploy_instance_type", default_value='ml.m5.2xlarge')
    # The approval status for models added to the registry.
    model_approval_status_param = ParameterString(
        name="model_approval_status", default_value='PendingManualApproval')

    # --------------------------
    # Processing Step
    # --------------------------
    sklearn_processor = SKLearnProcessor(
        role=role,
        instance_type=processing_instance_type_param,
        instance_count=processing_instance_count_param,
        framework_version='0.20.0')
    inputs = [
        ProcessingInput(input_name='raw_data',
                        source=raw_data_path_param,
                        destination='/opt/ml/processing/input')
    ]
    outputs = [
        ProcessingOutput(output_name='train_data',
                         source='/opt/ml/processing/train',
                         destination=train_data_path_param),
        ProcessingOutput(output_name='val_data',
                         source='/opt/ml/processing/val',
                         destination=val_data_path_param),
        ProcessingOutput(output_name='model',
                         source='/opt/ml/processing/model',
                         destination=model_path_param)
    ]
    code_path = os.path.join(BASE_DIR, 'dataprep/preprocess.py')
    processing_step = ProcessingStep(
        name='Processing',
        code=code_path,
        processor=sklearn_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=['--train-test-split-ratio', train_test_split_ratio_param])

    # --------------------------
    # Training Step
    # --------------------------
    hyperparameters = {
        "max_depth": max_depth_param,
        "eta": eta_param,
        "gamma": gamma_param,
        "min_child_weight": min_child_weight_param,
        "silent": 0,
        "objective": objective_param,
        "num_round": num_round_param,
        "eval_metric": eval_metric_param
    }
    entry_point = 'train.py'
    source_dir = os.path.join(BASE_DIR, 'train/')
    code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)
    estimator = XGBoost(
        entry_point=entry_point,
        source_dir=source_dir,
        output_path=output_path_param,
        code_location=code_location,
        hyperparameters=hyperparameters,
        instance_type=training_instance_type_param,
        instance_count=training_instance_count_param,
        framework_version="0.90-2",
        py_version="py3",
        role=role)
    training_step = TrainingStep(
        name='Training',
        estimator=estimator,
        inputs={
            'train': TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                    'train_data'].S3Output.S3Uri,
                content_type='text/csv'),
            'validation': TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                    'val_data'].S3Output.S3Uri,
                content_type='text/csv')
        })

    # --------------------------
    # Register Model Step
    # --------------------------
    code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)
    sklearn_model = SKLearnModel(
        name='end-to-end-ml-sm-skl-model-{0}'.format(str(int(time.time()))),
        model_data=processing_step.properties.ProcessingOutputConfig.Outputs[
            'model'].S3Output.S3Uri,
        entry_point='inference.py',
        source_dir=os.path.join(BASE_DIR, 'deploy/sklearn/'),
        code_location=code_location,
        role=role,
        sagemaker_session=sagemaker_session,
        framework_version='0.20.0',
        py_version='py3')
    code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)
    xgboost_model = XGBoostModel(
        name='end-to-end-ml-sm-xgb-model-{0}'.format(str(int(time.time()))),
        model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        entry_point='inference.py',
        source_dir=os.path.join(BASE_DIR, 'deploy/xgboost/'),
        code_location=code_location,
        framework_version='0.90-2',
        py_version='py3',
        role=role,
        sagemaker_session=sagemaker_session)
    pipeline_model_name = 'end-to-end-ml-sm-xgb-skl-pipeline-{0}'.format(str(int(time.time())))
    pipeline_model = PipelineModel(
        name=pipeline_model_name,
        role=role,
        models=[sklearn_model, xgboost_model],
        sagemaker_session=sagemaker_session)
    register_model_step = RegisterModel(
        name='RegisterModel',
        content_types=['text/csv'],
        response_types=['application/json', 'text/csv'],
        inference_instances=[deploy_instance_type_param, 'ml.m5.large'],
        transform_instances=['ml.c5.4xlarge'],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status_param,
        model=pipeline_model)

    # --------------------------
    # Pipeline
    # --------------------------
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            raw_data_path_param, train_data_path_param, val_data_path_param,
            model_path_param, processing_instance_type_param,
            processing_instance_count_param, train_test_split_ratio_param,
            max_depth_param, eta_param, gamma_param, min_child_weight_param,
            objective_param, num_round_param, eval_metric_param,
            training_instance_type_param, training_instance_count_param,
            output_path_param, deploy_instance_type_param,
            model_approval_status_param
        ],
        steps=[processing_step, training_step, register_model_step],
        sagemaker_session=sagemaker_session,
    )
    response = pipeline.upsert(role_arn=role)
    print(response["PipelineArn"])
    return pipeline
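# Example invocation of get_pipeline(); the region, role ARN, and bucket below
# are placeholders, not values from the original source:
if __name__ == "__main__":
    get_pipeline(
        region='us-east-1',
        role='arn:aws:iam::111122223333:role/service-role/SageMakerExecutionRole',
        default_bucket='my-sagemaker-artifacts-bucket',
    )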
def test_sklearn_xgboost_sip_model_registration(
    sagemaker_session, role, pipeline_name, region_name
):
    prefix = "sip"
    bucket_name = sagemaker_session.default_bucket()
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    sklearn_processor = SKLearnProcessor(
        role=role,
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.20.0",
        sagemaker_session=sagemaker_session,
    )

    # The path to the raw data.
    raw_data_path = "s3://{0}/{1}/data/raw/".format(bucket_name, prefix)
    raw_data_path_param = ParameterString(name="raw_data_path", default_value=raw_data_path)

    # The output path to the training data.
    train_data_path = "s3://{0}/{1}/data/preprocessed/train/".format(bucket_name, prefix)
    train_data_path_param = ParameterString(name="train_data_path", default_value=train_data_path)

    # The output path to the validation data.
    val_data_path = "s3://{0}/{1}/data/preprocessed/val/".format(bucket_name, prefix)
    val_data_path_param = ParameterString(name="val_data_path", default_value=val_data_path)

    # The training output path for the model.
    output_path = "s3://{0}/{1}/output/".format(bucket_name, prefix)
    output_path_param = ParameterString(name="output_path", default_value=output_path)

    # The output path to the featurizer model.
    model_path = "s3://{0}/{1}/output/sklearn/".format(bucket_name, prefix)
    model_path_param = ParameterString(name="model_path", default_value=model_path)

    inputs = [
        ProcessingInput(
            input_name="raw_data",
            source=raw_data_path_param,
            destination="/opt/ml/processing/input",
        )
    ]
    outputs = [
        ProcessingOutput(
            output_name="train_data",
            source="/opt/ml/processing/train",
            destination=train_data_path_param,
        ),
        ProcessingOutput(
            output_name="val_data",
            source="/opt/ml/processing/val",
            destination=val_data_path_param,
        ),
        ProcessingOutput(
            output_name="model",
            source="/opt/ml/processing/model",
            destination=model_path_param,
        ),
    ]
    base_dir = os.path.join(DATA_DIR, "sip")
    code_path = os.path.join(base_dir, "preprocessor.py")
    processing_step = ProcessingStep(
        name="Processing",
        code=code_path,
        processor=sklearn_processor,
        inputs=inputs,
        outputs=outputs,
        job_arguments=["--train-test-split-ratio", "0.2"],
    )

    entry_point = "training.py"
    source_dir = base_dir
    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)
    estimator = XGBoost(
        entry_point=entry_point,
        source_dir=source_dir,
        output_path=output_path_param,
        code_location=code_location,
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.90-2",
        sagemaker_session=sagemaker_session,
        py_version="py3",
        role=role,
    )
    training_step = TrainingStep(
        name="Training",
        estimator=estimator,
        inputs={
            "train": TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                    "train_data"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                    "val_data"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)
    source_dir = os.path.join(base_dir, "sklearn_source_dir")
    sklearn_model = SKLearnModel(
        name="sklearn-model",
        model_data=processing_step.properties.ProcessingOutputConfig.Outputs[
            "model"
        ].S3Output.S3Uri,
        entry_point="inference.py",
        source_dir=source_dir,
        code_location=code_location,
        role=role,
        sagemaker_session=sagemaker_session,
        framework_version="0.20.0",
        py_version="py3",
    )

    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)
    source_dir = os.path.join(base_dir, "xgboost_source_dir")
    xgboost_model = XGBoostModel(
        name="xgboost-model",
        model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        entry_point="inference.py",
        source_dir=source_dir,
        code_location=code_location,
        framework_version="0.90-2",
        py_version="py3",
        role=role,
        sagemaker_session=sagemaker_session,
    )

    pipeline_model = PipelineModel(
        [xgboost_model, sklearn_model], role, sagemaker_session=sagemaker_session
    )

    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        model=pipeline_model,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name="windturbine",
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            raw_data_path_param,
            train_data_path_param,
            val_data_path_param,
            model_path_param,
            instance_type,
            instance_count,
            output_path_param,
        ],
        steps=[processing_step, training_step, step_register],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.upsert(role_arn=role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        execution = pipeline.start()
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_model_registration_with_drift_check_baselines(
    sagemaker_session,
    role,
    pipeline_name,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    # upload model data to s3
    model_local_path = os.path.join(DATA_DIR, "mxnet_mnist/model.tar.gz")
    model_base_uri = "s3://{}/{}/input/model/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("model"),
    )
    model_uri = S3Uploader.upload(
        model_local_path, model_base_uri, sagemaker_session=sagemaker_session
    )
    model_uri_param = ParameterString(name="model_uri", default_value=model_uri)

    # upload metrics to s3
    metrics_data = (
        '{"regression_metrics": {"mse": {"value": 4.925353410353891, '
        '"standard_deviation": 2.219186917819692}}}'
    )
    metrics_base_uri = "s3://{}/{}/input/metrics/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("metrics"),
    )
    metrics_uri = S3Uploader.upload_string_as_file_body(
        body=metrics_data,
        desired_s3_uri=metrics_base_uri,
        sagemaker_session=sagemaker_session,
    )
    metrics_uri_param = ParameterString(name="metrics_uri", default_value=metrics_uri)

    model_metrics = ModelMetrics(
        bias=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    drift_check_baselines = DriftCheckBaselines(
        model_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    customer_metadata_properties = {"key1": "value1"}

    estimator = XGBoost(
        entry_point="training.py",
        source_dir=os.path.join(DATA_DIR, "sip"),
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.90-2",
        sagemaker_session=sagemaker_session,
        py_version="py3",
        role=role,
    )
    step_register = RegisterModel(
        name="MyRegisterModelStep",
        estimator=estimator,
        model_data=model_uri_param,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name="testModelPackageGroup",
        model_metrics=model_metrics,
        drift_check_baselines=drift_check_baselines,
        customer_metadata_properties=customer_metadata_properties,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            model_uri_param,
            metrics_uri_param,
            instance_type,
            instance_count,
        ],
        steps=[step_register],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        for _ in retries(
            max_retry_count=5,
            exception_message_prefix="Waiting for a successful execution of pipeline",
            seconds_to_sleep=10,
        ):
            execution = pipeline.start(
                parameters={"model_uri": model_uri, "metrics_uri": metrics_uri}
            )
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            failure_reason = execution_steps[0].get("FailureReason", "")
            if failure_reason != "":
                logging.error(
                    f"Pipeline execution failed with error: {failure_reason}. Retrying.."
                )
                continue
            assert execution_steps[0]["StepStatus"] == "Succeeded"
            assert execution_steps[0]["StepName"] == "MyRegisterModelStep"

            response = sagemaker_session.sagemaker_client.describe_model_package(
                ModelPackageName=execution_steps[0]["Metadata"]["RegisterModel"]["Arn"]
            )
            assert (
                response["ModelMetrics"]["Explainability"]["Report"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["Bias"]["PreTrainingConstraints"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["Explainability"]["Constraints"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["ModelQuality"]["Statistics"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["ModelDataQuality"]["Statistics"]["ContentType"]
                == "application/json"
            )
            assert response["CustomerMetadataProperties"] == customer_metadata_properties
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
"max_depth": "10", "eta": "0.2", "gamma": "1", "min_child_weight": "6", "silent": "0", "objective": "multi:softmax", "num_class": "15", "num_round": "1" # TEMP: Hack to make faster } xgb = XGBoost(entry_point=entry_point, source_dir=source_dir, output_path=model_output_path, code_location=model_code_location, hyperparameters=hyperparameters, train_instance_type="ml.m5.4xlarge", train_instance_count=1, framework_version="0.90-2", py_version="py3", role=sagemaker_execution_role, debugger_hook_config=debug_hook_config, rules=debug_rules) # Upload model code to s3 xgb.prepare_workflow_for_training(job_name) print('uploaded code to: {}'.format(xgb.uploaded_code.s3_prefix)) # Create Workflow steps execution_input = ExecutionInput(schema={ 'TrainLocation': str,