def test_training_step(sagemaker_session): estimator = Estimator( image_uri=IMAGE_URI, role=ROLE, instance_count=1, instance_type="c4.4xlarge", profiler_config=ProfilerConfig(system_monitor_interval_millis=500), rules=[], sagemaker_session=sagemaker_session, ) inputs = TrainingInput(f"s3://{BUCKET}/train_manifest") cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") step = TrainingStep(name="MyTrainingStep", estimator=estimator, inputs=inputs, cache_config=cache_config) assert step.to_request() == { "Name": "MyTrainingStep", "Type": "Training", "Arguments": { "AlgorithmSpecification": { "TrainingImage": IMAGE_URI, "TrainingInputMode": "File" }, "InputDataConfig": [{ "ChannelName": "training", "DataSource": { "S3DataSource": { "S3DataDistributionType": "FullyReplicated", "S3DataType": "S3Prefix", "S3Uri": f"s3://{BUCKET}/train_manifest", } }, }], "OutputDataConfig": { "S3OutputPath": f"s3://{BUCKET}/" }, "ResourceConfig": { "InstanceCount": 1, "InstanceType": "c4.4xlarge", "VolumeSizeInGB": 30, }, "RoleArn": ROLE, "StoppingCondition": { "MaxRuntimeInSeconds": 86400 }, "ProfilerConfig": { "ProfilingIntervalInMilliseconds": 500, "S3OutputPath": f"s3://{BUCKET}/", }, }, "CacheConfig": { "Enabled": True, "ExpireAfter": "PT1H" }, } assert step.properties.TrainingJobName.expr == { "Get": "Steps.MyTrainingStep.TrainingJobName" }
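# A hedged sketch, not part of the test above: the pattern these tests exercise is a
# single CacheConfig shared across steps, with each step serializing it under its own
# "CacheConfig" key. The image URI, role ARN and bucket below are placeholders.
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import CacheConfig, TrainingStep

shared_cache = CacheConfig(enable_caching=True, expire_after="PT1H")

estimator = Estimator(
    image_uri="<training-image-uri>",
    role="<execution-role-arn>",
    instance_count=1,
    instance_type="ml.c4.4xlarge",
)

train_step = TrainingStep(
    name="CachedTrainingStep",
    estimator=estimator,
    inputs=TrainingInput(s3_data="s3://<bucket>/train_manifest"),
    cache_config=shared_cache,
)

# The serialized step carries {"CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"}},
# which is exactly what the assertion above checks.
pipeline = Pipeline(name="CachedTrainingPipeline", steps=[train_step])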
def test_processing_step_normalizes_args_with_no_code(mock_normalize_args, script_processor): cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") inputs = [ ProcessingInput( source=f"s3://{BUCKET}/processing_manifest", destination="processing_manifest", ) ] outputs = [ ProcessingOutput( source=f"s3://{BUCKET}/processing_manifest", destination="processing_manifest", ) ] step = ProcessingStep( name="MyProcessingStep", processor=script_processor, inputs=inputs, outputs=outputs, job_arguments=["arg1", "arg2"], cache_config=cache_config, ) mock_normalize_args.return_value = [step.inputs, step.outputs] step.to_request() mock_normalize_args.assert_called_with( job_name=None, arguments=step.job_arguments, inputs=step.inputs, outputs=step.outputs, code=None, )
def test_processing_step(sagemaker_session): processing_input_data_uri_parameter = ParameterString( name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest" ) instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge") instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1) processor = Processor( image_uri=IMAGE_URI, role=ROLE, instance_count=instance_count_parameter, instance_type=instance_type_parameter, sagemaker_session=sagemaker_session, ) inputs = [ ProcessingInput( source=processing_input_data_uri_parameter, destination="processing_manifest", ) ] cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") step = ProcessingStep( name="MyProcessingStep", processor=processor, inputs=inputs, outputs=[], cache_config=cache_config, ) assert step.to_request() == { "Name": "MyProcessingStep", "Type": "Processing", "Arguments": { "AppSpecification": {"ImageUri": "fakeimage"}, "ProcessingInputs": [ { "InputName": "input-1", "AppManaged": False, "S3Input": { "LocalPath": "processing_manifest", "S3CompressionType": "None", "S3DataDistributionType": "FullyReplicated", "S3DataType": "S3Prefix", "S3InputMode": "File", "S3Uri": processing_input_data_uri_parameter, }, } ], "ProcessingResources": { "ClusterConfig": { "InstanceCount": instance_count_parameter, "InstanceType": instance_type_parameter, "VolumeSizeInGB": 30, } }, "RoleArn": "DummyRole", }, "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"}, } assert step.properties.ProcessingJobName.expr == { "Get": "Steps.MyProcessingStep.ProcessingJobName" }
def test_processing_step_normalizes_args(mock_normalize_args, sagemaker_session): processor = ScriptProcessor( role=ROLE, image_uri= "012345678901.dkr.ecr.us-west-2.amazonaws.com/my-custom-image-uri", command=["python3"], instance_type="ml.m4.xlarge", instance_count=1, volume_size_in_gb=100, volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key", output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key", max_runtime_in_seconds=3600, base_job_name="my_sklearn_processor", env={"my_env_variable": "my_env_variable_value"}, tags=[{ "Key": "my-tag", "Value": "my-tag-value" }], network_config=NetworkConfig( subnets=["my_subnet_id"], security_group_ids=["my_security_group_id"], enable_network_isolation=True, encrypt_inter_container_traffic=True, ), sagemaker_session=sagemaker_session, ) cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") inputs = [ ProcessingInput( source=f"s3://{BUCKET}/processing_manifest", destination="processing_manifest", ) ] outputs = [ ProcessingOutput( source=f"s3://{BUCKET}/processing_manifest", destination="processing_manifest", ) ] step = ProcessingStep( name="MyProcessingStep", processor=processor, code="foo.py", inputs=inputs, outputs=outputs, job_arguments=["arg1", "arg2"], cache_config=cache_config, ) mock_normalize_args.return_value = [step.inputs, step.outputs] step.to_request() mock_normalize_args.assert_called_with( arguments=step.job_arguments, inputs=step.inputs, outputs=step.outputs, code=step.code, )
def test_processing_step_with_processor(pipeline_session, processing_input): processor = Processor( image_uri=IMAGE_URI, role=sagemaker.get_execution_role(), instance_count=1, instance_type=INSTANCE_TYPE, sagemaker_session=pipeline_session, ) with warnings.catch_warnings(record=True) as w: step_args = processor.run(inputs=processing_input) assert len(w) == 1 assert issubclass(w[-1].category, UserWarning) assert "Running within a PipelineSession" in str(w[-1].message) cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") evaluation_report = PropertyFile( name="EvaluationReport", output_name="evaluation", path="evaluation.json" ) with warnings.catch_warnings(record=True) as w: step = ProcessingStep( name="MyProcessingStep", step_args=step_args, description="ProcessingStep description", display_name="MyProcessingStep", depends_on=["TestStep", "SecondTestStep"], cache_config=cache_config, property_files=[evaluation_report], ) assert len(w) == 0 pipeline = Pipeline( name="MyPipeline", steps=[step], sagemaker_session=pipeline_session, ) assert json.loads(pipeline.definition())["Steps"][0] == { "Name": "MyProcessingStep", "Description": "ProcessingStep description", "DisplayName": "MyProcessingStep", "Type": "Processing", "DependsOn": ["TestStep", "SecondTestStep"], "Arguments": step_args, "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"}, "PropertyFiles": [ { "FilePath": "evaluation.json", "OutputName": "evaluation", "PropertyFileName": "EvaluationReport", } ], } assert step.properties.ProcessingJobName.expr == { "Get": "Steps.MyProcessingStep.ProcessingJobName" }
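# A hedged sketch, not taken from the test module: the PipelineSession / step_args
# pattern shown above works the same way for training steps. Image URI, role ARN and
# bucket are placeholders.
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import CacheConfig, TrainingStep

pipeline_session = PipelineSession()

estimator = Estimator(
    image_uri="<training-image-uri>",
    role="<execution-role-arn>",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=pipeline_session,
)

# Under a PipelineSession, fit() does not start a job; it returns the step arguments.
step_args = estimator.fit(inputs=TrainingInput(s3_data="s3://<bucket>/train"))

train_step = TrainingStep(
    name="CachedTrainingStep",
    step_args=step_args,
    cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
)

pipeline = Pipeline(
    name="MyPipeline",
    steps=[train_step],
    sagemaker_session=pipeline_session,
)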
def test_add_depends_on(sagemaker_session): processing_input_data_uri_parameter = ParameterString( name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest") instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge") instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1) processor = Processor( image_uri=IMAGE_URI, role=ROLE, instance_count=instance_count_parameter, instance_type=instance_type_parameter, sagemaker_session=sagemaker_session, ) inputs = [ ProcessingInput( source=processing_input_data_uri_parameter, destination="processing_manifest", ) ] cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") step_1 = ProcessingStep( name="MyProcessingStep-1", processor=processor, inputs=inputs, outputs=[], cache_config=cache_config, ) step_2 = ProcessingStep( name="MyProcessingStep-2", depends_on=[step_1], processor=processor, inputs=inputs, outputs=[], cache_config=cache_config, ) step_3 = ProcessingStep( name="MyProcessingStep-3", depends_on=[step_1], processor=processor, inputs=inputs, outputs=[], cache_config=cache_config, ) step_3.add_depends_on([step_2.name]) assert "DependsOn" not in step_1.to_request() assert step_2.to_request()["DependsOn"] == ["MyProcessingStep-1"] assert step_3.to_request()["DependsOn"] == [ "MyProcessingStep-1", "MyProcessingStep-2" ]
def test_emr_step_with_one_step_config(sagemaker_session): emr_step_config = EMRStepConfig( jar="s3:/script-runner/script-runner.jar", args=["--arg_0", "arg_0_value"], main_class="com.my.main", properties=[{"Key": "Foo", "Value": "Foo_value"}, {"Key": "Bar", "Value": "Bar_value"}], ) emr_step = EMRStep( name="MyEMRStep", display_name="MyEMRStep", description="MyEMRStepDescription", cluster_id="MyClusterID", step_config=emr_step_config, depends_on=["TestStep"], cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"), ) emr_step.add_depends_on(["SecondTestStep"]) assert emr_step.to_request() == { "Name": "MyEMRStep", "Type": "EMR", "Arguments": { "ClusterId": "MyClusterID", "StepConfig": { "HadoopJarStep": { "Args": ["--arg_0", "arg_0_value"], "Jar": "s3:/script-runner/script-runner.jar", "MainClass": "com.my.main", "Properties": [ {"Key": "Foo", "Value": "Foo_value"}, {"Key": "Bar", "Value": "Bar_value"}, ], } }, }, "DependsOn": ["TestStep", "SecondTestStep"], "DisplayName": "MyEMRStep", "Description": "MyEMRStepDescription", "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"}, } assert emr_step.properties.ClusterId == "MyClusterID" assert emr_step.properties.ActionOnFailure.expr == {"Get": "Steps.MyEMRStep.ActionOnFailure"} assert emr_step.properties.Config.Args.expr == {"Get": "Steps.MyEMRStep.Config.Args"} assert emr_step.properties.Config.Jar.expr == {"Get": "Steps.MyEMRStep.Config.Jar"} assert emr_step.properties.Config.MainClass.expr == {"Get": "Steps.MyEMRStep.Config.MainClass"} assert emr_step.properties.Id.expr == {"Get": "Steps.MyEMRStep.Id"} assert emr_step.properties.Name.expr == {"Get": "Steps.MyEMRStep.Name"} assert emr_step.properties.Status.State.expr == {"Get": "Steps.MyEMRStep.Status.State"} assert emr_step.properties.Status.FailureDetails.Reason.expr == { "Get": "Steps.MyEMRStep.Status.FailureDetails.Reason" }
def test_transform_step(sagemaker_session): transformer = Transformer( model_name=MODEL_NAME, instance_count=1, instance_type="c4.4xlarge", sagemaker_session=sagemaker_session, ) inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest") cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") step = TransformStep( name="MyTransformStep", depends_on=["TestStep"], transformer=transformer, display_name="TransformStep", description="TestDescription", inputs=inputs, cache_config=cache_config, ) step.add_depends_on(["SecondTestStep"]) assert step.to_request() == { "Name": "MyTransformStep", "Type": "Transform", "Description": "TestDescription", "DisplayName": "TransformStep", "DependsOn": ["TestStep", "SecondTestStep"], "Arguments": { "ModelName": "gisele", "TransformInput": { "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": "s3://my-bucket/transform_manifest", } } }, "TransformOutput": { "S3OutputPath": None }, "TransformResources": { "InstanceCount": 1, "InstanceType": "c4.4xlarge", }, }, "CacheConfig": { "Enabled": True, "ExpireAfter": "PT1H" }, } assert step.properties.TransformJobName.expr == { "Get": "Steps.MyTransformStep.TransformJobName" }
def test_training_step_no_profiler_warning(sagemaker_session): estimator = TensorFlow( entry_point=DUMMY_SCRIPT_PATH, role=ROLE, model_dir=False, image_uri=IMAGE_URI, source_dir="s3://mybucket/source", framework_version="2.4.1", py_version="py37", disable_profiler=True, instance_count=1, instance_type="ml.p3.16xlarge", sagemaker_session=sagemaker_session, hyperparameters={ "batch-size": 500, "epochs": 5, }, debugger_hook_config=False, distribution={"smdistributed": { "dataparallel": { "enabled": True } }}, ) inputs = TrainingInput(s3_data=f"s3://{BUCKET}/train_manifest") cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") with warnings.catch_warnings(record=True) as w: # profiler disabled, cache config not None TrainingStep(name="MyTrainingStep", estimator=estimator, inputs=inputs, cache_config=cache_config) assert len(w) == 0 with warnings.catch_warnings(record=True) as w: # profiler enabled, cache config is None estimator.disable_profiler = False TrainingStep(name="MyTrainingStep", estimator=estimator, inputs=inputs, cache_config=None) assert len(w) == 0
def test_data_quality_check_step( sagemaker_session, check_job_config, model_package_group_name, supplied_baseline_statistics_uri, supplied_baseline_constraints_uri, ): data_quality_check_config = DataQualityCheckConfig( baseline_dataset=ParameterString(name="BaselineDataset"), dataset_format=DatasetFormat.csv(header=True), output_s3_uri="s3://...", record_preprocessor_script= "s3://my_bucket/data_quality/preprocessor.py", post_analytics_processor_script= "s3://my_bucket/data_quality/postprocessor.py", ) data_quality_check_step = QualityCheckStep( name="DataQualityCheckStep", skip_check=False, register_new_baseline=False, quality_check_config=data_quality_check_config, check_job_config=check_job_config, model_package_group_name=model_package_group_name, supplied_baseline_statistics=supplied_baseline_statistics_uri, supplied_baseline_constraints=supplied_baseline_constraints_uri, cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"), ) pipeline = Pipeline( name="MyPipeline", parameters=[ supplied_baseline_statistics_uri, supplied_baseline_constraints_uri, model_package_group_name, ], steps=[data_quality_check_step], sagemaker_session=sagemaker_session, ) step_definition = _get_step_definition_for_test( pipeline, ["baseline_dataset_input", "quality_check_output"]) assert step_definition == _expected_data_quality_dsl
def test_data_bias_check_step(sagemaker_session, check_job_config, model_package_group_name, bias_config): data_bias_data_config = DataConfig( s3_data_input_path=_S3_INPUT_PATH, s3_output_path=_S3_OUTPUT_PATH, s3_analysis_config_output_path=_S3_ANALYSIS_CONFIG_OUTPUT_PATH, label="fraud", dataset_type="text/csv", ) data_bias_check_config = DataBiasCheckConfig( data_config=data_bias_data_config, data_bias_config=bias_config, methods="all", kms_key="kms_key", ) data_bias_check_step = ClarifyCheckStep( name="DataBiasCheckStep", clarify_check_config=data_bias_check_config, check_job_config=check_job_config, skip_check=False, register_new_baseline=False, model_package_group_name=model_package_group_name, supplied_baseline_constraints="supplied_baseline_constraints", cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"), ) pipeline = Pipeline( name="MyPipeline", parameters=[model_package_group_name], steps=[data_bias_check_step], sagemaker_session=sagemaker_session, ) assert json.loads( pipeline.definition())["Steps"][0] == _expected_data_bias_dsl assert re.match( f"{_S3_ANALYSIS_CONFIG_OUTPUT_PATH}/{_BIAS_MONITORING_CFG_BASE_NAME}-configuration" + f"/{_BIAS_MONITORING_CFG_BASE_NAME}-config.*/.*/analysis_config.json", data_bias_check_config.monitoring_analysis_config_uri, )
def test_lambda_step(sagemaker_session): param = ParameterInteger(name="MyInt") output_param1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String) output_param2 = LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.Boolean) cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") lambda_step = LambdaStep( name="MyLambdaStep", depends_on=["TestStep"], lambda_func=Lambda( function_arn="arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda", session=sagemaker_session, ), display_name="MyLambdaStep", description="MyLambdaStepDescription", inputs={"arg1": "foo", "arg2": 5, "arg3": param}, outputs=[output_param1, output_param2], cache_config=cache_config, ) lambda_step.add_depends_on(["SecondTestStep"]) pipeline = Pipeline( name="MyPipeline", parameters=[param], steps=[lambda_step], sagemaker_session=sagemaker_session, ) assert json.loads(pipeline.definition())["Steps"][0] == { "Name": "MyLambdaStep", "Type": "Lambda", "DependsOn": ["TestStep", "SecondTestStep"], "DisplayName": "MyLambdaStep", "Description": "MyLambdaStepDescription", "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda", "OutputParameters": [ {"OutputName": "output1", "OutputType": "String"}, {"OutputName": "output2", "OutputType": "Boolean"}, ], "Arguments": {"arg1": "foo", "arg2": 5, "arg3": {"Get": "Parameters.MyInt"}}, "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"}, }
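# A hedged sketch, not part of the test above: each LambdaOutput declared on the step is
# addressable downstream through properties.Outputs["<output_name>"]. The condition and
# expected value below are illustrative, and `lambda_step` refers to the step built above.
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionEquals

check_step = ConditionStep(
    name="CheckLambdaResult",
    conditions=[
        # Compare the Lambda's "output1" string output against an expected value.
        ConditionEquals(left=lambda_step.properties.Outputs["output1"], right="ok"),
    ],
    if_steps=[],    # steps to run when the condition holds
    else_steps=[],  # steps to run otherwise
)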
def test_one_step_sparkjar_processing_pipeline( sagemaker_session, role, cpu_instance_type, pipeline_name, region_name, configuration, build_jar, ): instance_count = ParameterInteger(name="InstanceCount", default_value=2) cache_config = CacheConfig(enable_caching=True, expire_after="T30m") spark_path = os.path.join(DATA_DIR, "spark") spark_jar_processor = SparkJarProcessor( role=role, instance_count=2, instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, framework_version="2.4", ) bucket = spark_jar_processor.sagemaker_session.default_bucket() with open(os.path.join(spark_path, "files", "data.jsonl")) as data: body = data.read() input_data_uri = f"s3://{bucket}/spark/input/data.jsonl" S3Uploader.upload_string_as_file_body( body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session, ) output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}" java_project_dir = os.path.join(spark_path, "code", "java", "hello-java-spark") spark_run_args = spark_jar_processor.get_run_args( submit_app=f"{java_project_dir}/hello-spark-java.jar", submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp", arguments=["--input", input_data_uri, "--output", output_data_uri], configuration=configuration, ) step_pyspark = ProcessingStep( name="sparkjar-process", processor=spark_jar_processor, inputs=spark_run_args.inputs, outputs=spark_run_args.outputs, job_arguments=spark_run_args.arguments, code=spark_run_args.code, cache_config=cache_config, ) pipeline = Pipeline( name=pipeline_name, parameters=[instance_count], steps=[step_pyspark], sagemaker_session=sagemaker_session, ) try: # NOTE: We should exercise the case when role used in the pipeline execution is # different than that required of the steps in the pipeline itself. The role in # the pipeline definition needs to create training and processing jobs and other # sagemaker entities. However, the jobs created in the steps themselves execute # under a potentially different role, often requiring access to S3 and other # artifacts not required to during creation of the jobs in the pipeline steps. response = pipeline.create(role) create_arn = response["PipelineArn"] assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn, ) pipeline.parameters = [ ParameterInteger(name="InstanceCount", default_value=1) ] response = pipeline.update(role) update_arn = response["PipelineArn"] assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", update_arn, ) execution = pipeline.start(parameters={}) assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", execution.arn, ) response = execution.describe() assert response["PipelineArn"] == create_arn # Check CacheConfig response = json.loads( pipeline.describe() ["PipelineDefinition"])["Steps"][0]["CacheConfig"] assert response["Enabled"] == cache_config.enable_caching assert response["ExpireAfter"] == cache_config.expire_after try: execution.wait(delay=30, max_attempts=3) except WaiterError: pass execution_steps = execution.list_steps() assert len(execution_steps) == 1 assert execution_steps[0]["StepName"] == "sparkjar-process" finally: try: pipeline.delete() except Exception: pass
def test_one_step_pyspark_processing_pipeline( sagemaker_session, role, cpu_instance_type, pipeline_name, region_name, ): instance_count = ParameterInteger(name="InstanceCount", default_value=2) script_path = os.path.join(DATA_DIR, "dummy_script.py") cache_config = CacheConfig(enable_caching=True, expire_after="T30m") pyspark_processor = PySparkProcessor( base_job_name="sm-spark", framework_version="2.4", role=role, instance_count=instance_count, instance_type=cpu_instance_type, max_runtime_in_seconds=1200, sagemaker_session=sagemaker_session, ) spark_run_args = pyspark_processor.get_run_args( submit_app=script_path, arguments=[ "--s3_input_bucket", sagemaker_session.default_bucket(), "--s3_input_key_prefix", "spark-input", "--s3_output_bucket", sagemaker_session.default_bucket(), "--s3_output_key_prefix", "spark-output", ], ) step_pyspark = ProcessingStep( name="pyspark-process", processor=pyspark_processor, inputs=spark_run_args.inputs, outputs=spark_run_args.outputs, job_arguments=spark_run_args.arguments, code=spark_run_args.code, cache_config=cache_config, ) pipeline = Pipeline( name=pipeline_name, parameters=[instance_count], steps=[step_pyspark], sagemaker_session=sagemaker_session, ) try: # NOTE: We should exercise the case when role used in the pipeline execution is # different than that required of the steps in the pipeline itself. The role in # the pipeline definition needs to create training and processing jobs and other # sagemaker entities. However, the jobs created in the steps themselves execute # under a potentially different role, often requiring access to S3 and other # artifacts not required to during creation of the jobs in the pipeline steps. response = pipeline.create(role) create_arn = response["PipelineArn"] assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn, ) pipeline.parameters = [ ParameterInteger(name="InstanceCount", default_value=1) ] response = pipeline.update(role) update_arn = response["PipelineArn"] assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", update_arn, ) execution = pipeline.start(parameters={}) assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", execution.arn, ) response = execution.describe() assert response["PipelineArn"] == create_arn # Check CacheConfig response = json.loads( pipeline.describe() ["PipelineDefinition"])["Steps"][0]["CacheConfig"] assert response["Enabled"] == cache_config.enable_caching assert response["ExpireAfter"] == cache_config.expire_after try: execution.wait(delay=30, max_attempts=3) except WaiterError: pass execution_steps = execution.list_steps() assert len(execution_steps) == 1 assert execution_steps[0]["StepName"] == "pyspark-process" finally: try: pipeline.delete() except Exception: pass
def test_one_step_framework_processing_pipeline( sagemaker_session, role, sklearn_latest_version, cpu_instance_type, pipeline_name, region_name, athena_dataset_definition, ): """Use `SKLearnProcessor` to test `FrameworkProcessor`.""" instance_count = ParameterInteger(name="InstanceCount", default_value=2) script_path = os.path.join(DATA_DIR, "dummy_script.py") input_file_path = os.path.join(DATA_DIR, "dummy_input.txt") inputs = [ ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/"), ProcessingInput(dataset_definition=athena_dataset_definition), ] cache_config = CacheConfig(enable_caching=True, expire_after="T30m") sklearn_processor = SKLearnProcessor( framework_version=sklearn_latest_version, role=role, instance_type=cpu_instance_type, instance_count=instance_count, sagemaker_session=sagemaker_session, base_job_name="test-sklearn", ) run_args = sklearn_processor.get_run_args(code=script_path, inputs=inputs) step_sklearn = ProcessingStep( name="sklearn-process", processor=sklearn_processor, inputs=run_args.inputs, outputs=run_args.outputs, job_arguments=run_args.arguments, code=run_args.code, cache_config=cache_config, ) pipeline = Pipeline( name=pipeline_name, parameters=[instance_count], steps=[step_sklearn], sagemaker_session=sagemaker_session, ) try: # NOTE: We should exercise the case when role used in the pipeline execution is # different than that required of the steps in the pipeline itself. The role in # the pipeline definition needs to create training and processing jobs and other # sagemaker entities. However, the jobs created in the steps themselves execute # under a potentially different role, often requiring access to S3 and other # artifacts not required to during creation of the jobs in the pipeline steps. response = pipeline.create(role) create_arn = response["PipelineArn"] assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn, ) pipeline.parameters = [ ParameterInteger(name="InstanceCount", default_value=1) ] response = pipeline.update(role) update_arn = response["PipelineArn"] assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", update_arn, ) execution = pipeline.start(parameters={}) assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", execution.arn, ) response = execution.describe() assert response["PipelineArn"] == create_arn # Check CacheConfig response = json.loads( pipeline.describe() ["PipelineDefinition"])["Steps"][0]["CacheConfig"] assert response["Enabled"] == cache_config.enable_caching assert response["ExpireAfter"] == cache_config.expire_after try: execution.wait(delay=30, max_attempts=3) except WaiterError: pass execution_steps = execution.list_steps() assert len(execution_steps) == 1 assert execution_steps[0]["StepName"] == "sklearn-process" finally: try: pipeline.delete() except Exception: pass
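# The expire_after values used across these examples ("PT1H" in the unit tests, "T30m" in
# the integration tests above, "30d" further below) are ISO 8601-style duration strings.
# A minimal illustrative sketch, not taken from the tests themselves:
from sagemaker.workflow.steps import CacheConfig

cache_one_hour = CacheConfig(enable_caching=True, expire_after="PT1H")
cache_half_hour = CacheConfig(enable_caching=True, expire_after="T30m")
cache_one_month = CacheConfig(enable_caching=True, expire_after="30d")

# Caching can also be declared but switched off, e.g. to toggle it per environment.
cache_disabled = CacheConfig(enable_caching=False, expire_after="PT1H")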
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="AbalonePackageGroup",
    pipeline_name="AbalonePipeline",
    base_job_prefix="Abalone",
):
    """Gets a SageMaker ML Pipeline instance working on credit data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Create cache configuration
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    # Create SKLearn processor object
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=role,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name="credit-processing-job",
    )

    # Use the sklearn_processor in a Sagemaker pipelines ProcessingStep
    step_preprocess_data = ProcessingStep(
        name="PreprocessCreditData",
        processor=sklearn_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/output/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/output/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/output/test"),
            ProcessingOutput(output_name="baseline_with_headers", source="/opt/ml/processing/output/baseline"),
        ],
        code=os.path.join(BASE_DIR, "preprocessing.py"),
    )

    # Where to store the trained model
    model_path = f"s3://{default_bucket}/CreditTrain"

    # Fetch container to use for training
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.2-2",
        py_version="py3",
        instance_type=training_instance_type,
    )

    # Create XGBoost estimator object
    xgb_estimator = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        disable_profiler=True,
    )

    # Specify hyperparameters
    xgb_estimator.set_hyperparameters(
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.8,
        objective='binary:logistic',
        num_round=25,
    )

    # Use the xgb_estimator in a Sagemaker pipelines TrainingStep.
    # NOTE how the input to the training job directly references the output of the previous step.
    step_train_model = TrainingStep(
        name="TrainCreditModel",
        estimator=xgb_estimator,
        cache_config=cache_config,
        inputs={
            "train": TrainingInput(
                s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # Create ScriptProcessor object.
    evaluate_model_processor = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name="script-credit-eval",
        role=role,
    )

    # Create a PropertyFile
    # We use a PropertyFile to be able to reference outputs from a processing step,
    # for instance to use in a condition step, which we'll see later on.
    # For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html
    evaluation_report = PropertyFile(
        name="EvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )

    # Use the evaluate_model_processor in a Sagemaker pipelines ProcessingStep.
    step_evaluate_model = ProcessingStep(
        name="EvaluateCreditModel",
        processor=evaluate_model_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(
                source=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluation.py"),
        property_files=[evaluation_report],
    )

    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_evaluate_model.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )

    # Create a RegisterModel step, which registers your model with Sagemaker Model Registry.
    step_register_model = RegisterModel(
        name="RegisterCreditModel",
        estimator=xgb_estimator,
        model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # Create Processor object using the model monitor image
    baseline_processor = sagemaker.processing.Processor(
        base_job_name="credit-risk-baseline-processor",
        image_uri=sagemaker.image_uris.retrieve(framework='model-monitor', region=region),
        role=role,
        instance_count=1,
        instance_type=processing_instance_type,
        env={
            "dataset_format": "{\"csv\": {\"header\": true} }",
            "dataset_source": "/opt/ml/processing/sm_input",
            "output_path": "/opt/ml/processing/sm_output",
            "publish_cloudwatch_metrics": "Disabled",
        },
    )

    # Create a Sagemaker Pipeline step, using the baseline_processor.
    step_create_data_baseline = ProcessingStep(
        name="CreateModelQualityBaseline",
        processor=baseline_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(
                source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "baseline_with_headers"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/sm_input",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/sm_output",
                destination="s3://{}/{}/baseline".format(default_bucket, base_job_prefix),
                output_name="baseline_result",
            )
        ],
    )

    # Create Condition
    cond_gte = ConditionGreaterThanOrEqualTo(
        left=JsonGet(
            step=step_evaluate_model,
            property_file=evaluation_report,
            json_path="binary_classification_metrics.accuracy.value",
        ),
        right=0.7,
    )

    # Create a Sagemaker Pipelines ConditionStep, using the condition we just created.
    step_cond = ConditionStep(
        name="AccuracyCondition",
        conditions=[cond_gte],
        if_steps=[step_register_model],
        else_steps=[],
    )

    from sagemaker.workflow.pipeline import Pipeline

    # Create a Sagemaker Pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[
            step_preprocess_data,
            step_train_model,
            step_evaluate_model,
            step_create_data_baseline,
            step_cond,
        ],
    )

    return pipeline
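# A hedged usage sketch, not part of get_pipeline above; region, role ARN and bucket are
# placeholders. upsert() registers or updates the pipeline, start() launches an execution.
credit_pipeline = get_pipeline(
    region="<aws-region>",
    role="<execution-role-arn>",
    default_bucket="<artifact-bucket>",
)
credit_pipeline.upsert(role_arn="<execution-role-arn>")
execution = credit_pipeline.start()
execution.wait()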
def test_training_step_tensorflow(sagemaker_session): instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.p3.16xlarge") instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1) data_source_uri_parameter = ParameterString( name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest") training_epochs_parameter = ParameterInteger(name="TrainingEpochs", default_value=5) training_batch_size_parameter = ParameterInteger(name="TrainingBatchSize", default_value=500) estimator = TensorFlow( entry_point=os.path.join(DATA_DIR, SCRIPT_FILE), role=ROLE, model_dir=False, image_uri=IMAGE_URI, source_dir="s3://mybucket/source", framework_version="2.4.1", py_version="py37", instance_count=instance_count_parameter, instance_type=instance_type_parameter, sagemaker_session=sagemaker_session, # subnets=subnets, hyperparameters={ "batch-size": training_batch_size_parameter, "epochs": training_epochs_parameter, }, # security_group_ids=security_group_ids, debugger_hook_config=False, # Training using SMDataParallel Distributed Training Framework distribution={"smdistributed": { "dataparallel": { "enabled": True } }}, ) inputs = TrainingInput(s3_data=data_source_uri_parameter) cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") step = TrainingStep(name="MyTrainingStep", estimator=estimator, inputs=inputs, cache_config=cache_config) step_request = step.to_request() step_request["Arguments"]["HyperParameters"].pop("sagemaker_job_name", None) step_request["Arguments"]["HyperParameters"].pop("sagemaker_program", None) step_request["Arguments"].pop("ProfilerRuleConfigurations", None) assert step_request == { "Name": "MyTrainingStep", "Type": "Training", "Arguments": { "AlgorithmSpecification": { "TrainingInputMode": "File", "TrainingImage": "fakeimage", "EnableSageMakerMetricsTimeSeries": True, }, "OutputDataConfig": { "S3OutputPath": "s3://my-bucket/" }, "StoppingCondition": { "MaxRuntimeInSeconds": 86400 }, "ResourceConfig": { "InstanceCount": instance_count_parameter, "InstanceType": instance_type_parameter, "VolumeSizeInGB": 30, }, "RoleArn": "DummyRole", "InputDataConfig": [{ "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": data_source_uri_parameter, "S3DataDistributionType": "FullyReplicated", } }, "ChannelName": "training", }], "HyperParameters": { "batch-size": training_batch_size_parameter, "epochs": training_epochs_parameter, "sagemaker_submit_directory": '"s3://mybucket/source"', "sagemaker_container_log_level": "20", "sagemaker_region": '"us-west-2"', "sagemaker_distributed_dataparallel_enabled": "true", "sagemaker_instance_type": instance_type_parameter, "sagemaker_distributed_dataparallel_custom_mpi_options": '""', }, "ProfilerConfig": { "S3OutputPath": "s3://my-bucket/" }, }, "CacheConfig": { "Enabled": True, "ExpireAfter": "PT1H" }, } assert step.properties.TrainingJobName.expr == { "Get": "Steps.MyTrainingStep.TrainingJobName" }
def test_training_step_base_estimator(sagemaker_session): instance_type_parameter = ParameterString(name="InstanceType", default_value="c4.4xlarge") instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1) data_source_uri_parameter = ParameterString( name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest") training_epochs_parameter = ParameterInteger(name="TrainingEpochs", default_value=5) training_batch_size_parameter = ParameterInteger(name="TrainingBatchSize", default_value=500) estimator = Estimator( image_uri=IMAGE_URI, role=ROLE, instance_count=instance_count_parameter, instance_type=instance_type_parameter, profiler_config=ProfilerConfig(system_monitor_interval_millis=500), hyperparameters={ "batch-size": training_batch_size_parameter, "epochs": training_epochs_parameter, }, rules=[], sagemaker_session=sagemaker_session, ) inputs = TrainingInput(s3_data=data_source_uri_parameter) cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") step = TrainingStep( name="MyTrainingStep", depends_on=["TestStep"], estimator=estimator, inputs=inputs, cache_config=cache_config, ) step.add_depends_on(["AnotherTestStep"]) assert step.to_request() == { "Name": "MyTrainingStep", "Type": "Training", "DependsOn": ["TestStep", "AnotherTestStep"], "Arguments": { "AlgorithmSpecification": { "TrainingImage": IMAGE_URI, "TrainingInputMode": "File" }, "HyperParameters": { "batch-size": training_batch_size_parameter, "epochs": training_epochs_parameter, }, "InputDataConfig": [{ "ChannelName": "training", "DataSource": { "S3DataSource": { "S3DataDistributionType": "FullyReplicated", "S3DataType": "S3Prefix", "S3Uri": data_source_uri_parameter, } }, }], "OutputDataConfig": { "S3OutputPath": f"s3://{BUCKET}/" }, "ResourceConfig": { "InstanceCount": instance_count_parameter, "InstanceType": instance_type_parameter, "VolumeSizeInGB": 30, }, "RoleArn": ROLE, "StoppingCondition": { "MaxRuntimeInSeconds": 86400 }, "ProfilerConfig": { "ProfilingIntervalInMilliseconds": 500, "S3OutputPath": f"s3://{BUCKET}/", }, }, "CacheConfig": { "Enabled": True, "ExpireAfter": "PT1H" }, } assert step.properties.TrainingJobName.expr == { "Get": "Steps.MyTrainingStep.TrainingJobName" }
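# A hedged sketch, not part of the test above: the Parameter objects referenced by the
# step are also declared on the Pipeline (as the pipeline-level tests in this collection
# do), so that the "Get": "Parameters.<Name>" references resolve and the defaults can be
# overridden at execution time. The variable names refer to those defined in the test.
from sagemaker.workflow.pipeline import Pipeline

pipeline = Pipeline(
    name="MyTrainingPipeline",
    parameters=[
        instance_type_parameter,
        instance_count_parameter,
        data_source_uri_parameter,
        training_epochs_parameter,
        training_batch_size_parameter,
    ],
    steps=[step],
)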
def read_conf(cfg_file):
    """Reads the config file and returns an SGConf with workflow parameters"""
    # FIXME: refactor! the function is ugly, instead we can set the
    # three names from one present in the .ini file. That would
    # likely need to have some snake case to PascalCase and back
    # conversion hacks
    sg_cfg = SGConf()

    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(open(cfg_file))

    region = boto3.Session().region_name

    sg_cfg.bucket = config['metadata'].get('bucket')
    sg_cfg.pipeline_name = config['metadata'].get('pipeline_name')

    # fetch the execution role and account id from Secrets Manager
    session = boto3.session.Session()
    client_secrets_fetch = session.client(service_name='secretsmanager',
                                          region_name=region).get_secret_value
    secret_role = config['secretsmanager'].get('secret_role')
    secret_account_id = config['secretsmanager'].get('secret_account_id')
    sg_cfg.role = client_secrets_fetch(SecretId=secret_role)['SecretString']
    account_id = client_secrets_fetch(
        SecretId=secret_account_id)['SecretString']

    # will reuse the same cache config for all the steps
    sg_cfg.cache_config = CacheConfig(
        enable_caching=config['metadata'].getboolean('cache_steps'),
        expire_after=config['metadata'].get('cache_expire_after'))

    # FIXME: resolve with pathlib!
    sg_cfg.source_dir = config['metadata'].get('source_dir')

    # start defining workflow parameters for sagemaker pipeline steps
    # first off, processing steps
    sg_cfg.input_data = ParameterString(
        name='InputData',
        default_value=config['processing'].get('input_data'))
    sg_cfg.processing_instance_count = ParameterInteger(
        name='ProcessingInstanceCount',
        default_value=config['processing'].getint('instance_count'))
    sg_cfg.processing_instance_type = ParameterString(
        name='ProcessingInstanceType',
        default_value=config['processing'].get('instance_type'))
    sg_cfg.processing_train_test_split = ParameterFloat(
        name='TrainTestSplit',
        default_value=config['processing'].getfloat(
            'train_test_split_fraction'))
    sg_cfg.processing_turicreate_uri = ParameterString(
        name='TuriCreateProcessingURI',
        default_value=config['processing'].get('image_uri_fmt').format(
            account_id))

    # control settings for the training job
    sg_cfg.training_instance_count = ParameterInteger(
        name='TrainingInstanceCount',
        default_value=config['training'].getint('instance_count'))
    sg_cfg.training_instance_type = ParameterString(
        name='TrainingInstanceType',
        default_value=config['training'].get('instance_type'))
    sg_cfg.training_batch_size = ParameterInteger(
        name='TrainingBatchSize',
        default_value=config['training'].getint('batch_size'))
    sg_cfg.training_max_iterations = ParameterInteger(
        name='MaxIterations',
        default_value=config['training'].getint('max_iterations'))
    sg_cfg.training_turicreate_uri = ParameterString(
        name='TuriCreateTrainingURI',
        default_value=config['training'].get('image_uri_fmt').format(
            account_id))

    # settings for model card creation
    sg_cfg.summarizing_instance_count = ParameterInteger(
        name='SummarizingInstanceCount',
        default_value=config['summarizing'].getint('instance_count'))
    sg_cfg.summarizing_instance_type = ParameterString(
        name='SummarizingInstanceType',
        default_value=config['summarizing'].get('instance_type'))
    sg_cfg.summarizing_turicreate_uri = ParameterString(
        name='TuriCreateSummarizingURI',
        default_value=config['summarizing'].get('image_uri_fmt').format(
            account_id))

    # workflow parameters for model approval / rejection
    sg_cfg.model_approval_status = ParameterString(
        name='ModelApprovalStatus',
        default_value=config['evaluation'].get('approval_status'))
    sg_cfg.model_approval_map_threshold = ParameterFloat(
        name='ModelApprovalmAPThreshold',
        default_value=config['evaluation'].getfloat('approval_map_threshold'))

    sg_cfg.model_package_group_name = config['metadata'].get(
        'model_package_group_name')

    return sg_cfg
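# A hedged usage sketch (the .ini path, step name and processor below are illustrative,
# not from the original module): read_conf returns an SGConf whose cache_config is shared
# by every step and whose Parameter objects are passed to both the steps and the Pipeline.
from sagemaker.processing import ProcessingInput, Processor
from sagemaker.workflow.steps import ProcessingStep

sg_cfg = read_conf("pipeline.ini")

processor = Processor(
    image_uri=sg_cfg.processing_turicreate_uri,
    role=sg_cfg.role,
    instance_count=sg_cfg.processing_instance_count,
    instance_type=sg_cfg.processing_instance_type,
)

step_process = ProcessingStep(
    name="ProcessData",
    processor=processor,
    inputs=[ProcessingInput(source=sg_cfg.input_data,
                            destination="/opt/ml/processing/input")],
    cache_config=sg_cfg.cache_config,  # the same cache settings are reused on every step
)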
def get_pipeline( region, sagemaker_project_arn=None, role=None, default_bucket=None, model_package_group_name="restatePackageGroup", # Choose any name pipeline_name="restate-p-XXXXXXXXX", # You can find your pipeline name in the Studio UI (project -> Pipelines -> name) base_job_prefix="restate", # Choose any name ): """Gets a SageMaker ML Pipeline instance working with on RE data. Args: region: AWS region to create and run the pipeline. role: IAM role to create and run steps and pipeline. default_bucket: the bucket to use for storing the artifacts Returns: an instance of a pipeline """ sagemaker_session = get_session(region, default_bucket) if role is None: role = sagemaker.session.get_execution_role(sagemaker_session) # Parameters for pipeline execution processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString( name="ProcessingInstanceType", default_value="ml.m5.2xlarge" ) training_instance_type = ParameterString( name="TrainingInstanceType", default_value="ml.m5.xlarge" ) model_approval_status = ParameterString( name="ModelApprovalStatus", default_value="PendingManualApproval", # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval. ) input_data = ParameterString( name="InputDataUrl", default_value=f"", # Change this to point to the s3 location of your raw input data. ) data_sources = [] # Sagemaker session sess = sagemaker_session # You can configure this with your own bucket name, e.g. # bucket = "my-bucket" bucket = sess.default_bucket() data_sources.append( ProcessingInput( input_name="restate-california", dataset_definition=DatasetDefinition( local_path="/opt/ml/processing/restate-california", data_distribution_type="FullyReplicated", # You can override below to point to other database or use different queries athena_dataset_definition=AthenaDatasetDefinition( catalog="AwsDataCatalog", database="restate", query_string="SELECT * FROM restate.california_10", output_s3_uri=f"s3://{bucket}/athena/", output_format="PARQUET", ), ), ) ) print(f"Data Wrangler export storage bucket: {bucket}") # unique flow export ID flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}" flow_export_name = f"flow-{flow_export_id}" # Output name is auto-generated from the select node's ID + output name from the flow file. 
output_name = "99ae1ec3-dd5f-453c-bfae-721dac423cd7.default" s3_output_prefix = f"export-{flow_export_name}/output" s3_output_path = f"s3://{bucket}/{s3_output_prefix}" print(f"Flow S3 export result path: {s3_output_path}") processing_job_output = ProcessingOutput( output_name=output_name, source="/opt/ml/processing/output", destination=s3_output_path, s3_upload_mode="EndOfJob", ) # name of the flow file which should exist in the current notebook working directory flow_file_name = "sagemaker-pipeline/restate-athena-california.flow" # Load .flow file from current notebook working directory #!echo "Loading flow file from current notebook working directory: $PWD" with open(flow_file_name) as f: flow = json.load(f) # Upload flow to S3 s3_client = boto3.client("s3") s3_client.upload_file( flow_file_name, bucket, f"data_wrangler_flows/{flow_export_name}.flow", ExtraArgs={"ServerSideEncryption": "aws:kms"}, ) flow_s3_uri = f"s3://{bucket}/data_wrangler_flows/{flow_export_name}.flow" print(f"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}") ## Input - Flow: restate-athena-russia.flow flow_input = ProcessingInput( source=flow_s3_uri, destination="/opt/ml/processing/flow", input_name="flow", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type="FullyReplicated", ) # IAM role for executing the processing job. iam_role = role # Unique processing job name. Give a unique name every time you re-execute processing jobs processing_job_name = f"data-wrangler-flow-processing-{flow_export_id}" # Data Wrangler Container URL. container_uri = sagemaker.image_uris.retrieve( framework="data-wrangler", # we are using the Sagemaker built in xgboost algorithm region=region, ) # Processing Job Instance count and instance type. instance_count = 2 instance_type = "ml.m5.4xlarge" # Size in GB of the EBS volume to use for storing data during processing volume_size_in_gb = 30 # Content type for each output. Data Wrangler supports CSV as default and Parquet. 
output_content_type = "CSV" # Network Isolation mode; default is off enable_network_isolation = False # List of tags to be passed to the processing job user_tags = [] # Output configuration used as processing job container arguments output_config = {output_name: {"content_type": output_content_type}} # KMS key for per object encryption; default is None kms_key = None processor = Processor( role=iam_role, image_uri=container_uri, instance_count=instance_count, instance_type=instance_type, volume_size_in_gb=volume_size_in_gb, network_config=NetworkConfig(enable_network_isolation=enable_network_isolation), sagemaker_session=sess, output_kms_key=kms_key, tags=user_tags, ) data_wrangler_step = ProcessingStep( name="DataWranglerProcess", processor=processor, inputs=[flow_input] + data_sources, outputs=[processing_job_output], job_arguments=[f"--output-config '{json.dumps(output_config)}'"], ) # Processing step for feature engineering # this processor does not have awswrangler installed sklearn_processor = SKLearnProcessor( framework_version="0.23-1", instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name=f"{base_job_prefix}/sklearn-restate-preprocess", # choose any name sagemaker_session=sagemaker_session, role=role, ) step_process = ProcessingStep( name="Preprocess", # choose any name processor=sklearn_processor, inputs=[ ProcessingInput( source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ output_name ].S3Output.S3Uri, destination="/opt/ml/processing/data/raw-data-dir", ) ], outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), ], code=os.path.join(BASE_DIR, "preprocess.py"), job_arguments=[ "--input-data", data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ output_name ].S3Output.S3Uri, ], ) # Training step for generating model artifacts model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain" model_bucket_key = f"{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain" cache_config = CacheConfig(enable_caching=True, expire_after="30d") xgb_image_uri = sagemaker.image_uris.retrieve( framework="xgboost", # we are using the Sagemaker built in xgboost algorithm region=region, version="1.0-1", py_version="py3", instance_type=training_instance_type, ) xgb_train = Estimator( image_uri=xgb_image_uri, instance_type=training_instance_type, instance_count=1, output_path=model_path, base_job_name=f"{base_job_prefix}/restate-xgb-train", sagemaker_session=sagemaker_session, role=role, ) xgb_train.set_hyperparameters( # #objective="binary:logistic", # objective="reg:linear", num_round=50, # max_depth=5, # eta=0.2, # gamma=4, # min_child_weight=6, # subsample=0.7, # silent=0, ) xgb_train.set_hyperparameters(grow_policy="lossguide") xgb_objective_metric_name = "validation:mse" xgb_hyperparameter_ranges = { "max_depth": IntegerParameter(2, 10, scaling_type="Linear"), } xgb_tuner_log = HyperparameterTuner( xgb_train, xgb_objective_metric_name, xgb_hyperparameter_ranges, max_jobs=3, max_parallel_jobs=3, strategy="Random", objective_type="Minimize", ) xgb_step_tuning = TuningStep( name="XGBHPTune", tuner=xgb_tuner_log, inputs={ "train": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( 
s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, content_type="text/csv", ), }, cache_config=cache_config, ) # dtree_image_uri = '625467769535.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-decision-tree:latest' dtree_image_uri = sagemaker_session.sagemaker_client.describe_image_version( ImageName="restate-dtree" )["ContainerImage"] dtree_train = Estimator( image_uri=dtree_image_uri, role=role, instance_count=1, instance_type=training_instance_type, base_job_name=f"{base_job_prefix}/restate-dtree-train", output_path=model_path, sagemaker_session=sagemaker_session, ) dtree_objective_metric_name = "validation:mse" dtree_metric_definitions = [{"Name": "validation:mse", "Regex": "mse:(\S+)"}] dtree_hyperparameter_ranges = { "max_depth": IntegerParameter(10, 50, scaling_type="Linear"), "max_leaf_nodes": IntegerParameter(2, 12, scaling_type="Linear"), } dtree_tuner_log = HyperparameterTuner( dtree_train, dtree_objective_metric_name, dtree_hyperparameter_ranges, dtree_metric_definitions, max_jobs=3, max_parallel_jobs=3, strategy="Random", objective_type="Minimize", ) dtree_step_tuning = TuningStep( name="DTreeHPTune", tuner=dtree_tuner_log, inputs={ "training": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, content_type="text/csv", ), }, cache_config=cache_config, ) dtree_script_eval = ScriptProcessor( image_uri=dtree_image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-dtree-eval", sagemaker_session=sagemaker_session, role=role, ) dtree_evaluation_report = PropertyFile( name="EvaluationReportDTree", output_name="dtree_evaluation", path="dtree_evaluation.json", ) dtree_step_eval = ProcessingStep( name="DTreeEval", processor=dtree_script_eval, inputs=[ ProcessingInput( # source=dtree_step_train.properties.ModelArtifacts.S3ModelArtifacts, source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput( output_name="dtree_evaluation", source="/opt/ml/processing/evaluation" ), ], code=os.path.join(BASE_DIR, "dtree_evaluate.py"), property_files=[dtree_evaluation_report], ) xgb_script_eval = ScriptProcessor( image_uri=xgb_image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-xgb-eval", sagemaker_session=sagemaker_session, role=role, ) xgb_evaluation_report = PropertyFile( name="EvaluationReportXGBoost", output_name="xgb_evaluation", path="xgb_evaluation.json", ) xgb_step_eval = ProcessingStep( name="XGBEval", processor=xgb_script_eval, inputs=[ ProcessingInput( source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput(output_name="xgb_evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(BASE_DIR, "xgb_evaluate.py"), property_files=[xgb_evaluation_report], ) xgb_model_metrics = ModelMetrics( 
model_statistics=MetricsSource( s3_uri="{}/xgb_evaluation.json".format( xgb_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] ), content_type="application/json", ) ) dtree_model_metrics = ModelMetrics( model_statistics=MetricsSource( s3_uri="{}/dtree_evaluation.json".format( dtree_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][ "S3Uri" ] ), content_type="application/json", ) ) xgb_eval_metrics = JsonGet( step=xgb_step_eval, property_file=xgb_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ) dtree_eval_metrics = JsonGet( step=dtree_step_eval, property_file=dtree_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ) # Register model step that will be conditionally executed dtree_step_register = RegisterModel( name="DTreeReg", estimator=dtree_train, model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=dtree_model_metrics, ) # Register model step that will be conditionally executed xgb_step_register = RegisterModel( name="XGBReg", estimator=xgb_train, model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=xgb_model_metrics, ) # Condition step for evaluating model quality and branching execution cond_lte = ConditionGreaterThanOrEqualTo( # You can change the condition here left=JsonGet( step=dtree_step_eval, property_file=dtree_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ), right=JsonGet( step=xgb_step_eval, property_file=xgb_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ), # You can change the threshold here ) step_cond = ConditionStep( name="AccuracyCond", conditions=[cond_lte], if_steps=[dtree_step_register], else_steps=[xgb_step_register], ) create_date = time.strftime("%Y-%m-%d-%H-%M-%S") # Pipeline instance pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, input_data ], pipeline_experiment_config=PipelineExperimentConfig( pipeline_name + "-" + create_date, "restate-{}".format(create_date) ), steps=[ data_wrangler_step, step_process, dtree_step_tuning, xgb_step_tuning, dtree_step_eval, xgb_step_eval, step_cond, ], sagemaker_session=sagemaker_session, ) return pipeline
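# A hedged sketch, not from the original module: after building the restate pipeline, the
# serialized definition can be checked for the CacheConfig attached to the cached steps,
# mirroring what the integration tests above assert. Region, role and bucket are
# placeholders.
import json

restate_pipeline = get_pipeline(
    region="<aws-region>",
    role="<execution-role-arn>",
    default_bucket="<artifact-bucket>",
)

definition = json.loads(restate_pipeline.definition())
for step in definition["Steps"]:
    if "CacheConfig" in step:
        # Expect {"Enabled": True, "ExpireAfter": "30d"} on the tuning steps.
        print(step["Name"], step["CacheConfig"])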