def test_processing_step_with_processor_and_step_args(pipeline_session, processing_input):
    processor = Processor(
        image_uri=IMAGE_URI,
        role=sagemaker.get_execution_role(),
        instance_count=1,
        instance_type=INSTANCE_TYPE,
        sagemaker_session=pipeline_session,
    )
    step_args = processor.run(inputs=processing_input)

    try:
        ProcessingStep(
            name="MyProcessingStep",
            step_args=step_args,
            processor=processor,
        )
        assert False
    except Exception as e:
        assert isinstance(e, ValueError)

    try:
        ProcessingStep(
            name="MyProcessingStep",
        )
        assert False
    except Exception as e:
        assert isinstance(e, ValueError)
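# The try/except + assert False pattern above can be written more idiomatically with
# pytest.raises; a minimal sketch of the same two negative cases, assuming the same
# fixtures and module-level constants (IMAGE_URI, INSTANCE_TYPE) are available:
import pytest

def test_step_args_and_processor_are_mutually_exclusive(pipeline_session, processing_input):
    processor = Processor(
        image_uri=IMAGE_URI,
        role=sagemaker.get_execution_role(),
        instance_count=1,
        instance_type=INSTANCE_TYPE,
        sagemaker_session=pipeline_session,
    )
    step_args = processor.run(inputs=processing_input)

    # Supplying both step_args and processor is rejected ...
    with pytest.raises(ValueError):
        ProcessingStep(name="MyProcessingStep", step_args=step_args, processor=processor)
    # ... and so is supplying neither.
    with pytest.raises(ValueError):
        ProcessingStep(name="MyProcessingStep")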
def test_processing_step(sagemaker_session): processing_input_data_uri_parameter = ParameterString( name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest" ) instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge") instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1) processor = Processor( image_uri=IMAGE_URI, role=ROLE, instance_count=instance_count_parameter, instance_type=instance_type_parameter, sagemaker_session=sagemaker_session, ) inputs = [ ProcessingInput( source=processing_input_data_uri_parameter, destination="processing_manifest", ) ] cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") step = ProcessingStep( name="MyProcessingStep", processor=processor, inputs=inputs, outputs=[], cache_config=cache_config, ) assert step.to_request() == { "Name": "MyProcessingStep", "Type": "Processing", "Arguments": { "AppSpecification": {"ImageUri": "fakeimage"}, "ProcessingInputs": [ { "InputName": "input-1", "AppManaged": False, "S3Input": { "LocalPath": "processing_manifest", "S3CompressionType": "None", "S3DataDistributionType": "FullyReplicated", "S3DataType": "S3Prefix", "S3InputMode": "File", "S3Uri": processing_input_data_uri_parameter, }, } ], "ProcessingResources": { "ClusterConfig": { "InstanceCount": instance_count_parameter, "InstanceType": instance_type_parameter, "VolumeSizeInGB": 30, } }, "RoleArn": "DummyRole", }, "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"}, } assert step.properties.ProcessingJobName.expr == { "Get": "Steps.MyProcessingStep.ProcessingJobName" }
def test_processor_with_all_parameters(sagemaker_session): processor = Processor( role=ROLE, image_uri=CUSTOM_IMAGE_URI, instance_count=1, instance_type="ml.m4.xlarge", sagemaker_session=sagemaker_session, entrypoint=[ "python3", "/opt/ml/processing/input/code/processing_code.py" ], volume_size_in_gb=100, volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key", output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key", max_runtime_in_seconds=3600, base_job_name="processor_base_name", env={"my_env_variable": "my_env_variable_value"}, tags=[{ "Key": "my-tag", "Value": "my-tag-value" }], network_config=NetworkConfig( subnets=["my_subnet_id"], security_group_ids=["my_security_group_id"], enable_network_isolation=True, ), ) processor.run( inputs=[ ProcessingInput( source="s3://path/to/my/dataset/census.csv", destination="/container/path/", input_name="my_dataset", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type="FullyReplicated", s3_compression_type="None", ) ], outputs=[ ProcessingOutput( source="/container/path/", destination="s3://uri/", output_name="my_output", s3_upload_mode="EndOfJob", ) ], arguments=["--drop-columns", "'SelfEmployed'"], wait=True, logs=False, job_name="my_job_name", experiment_config={"ExperimentName": "AnExperiment"}, ) expected_args = _get_expected_args_all_parameters( processor._current_job_name) # Drop the "code" input from expected values. expected_args["inputs"] = [expected_args["inputs"][0]] sagemaker_session.process.assert_called_with(**expected_args)
def test_byo_container_with_baked_in_script(sagemaker_session): custom_processor = Processor( role=ROLE, image_uri=CUSTOM_IMAGE_URI, instance_count=1, instance_type="ml.m4.xlarge", sagemaker_session=sagemaker_session, ) custom_processor.run( inputs=[ ProcessingInput(source="/local/path/to/my/sklearn_transformer", destination="/code/") ], arguments=["CensusTract", "County"], ) expected_args = { "inputs": [{ "InputName": "input-1", "S3Input": { "S3Uri": "mocked_s3_uri_from_upload_data", "LocalPath": "/code/", "S3DataType": "S3Prefix", "S3InputMode": "File", "S3DataDistributionType": "FullyReplicated", "S3CompressionType": "None", }, }], "output_config": { "Outputs": [] }, "job_name": custom_processor._current_job_name, "resources": { "ClusterConfig": { "InstanceType": "ml.m4.xlarge", "InstanceCount": 1, "VolumeSizeInGB": 30, } }, "stopping_condition": None, "app_specification": { "ImageUri": CUSTOM_IMAGE_URI, "ContainerArguments": ["CensusTract", "County"], }, "environment": None, "network_config": None, "role_arn": ROLE, "tags": None, "experiment_config": None, } sagemaker_session.process.assert_called_with(**expected_args)
def test_processing_step_with_processor(pipeline_session, processing_input): processor = Processor( image_uri=IMAGE_URI, role=sagemaker.get_execution_role(), instance_count=1, instance_type=INSTANCE_TYPE, sagemaker_session=pipeline_session, ) with warnings.catch_warnings(record=True) as w: step_args = processor.run(inputs=processing_input) assert len(w) == 1 assert issubclass(w[-1].category, UserWarning) assert "Running within a PipelineSession" in str(w[-1].message) cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") evaluation_report = PropertyFile( name="EvaluationReport", output_name="evaluation", path="evaluation.json" ) with warnings.catch_warnings(record=True) as w: step = ProcessingStep( name="MyProcessingStep", step_args=step_args, description="ProcessingStep description", display_name="MyProcessingStep", depends_on=["TestStep", "SecondTestStep"], cache_config=cache_config, property_files=[evaluation_report], ) assert len(w) == 0 pipeline = Pipeline( name="MyPipeline", steps=[step], sagemaker_session=pipeline_session, ) assert json.loads(pipeline.definition())["Steps"][0] == { "Name": "MyProcessingStep", "Description": "ProcessingStep description", "DisplayName": "MyProcessingStep", "Type": "Processing", "DependsOn": ["TestStep", "SecondTestStep"], "Arguments": step_args, "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"}, "PropertyFiles": [ { "FilePath": "evaluation.json", "OutputName": "evaluation", "PropertyFileName": "EvaluationReport", } ], } assert step.properties.ProcessingJobName.expr == { "Get": "Steps.MyProcessingStep.ProcessingJobName" }
def create_evaluation_processor(params, sagemaker_role):
    evaluation_repository_uri = params['eval-image-uri']
    model_evaluation_processor = Processor(
        image_uri=evaluation_repository_uri,
        role=sagemaker_role,
        instance_count=1,
        instance_type='ml.p3.2xlarge',
        max_runtime_in_seconds=1200,
    )
    return model_evaluation_processor
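# Hypothetical wiring sketch (not from the original source) showing how the evaluation
# processor above could be attached to a pipeline ProcessingStep; the parameter key
# 'eval-data-uri' and the container paths are assumptions.
def create_evaluation_step(params, sagemaker_role):
    evaluation_processor = create_evaluation_processor(params, sagemaker_role)
    return ProcessingStep(
        name="ModelEvaluation",
        processor=evaluation_processor,
        inputs=[
            ProcessingInput(
                source=params["eval-data-uri"],           # assumed parameter key
                destination="/opt/ml/processing/input",   # assumed container path
            )
        ],
        outputs=[
            ProcessingOutput(
                output_name="evaluation",
                source="/opt/ml/processing/evaluation",   # assumed container path
            )
        ],
    )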
def test_add_depends_on(sagemaker_session): processing_input_data_uri_parameter = ParameterString( name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest") instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge") instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1) processor = Processor( image_uri=IMAGE_URI, role=ROLE, instance_count=instance_count_parameter, instance_type=instance_type_parameter, sagemaker_session=sagemaker_session, ) inputs = [ ProcessingInput( source=processing_input_data_uri_parameter, destination="processing_manifest", ) ] cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") step_1 = ProcessingStep( name="MyProcessingStep-1", processor=processor, inputs=inputs, outputs=[], cache_config=cache_config, ) step_2 = ProcessingStep( name="MyProcessingStep-2", depends_on=[step_1], processor=processor, inputs=inputs, outputs=[], cache_config=cache_config, ) step_3 = ProcessingStep( name="MyProcessingStep-3", depends_on=[step_1], processor=processor, inputs=inputs, outputs=[], cache_config=cache_config, ) step_3.add_depends_on([step_2.name]) assert "DependsOn" not in step_1.to_request() assert step_2.to_request()["DependsOn"] == ["MyProcessingStep-1"] assert step_3.to_request()["DependsOn"] == [ "MyProcessingStep-1", "MyProcessingStep-2" ]
def test_local_mode_disables_local_code_by_default(localsession_mock):
    Processor(
        image_uri="",
        role=ROLE,
        instance_count=1,
        instance_type="local",
    )
    # Most tests use a fixture for sagemaker_session for consistent behaviour, so this
    # unit test checks that the default initialization disables the unsupported
    # 'local_code' mode:
    localsession_mock.assert_called_with(disable_local_code=True)
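# For context, a minimal local-mode run might look like the sketch below (an assumption,
# not part of the original tests): with instance_type="local" the SDK backs the job with a
# LocalSession, which the test above expects to be created with local_code disabled. The
# image tag and S3 URI here are placeholders.
local_processor = Processor(
    image_uri="my-processing-image:latest",
    role=ROLE,
    instance_count=1,
    instance_type="local",
)
local_processor.run(
    inputs=[
        ProcessingInput(
            source="s3://my-bucket/input/",
            destination="/opt/ml/processing/input",
        )
    ]
)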
def test_processing_step(sagemaker_session):
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="ml.m4.4xlarge",
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
    )
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            "AppSpecification": {"ImageUri": "fakeimage"},
            "ProcessingInputs": [
                {
                    "InputName": "input-1",
                    "AppManaged": False,
                    "S3Input": {
                        "LocalPath": "processing_manifest",
                        "S3CompressionType": "None",
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3InputMode": "File",
                        "S3Uri": "s3://my-bucket/processing_manifest",
                    },
                }
            ],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.m4.4xlarge",
                    "VolumeSizeInGB": 30,
                }
            },
            "RoleArn": "DummyRole",
        },
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
def test_processor_with_required_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    processor.run()

    expected_args = _get_expected_args(processor._current_job_name)
    del expected_args["app_specification"]["ContainerEntrypoint"]
    expected_args["inputs"] = []
    sagemaker_session.process.assert_called_with(**expected_args)
def test_extend_processing_args(sagemaker_session):
    inputs = []
    outputs = []

    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        network_config=NetworkConfig(encrypt_inter_container_traffic=False),
    )

    extended_inputs, extended_outputs = processor._extend_processing_args([], [])

    assert extended_inputs == inputs
    assert extended_outputs == outputs
def test_processor_with_missing_network_config_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        network_config=NetworkConfig(enable_network_isolation=True),
    )

    processor.run()

    expected_args = _get_expected_args(processor._current_job_name)
    del expected_args["app_specification"]["ContainerEntrypoint"]
    expected_args["inputs"] = []
    expected_args["network_config"] = {"EnableNetworkIsolation": True}
    sagemaker_session.process.assert_called_with(**expected_args)
def create_prepro_processing(params, job_name, sagemaker_role):
    prepro_repository_uri = params['prep-image-uri']
    pre_processor = Processor(
        role=sagemaker_role,
        image_uri=prepro_repository_uri,
        instance_count=1,
        instance_type="ml.m5.xlarge",
        volume_size_in_gb=16,
        volume_kms_key=None,
        output_kms_key=None,
        max_runtime_in_seconds=86400,  # default is 24 hours (60*60*24)
        sagemaker_session=None,
        env=None,
        tags=None,
        network_config=None,
    )
    return pre_processor
def test_processor_with_all_parameters(sagemaker_session): processor = Processor( role=ROLE, image_uri=CUSTOM_IMAGE_URI, instance_count=1, instance_type="ml.m4.xlarge", sagemaker_session=sagemaker_session, entrypoint=[ "python3", "/opt/ml/processing/input/code/processing_code.py" ], volume_size_in_gb=100, volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key", output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key", max_runtime_in_seconds=3600, base_job_name="processor_base_name", env={"my_env_variable": "my_env_variable_value"}, tags=[{ "Key": "my-tag", "Value": "my-tag-value" }], network_config=NetworkConfig( subnets=["my_subnet_id"], security_group_ids=["my_security_group_id"], enable_network_isolation=True, encrypt_inter_container_traffic=True, ), ) processor.run( inputs=_get_data_inputs_all_parameters(), outputs=_get_data_outputs_all_parameters(), arguments=["--drop-columns", "'SelfEmployed'"], wait=True, logs=False, job_name="my_job_name", experiment_config={"ExperimentName": "AnExperiment"}, ) expected_args = _get_expected_args_all_parameters( processor._current_job_name) # Drop the "code" input from expected values. expected_args["inputs"] = expected_args["inputs"][:-1] sagemaker_session.process.assert_called_with(**expected_args)
    project_name=project_name,
    env=env,
    region_name=region,
    current_time=current_time,
)
proc_config = metadata.getter(prcossing_task)
sm_config = proc_config.get('sm_config')

# create sagemaker session
sess = sm.Session(default_bucket=sm_config.getter('sm_bucket'))

processor = Processor(
    role=sm_config.getter('sm_role'),
    image_uri=image_uri,
    instance_count=sm_config.getter('sm_instance_count'),
    instance_type=sm_config.getter('sm_instance_type'),
    entrypoint=proc_config.get('endpoint'),
    volume_size_in_gb=sm_config.getter('sm_volumesize'),
    sagemaker_session=sess,
    tags=sm_config.getter('project_tag'),
)

processor.run(
    inputs=proc_config.get('inputs'),
    outputs=proc_config.get('outputs'),
    arguments=proc_config.get('arguments'),
    wait=False,
    logs=False,
    job_name=sm_config.getter('processing_job_name'),
)
def test_processor_with_custom_bucket( sagemaker_session_with_custom_bucket, custom_bucket_name, image_uri, cpu_instance_type, output_kms_key, ): script_path = os.path.join(DATA_DIR, "dummy_script.py") processor = Processor( role=ROLE, image_uri=image_uri, instance_count=1, instance_type=cpu_instance_type, entrypoint=[ "python3", "/opt/ml/processing/input/code/dummy_script.py" ], volume_size_in_gb=100, volume_kms_key=None, output_kms_key=output_kms_key, max_runtime_in_seconds=3600, base_job_name="test-processor", env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"}, tags=[{ "Key": "dummy-tag", "Value": "dummy-tag-value" }], sagemaker_session=sagemaker_session_with_custom_bucket, ) processor.run( inputs=[ ProcessingInput(source=script_path, destination="/opt/ml/processing/input/code/", input_name="code") ], outputs=[ ProcessingOutput( source="/opt/ml/processing/output/container/path/", output_name="dummy_output", s3_upload_mode="EndOfJob", ) ], arguments=["-v"], wait=True, logs=True, ) job_description = processor.latest_job.describe() assert job_description["ProcessingInputs"][0]["InputName"] == "code" assert custom_bucket_name in job_description["ProcessingInputs"][0][ "S3Input"]["S3Uri"] assert job_description["ProcessingJobName"].startswith("test-processor") assert job_description["ProcessingJobStatus"] == "Completed" assert job_description["ProcessingOutputConfig"][ "KmsKeyId"] == output_kms_key assert job_description["ProcessingOutputConfig"]["Outputs"][0][ "OutputName"] == "dummy_output" assert job_description["ProcessingResources"]["ClusterConfig"][ "InstanceCount"] == 1 assert (job_description["ProcessingResources"]["ClusterConfig"] ["InstanceType"] == cpu_instance_type) assert job_description["ProcessingResources"]["ClusterConfig"][ "VolumeSizeInGB"] == 100 assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"] assert job_description["AppSpecification"]["ContainerEntrypoint"] == [ "python3", "/opt/ml/processing/input/code/dummy_script.py", ] assert job_description["AppSpecification"]["ImageUri"] == image_uri assert job_description["Environment"] == { "DUMMY_ENVIRONMENT_VARIABLE": "dummy-value" } assert ROLE in job_description["RoleArn"] assert job_description["StoppingCondition"] == { "MaxRuntimeInSeconds": 3600 }
def get_pipeline( region, sagemaker_project_arn=None, role=None, default_bucket=None, model_package_group_name="restatePackageGroup", # Choose any name pipeline_name="restate-p-XXXXXXXXX", # You can find your pipeline name in the Studio UI (project -> Pipelines -> name) base_job_prefix="restate", # Choose any name ): """Gets a SageMaker ML Pipeline instance working with on RE data. Args: region: AWS region to create and run the pipeline. role: IAM role to create and run steps and pipeline. default_bucket: the bucket to use for storing the artifacts Returns: an instance of a pipeline """ sagemaker_session = get_session(region, default_bucket) if role is None: role = sagemaker.session.get_execution_role(sagemaker_session) # Parameters for pipeline execution processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString( name="ProcessingInstanceType", default_value="ml.m5.2xlarge" ) training_instance_type = ParameterString( name="TrainingInstanceType", default_value="ml.m5.xlarge" ) model_approval_status = ParameterString( name="ModelApprovalStatus", default_value="PendingManualApproval", # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval. ) input_data = ParameterString( name="InputDataUrl", default_value=f"", # Change this to point to the s3 location of your raw input data. ) data_sources = [] # Sagemaker session sess = sagemaker_session # You can configure this with your own bucket name, e.g. # bucket = "my-bucket" bucket = sess.default_bucket() data_sources.append( ProcessingInput( input_name="restate-california", dataset_definition=DatasetDefinition( local_path="/opt/ml/processing/restate-california", data_distribution_type="FullyReplicated", # You can override below to point to other database or use different queries athena_dataset_definition=AthenaDatasetDefinition( catalog="AwsDataCatalog", database="restate", query_string="SELECT * FROM restate.california_10", output_s3_uri=f"s3://{bucket}/athena/", output_format="PARQUET", ), ), ) ) print(f"Data Wrangler export storage bucket: {bucket}") # unique flow export ID flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}" flow_export_name = f"flow-{flow_export_id}" # Output name is auto-generated from the select node's ID + output name from the flow file. 
output_name = "99ae1ec3-dd5f-453c-bfae-721dac423cd7.default" s3_output_prefix = f"export-{flow_export_name}/output" s3_output_path = f"s3://{bucket}/{s3_output_prefix}" print(f"Flow S3 export result path: {s3_output_path}") processing_job_output = ProcessingOutput( output_name=output_name, source="/opt/ml/processing/output", destination=s3_output_path, s3_upload_mode="EndOfJob", ) # name of the flow file which should exist in the current notebook working directory flow_file_name = "sagemaker-pipeline/restate-athena-california.flow" # Load .flow file from current notebook working directory #!echo "Loading flow file from current notebook working directory: $PWD" with open(flow_file_name) as f: flow = json.load(f) # Upload flow to S3 s3_client = boto3.client("s3") s3_client.upload_file( flow_file_name, bucket, f"data_wrangler_flows/{flow_export_name}.flow", ExtraArgs={"ServerSideEncryption": "aws:kms"}, ) flow_s3_uri = f"s3://{bucket}/data_wrangler_flows/{flow_export_name}.flow" print(f"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}") ## Input - Flow: restate-athena-russia.flow flow_input = ProcessingInput( source=flow_s3_uri, destination="/opt/ml/processing/flow", input_name="flow", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type="FullyReplicated", ) # IAM role for executing the processing job. iam_role = role # Unique processing job name. Give a unique name every time you re-execute processing jobs processing_job_name = f"data-wrangler-flow-processing-{flow_export_id}" # Data Wrangler Container URL. container_uri = sagemaker.image_uris.retrieve( framework="data-wrangler", # we are using the Sagemaker built in xgboost algorithm region=region, ) # Processing Job Instance count and instance type. instance_count = 2 instance_type = "ml.m5.4xlarge" # Size in GB of the EBS volume to use for storing data during processing volume_size_in_gb = 30 # Content type for each output. Data Wrangler supports CSV as default and Parquet. 
output_content_type = "CSV" # Network Isolation mode; default is off enable_network_isolation = False # List of tags to be passed to the processing job user_tags = [] # Output configuration used as processing job container arguments output_config = {output_name: {"content_type": output_content_type}} # KMS key for per object encryption; default is None kms_key = None processor = Processor( role=iam_role, image_uri=container_uri, instance_count=instance_count, instance_type=instance_type, volume_size_in_gb=volume_size_in_gb, network_config=NetworkConfig(enable_network_isolation=enable_network_isolation), sagemaker_session=sess, output_kms_key=kms_key, tags=user_tags, ) data_wrangler_step = ProcessingStep( name="DataWranglerProcess", processor=processor, inputs=[flow_input] + data_sources, outputs=[processing_job_output], job_arguments=[f"--output-config '{json.dumps(output_config)}'"], ) # Processing step for feature engineering # this processor does not have awswrangler installed sklearn_processor = SKLearnProcessor( framework_version="0.23-1", instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name=f"{base_job_prefix}/sklearn-restate-preprocess", # choose any name sagemaker_session=sagemaker_session, role=role, ) step_process = ProcessingStep( name="Preprocess", # choose any name processor=sklearn_processor, inputs=[ ProcessingInput( source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ output_name ].S3Output.S3Uri, destination="/opt/ml/processing/data/raw-data-dir", ) ], outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), ], code=os.path.join(BASE_DIR, "preprocess.py"), job_arguments=[ "--input-data", data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ output_name ].S3Output.S3Uri, ], ) # Training step for generating model artifacts model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain" model_bucket_key = f"{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain" cache_config = CacheConfig(enable_caching=True, expire_after="30d") xgb_image_uri = sagemaker.image_uris.retrieve( framework="xgboost", # we are using the Sagemaker built in xgboost algorithm region=region, version="1.0-1", py_version="py3", instance_type=training_instance_type, ) xgb_train = Estimator( image_uri=xgb_image_uri, instance_type=training_instance_type, instance_count=1, output_path=model_path, base_job_name=f"{base_job_prefix}/restate-xgb-train", sagemaker_session=sagemaker_session, role=role, ) xgb_train.set_hyperparameters( # #objective="binary:logistic", # objective="reg:linear", num_round=50, # max_depth=5, # eta=0.2, # gamma=4, # min_child_weight=6, # subsample=0.7, # silent=0, ) xgb_train.set_hyperparameters(grow_policy="lossguide") xgb_objective_metric_name = "validation:mse" xgb_hyperparameter_ranges = { "max_depth": IntegerParameter(2, 10, scaling_type="Linear"), } xgb_tuner_log = HyperparameterTuner( xgb_train, xgb_objective_metric_name, xgb_hyperparameter_ranges, max_jobs=3, max_parallel_jobs=3, strategy="Random", objective_type="Minimize", ) xgb_step_tuning = TuningStep( name="XGBHPTune", tuner=xgb_tuner_log, inputs={ "train": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( 
s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, content_type="text/csv", ), }, cache_config=cache_config, ) # dtree_image_uri = '625467769535.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-decision-tree:latest' dtree_image_uri = sagemaker_session.sagemaker_client.describe_image_version( ImageName="restate-dtree" )["ContainerImage"] dtree_train = Estimator( image_uri=dtree_image_uri, role=role, instance_count=1, instance_type=training_instance_type, base_job_name=f"{base_job_prefix}/restate-dtree-train", output_path=model_path, sagemaker_session=sagemaker_session, ) dtree_objective_metric_name = "validation:mse" dtree_metric_definitions = [{"Name": "validation:mse", "Regex": "mse:(\S+)"}] dtree_hyperparameter_ranges = { "max_depth": IntegerParameter(10, 50, scaling_type="Linear"), "max_leaf_nodes": IntegerParameter(2, 12, scaling_type="Linear"), } dtree_tuner_log = HyperparameterTuner( dtree_train, dtree_objective_metric_name, dtree_hyperparameter_ranges, dtree_metric_definitions, max_jobs=3, max_parallel_jobs=3, strategy="Random", objective_type="Minimize", ) dtree_step_tuning = TuningStep( name="DTreeHPTune", tuner=dtree_tuner_log, inputs={ "training": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, content_type="text/csv", ), }, cache_config=cache_config, ) dtree_script_eval = ScriptProcessor( image_uri=dtree_image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-dtree-eval", sagemaker_session=sagemaker_session, role=role, ) dtree_evaluation_report = PropertyFile( name="EvaluationReportDTree", output_name="dtree_evaluation", path="dtree_evaluation.json", ) dtree_step_eval = ProcessingStep( name="DTreeEval", processor=dtree_script_eval, inputs=[ ProcessingInput( # source=dtree_step_train.properties.ModelArtifacts.S3ModelArtifacts, source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput( output_name="dtree_evaluation", source="/opt/ml/processing/evaluation" ), ], code=os.path.join(BASE_DIR, "dtree_evaluate.py"), property_files=[dtree_evaluation_report], ) xgb_script_eval = ScriptProcessor( image_uri=xgb_image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-xgb-eval", sagemaker_session=sagemaker_session, role=role, ) xgb_evaluation_report = PropertyFile( name="EvaluationReportXGBoost", output_name="xgb_evaluation", path="xgb_evaluation.json", ) xgb_step_eval = ProcessingStep( name="XGBEval", processor=xgb_script_eval, inputs=[ ProcessingInput( source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput(output_name="xgb_evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(BASE_DIR, "xgb_evaluate.py"), property_files=[xgb_evaluation_report], ) xgb_model_metrics = ModelMetrics( 
model_statistics=MetricsSource( s3_uri="{}/xgb_evaluation.json".format( xgb_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] ), content_type="application/json", ) ) dtree_model_metrics = ModelMetrics( model_statistics=MetricsSource( s3_uri="{}/dtree_evaluation.json".format( dtree_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][ "S3Uri" ] ), content_type="application/json", ) ) xgb_eval_metrics = JsonGet( step=xgb_step_eval, property_file=xgb_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ) dtree_eval_metrics = JsonGet( step=dtree_step_eval, property_file=dtree_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ) # Register model step that will be conditionally executed dtree_step_register = RegisterModel( name="DTreeReg", estimator=dtree_train, model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=dtree_model_metrics, ) # Register model step that will be conditionally executed xgb_step_register = RegisterModel( name="XGBReg", estimator=xgb_train, model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=xgb_model_metrics, ) # Condition step for evaluating model quality and branching execution cond_lte = ConditionGreaterThanOrEqualTo( # You can change the condition here left=JsonGet( step=dtree_step_eval, property_file=dtree_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ), right=JsonGet( step=xgb_step_eval, property_file=xgb_evaluation_report, json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ), # You can change the threshold here ) step_cond = ConditionStep( name="AccuracyCond", conditions=[cond_lte], if_steps=[dtree_step_register], else_steps=[xgb_step_register], ) create_date = time.strftime("%Y-%m-%d-%H-%M-%S") # Pipeline instance pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, input_data ], pipeline_experiment_config=PipelineExperimentConfig( pipeline_name + "-" + create_date, "restate-{}".format(create_date) ), steps=[ data_wrangler_step, step_process, dtree_step_tuning, xgb_step_tuning, dtree_step_eval, xgb_step_eval, step_cond, ], sagemaker_session=sagemaker_session, ) return pipeline
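# A hypothetical driver for the get_pipeline factory above (not part of the original
# source); the region, role ARN, and pipeline name are placeholders.
if __name__ == "__main__":
    role_arn = "arn:aws:iam::111122223333:role/SageMakerExecutionRole"  # assumed role ARN
    pipeline = get_pipeline(
        region="us-east-1",
        role=role_arn,
        model_package_group_name="restatePackageGroup",
        pipeline_name="restate-pipeline",
    )
    # Create or update the pipeline definition, then run it once and wait for completion.
    pipeline.upsert(role_arn=role_arn)
    execution = pipeline.start()
    execution.wait()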
def _generate_baseline_processor( self, baseline_dataset_input, baseline_output, post_processor_script_input=None, record_preprocessor_script_input=None, ): """Generates a baseline processor Args: baseline_dataset_input (ProcessingInput): A ProcessingInput instance for baseline dataset input. baseline_output (ProcessingOutput): A ProcessingOutput instance for baseline dataset output. post_processor_script_input (ProcessingInput): A ProcessingInput instance for post processor script input. record_preprocessor_script_input (ProcessingInput): A ProcessingInput instance for record preprocessor script input. Returns: sagemaker.processing.Processor: The baseline processor """ quality_check_cfg = self.quality_check_config # Unlike other input, dataset must be a directory for the Monitoring image. baseline_dataset_container_path = baseline_dataset_input.destination post_processor_script_container_path = None if post_processor_script_input is not None: post_processor_script_container_path = str( pathlib.PurePosixPath( post_processor_script_input.destination, os.path.basename(quality_check_cfg.post_analytics_processor_script), ) ) record_preprocessor_script_container_path = None if isinstance(quality_check_cfg, DataQualityCheckConfig): if record_preprocessor_script_input is not None: record_preprocessor_script_container_path = str( pathlib.PurePosixPath( record_preprocessor_script_input.destination, os.path.basename(quality_check_cfg.record_preprocessor_script), ) ) normalized_env = ModelMonitor._generate_env_map( env=self._model_monitor.env, dataset_format=quality_check_cfg.dataset_format, output_path=baseline_output.source, enable_cloudwatch_metrics=False, # Only supported for monitoring schedules dataset_source_container_path=baseline_dataset_container_path, record_preprocessor_script_container_path=record_preprocessor_script_container_path, post_processor_script_container_path=post_processor_script_container_path, ) else: inference_attribute = ( str(quality_check_cfg.inference_attribute) if quality_check_cfg.inference_attribute is not None else None ) probability_attribute = ( str(quality_check_cfg.probability_attribute) if quality_check_cfg.probability_attribute is not None else None ) ground_truth_attribute = ( str(quality_check_cfg.ground_truth_attribute) if quality_check_cfg.ground_truth_attribute is not None else None ) probability_threshold_attr = ( str(quality_check_cfg.probability_threshold_attribute) if quality_check_cfg.probability_threshold_attribute is not None else None ) normalized_env = ModelMonitor._generate_env_map( env=self._model_monitor.env, dataset_format=quality_check_cfg.dataset_format, output_path=baseline_output.source, enable_cloudwatch_metrics=False, # Only supported for monitoring schedules dataset_source_container_path=baseline_dataset_container_path, post_processor_script_container_path=post_processor_script_container_path, analysis_type=_MODEL_QUALITY_TYPE, problem_type=quality_check_cfg.problem_type, inference_attribute=inference_attribute, probability_attribute=probability_attribute, ground_truth_attribute=ground_truth_attribute, probability_threshold_attribute=probability_threshold_attr, ) return Processor( role=self._model_monitor.role, image_uri=self._model_monitor.image_uri, instance_count=self._model_monitor.instance_count, instance_type=self._model_monitor.instance_type, entrypoint=self._model_monitor.entrypoint, volume_size_in_gb=self._model_monitor.volume_size_in_gb, volume_kms_key=self._model_monitor.volume_kms_key, 
output_kms_key=self._model_monitor.output_kms_key, max_runtime_in_seconds=self._model_monitor.max_runtime_in_seconds, base_job_name=self._model_monitor.base_job_name, sagemaker_session=self._model_monitor.sagemaker_session, env=normalized_env, tags=self._model_monitor.tags, network_config=self._model_monitor.network_config, )
def create_pipeline( pipeline_name="s3-fs-ingest-pipeline", pipeline_description="automated ingestion from s3 to feature store", project_id="", project_name="", data_wrangler_flow_s3_url="", flow_output_name="", input_data_s3_url="", feature_group_name="", execution_role=""): logger.info( f"Creating sagemaker S3 to feature store load pipeline: {pipeline_name}" ) logger.info(f"execution role passed: {execution_role}") if execution_role is None or execution_role == "": execution_role = get_execution_role() logger.info(f"execution_role set to {execution_role}") output_content_type = "CSV" sagemaker_session = sagemaker.Session() # setup pipeline parameters p_processing_instance_count = ParameterInteger( name="ProcessingInstanceCount", default_value=1) p_processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.4xlarge") p_processing_volume_size = ParameterInteger(name="ProcessingVolumeSize", default_value=50) p_flow_output_name = ParameterString(name='FlowOutputName', default_value=flow_output_name) p_input_flow = ParameterString(name='InputFlowUrl', default_value=data_wrangler_flow_s3_url) p_input_data = ParameterString(name="InputDataUrl", default_value=input_data_s3_url) p_feature_group_name = ParameterString(name="FeatureGroupName", default_value=feature_group_name) # DW flow processing job inputs and output flow_input = ProcessingInput( source=p_input_flow, destination="/opt/ml/processing/flow", input_name="flow", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type="FullyReplicated", ) data_input = ProcessingInput(source=p_input_data, destination="/opt/ml/processing/data", input_name="data", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type="FullyReplicated") processing_job_output = ProcessingOutput( output_name=p_flow_output_name, app_managed=True, feature_store_output=FeatureStoreOutput( feature_group_name=p_feature_group_name), ) # Output configuration used as processing job container arguments output_config = {flow_output_name: {"content_type": output_content_type}} # get data wrangler container uri container_uri = image_uris.retrieve( framework='data-wrangler', region=sagemaker_session.boto_region_name) logger.info(f"creating DW processor with container uri: {container_uri}") # create DW processor processor = Processor( role=execution_role, image_uri=container_uri, instance_count=p_processing_instance_count, instance_type=p_processing_instance_type, volume_size_in_gb=p_processing_volume_size, sagemaker_session=sagemaker_session, ) step_process = ProcessingStep( name="datawrangler-processing-to-feature-store", processor=processor, inputs=[flow_input] + [data_input], outputs=[processing_job_output], job_arguments=[f"--output-config '{json.dumps(output_config)}'"], ) pipeline = Pipeline(name=pipeline_name, parameters=[ p_processing_instance_type, p_processing_instance_count, p_processing_volume_size, p_flow_output_name, p_input_flow, p_input_data, p_feature_group_name ], steps=[step_process], sagemaker_session=sagemaker_session) response = pipeline.upsert( role_arn=execution_role, description=pipeline_description, tags=[{ 'Key': 'sagemaker:project-name', 'Value': project_name }, { 'Key': 'sagemaker:project-id', 'Value': project_id }], ) logger.info(f"pipeline upsert response: {response}") return pipeline
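# A hypothetical caller for create_pipeline above (not in the original source); every
# argument value below is a placeholder. The factory already upserts the pipeline, so the
# caller only needs to start an ingestion run, optionally overriding the InputDataUrl
# parameter per execution.
pipeline = create_pipeline(
    pipeline_name="s3-fs-ingest-pipeline",
    project_id="p-abcde12345",
    project_name="my-project",
    data_wrangler_flow_s3_url="s3://my-bucket/flows/ingest.flow",
    flow_output_name="output-node-id.default",
    input_data_s3_url="s3://my-bucket/landing/",
    feature_group_name="my-feature-group",
    execution_role="arn:aws:iam::111122223333:role/SageMakerExecutionRole",
)
execution = pipeline.start(parameters={"InputDataUrl": "s3://my-bucket/landing/2024-01-01/"})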
def run_model_monitor_job_processor(region, instance_type, role, data_capture_path, statistics_path, constraints_path, reports_path, instance_count=1, preprocessor_path=None, postprocessor_path=None, publish_cloudwatch_metrics='Disabled'): data_capture_sub_path = data_capture_path[data_capture_path.rfind('datacapture/') :] data_capture_sub_path = data_capture_sub_path[data_capture_sub_path.find('/') + 1 :] processing_output_paths = reports_path + '/' + data_capture_sub_path input_1 = ProcessingInput(input_name='input_1', source=data_capture_path, destination='/opt/ml/processing/input/endpoint/' + data_capture_sub_path, s3_data_type='S3Prefix', s3_input_mode='File') baseline = ProcessingInput(input_name='baseline', source=statistics_path, destination='/opt/ml/processing/baseline/stats', s3_data_type='S3Prefix', s3_input_mode='File') constraints = ProcessingInput(input_name='constraints', source=constraints_path, destination='/opt/ml/processing/baseline/constraints', s3_data_type='S3Prefix', s3_input_mode='File') outputs = ProcessingOutput(output_name='result', source='/opt/ml/processing/output', destination=processing_output_paths, s3_upload_mode='Continuous') env = {'baseline_constraints': '/opt/ml/processing/baseline/constraints/' + get_file_name(constraints_path), 'baseline_statistics': '/opt/ml/processing/baseline/stats/' + get_file_name(statistics_path), 'dataset_format': '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}', 'dataset_source': '/opt/ml/processing/input/endpoint', 'output_path': '/opt/ml/processing/output', 'publish_cloudwatch_metrics': publish_cloudwatch_metrics } inputs=[input_1, baseline, constraints] if postprocessor_path: env['post_analytics_processor_script'] = '/opt/ml/processing/code/postprocessing/' + get_file_name(postprocessor_path) post_processor_script = ProcessingInput(input_name='post_processor_script', source=postprocessor_path, destination='/opt/ml/processing/code/postprocessing', s3_data_type='S3Prefix', s3_input_mode='File') inputs.append(post_processor_script) if preprocessor_path: env['record_preprocessor_script'] = '/opt/ml/processing/code/preprocessing/' + get_file_name(preprocessor_path) pre_processor_script = ProcessingInput(input_name='pre_processor_script', source=preprocessor_path, destination='/opt/ml/processing/code/preprocessing', s3_data_type='S3Prefix', s3_input_mode='File') inputs.append(pre_processor_script) processor = Processor(image_uri = get_model_monitor_container_uri(region), instance_count = instance_count, instance_type = instance_type, role=role, env = env) return processor.run(inputs=inputs, outputs=[outputs])
def create_baseline_step(input_data, execution_input, region, role):
    # Define the environment
    dataset_format = DatasetFormat.csv()
    env = {
        "dataset_format": json.dumps(dataset_format),
        "dataset_source": "/opt/ml/processing/input/baseline_dataset_input",
        "output_path": "/opt/ml/processing/output",
        "publish_cloudwatch_metrics": "Disabled",  # Have to be disabled from processing job?
    }

    # Define the inputs and outputs
    inputs = [
        ProcessingInput(
            source=input_data["BaselineUri"],
            destination="/opt/ml/processing/input/baseline_dataset_input",
            input_name="baseline_dataset_input",
        ),
    ]
    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=execution_input["BaselineOutputUri"],
            output_name="monitoring_output",
        ),
    ]

    # Get the default model monitor container
    model_monitor_container_uri = retrieve(region=region, framework="model-monitor", version="latest")

    # Create the processor
    monitor_analyzer = Processor(
        image_uri=model_monitor_container_uri,
        role=role,
        instance_count=1,
        instance_type="ml.m5.xlarge",
        max_runtime_in_seconds=1800,
        env=env,
    )

    # Create the processing step
    baseline_step = steps.sagemaker.ProcessingStep(
        "Baseline Job",
        processor=monitor_analyzer,
        job_name=execution_input["BaselineJobName"],
        inputs=inputs,
        outputs=outputs,
        experiment_config={
            "ExperimentName": execution_input["ExperimentName"],
            "TrialName": execution_input["TrialName"],
            "TrialComponentDisplayName": "Baseline",
        },
        tags={
            "GitBranch": execution_input["GitBranch"],
            "GitCommitHash": execution_input["GitCommitHash"],
            "DataVersionId": execution_input["DataVersionId"],
        },
    )

    # Add the catch
    baseline_step.add_catch(
        steps.states.Catch(
            error_equals=["States.TaskFailed"],
            next_step=stepfunctions.steps.states.Fail(
                "Baseline failed", cause="SageMakerBaselineJobFailed"
            ),
        )
    )

    return baseline_step
def run_model_monitor_job_processor( region, instance_type, role, data_capture_path, statistics_path, constraints_path, reports_path, instance_count=1, preprocessor_path=None, postprocessor_path=None, publish_cloudwatch_metrics="Disabled", ): data_capture_sub_path = data_capture_path[data_capture_path. rfind("datacapture/"):] data_capture_sub_path = data_capture_sub_path[data_capture_sub_path. find("/") + 1:] processing_output_paths = reports_path + "/" + data_capture_sub_path input_1 = ProcessingInput( input_name="input_1", source=data_capture_path, destination="/opt/ml/processing/input/endpoint/" + data_capture_sub_path, s3_data_type="S3Prefix", s3_input_mode="File", ) baseline = ProcessingInput( input_name="baseline", source=statistics_path, destination="/opt/ml/processing/baseline/stats", s3_data_type="S3Prefix", s3_input_mode="File", ) constraints = ProcessingInput( input_name="constraints", source=constraints_path, destination="/opt/ml/processing/baseline/constraints", s3_data_type="S3Prefix", s3_input_mode="File", ) outputs = ProcessingOutput( output_name="result", source="/opt/ml/processing/output", destination=processing_output_paths, s3_upload_mode="Continuous", ) env = { "baseline_constraints": "/opt/ml/processing/baseline/constraints/" + get_file_name(constraints_path), "baseline_statistics": "/opt/ml/processing/baseline/stats/" + get_file_name(statistics_path), "dataset_format": '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}', "dataset_source": "/opt/ml/processing/input/endpoint", "output_path": "/opt/ml/processing/output", "publish_cloudwatch_metrics": publish_cloudwatch_metrics, } inputs = [input_1, baseline, constraints] if postprocessor_path: env["post_analytics_processor_script"] = "/opt/ml/processing/code/postprocessing/" + get_file_name( postprocessor_path) post_processor_script = ProcessingInput( input_name="post_processor_script", source=postprocessor_path, destination="/opt/ml/processing/code/postprocessing", s3_data_type="S3Prefix", s3_input_mode="File", ) inputs.append(post_processor_script) if preprocessor_path: env["record_preprocessor_script"] = "/opt/ml/processing/code/preprocessing/" + get_file_name( preprocessor_path) pre_processor_script = ProcessingInput( input_name="pre_processor_script", source=preprocessor_path, destination="/opt/ml/processing/code/preprocessing", s3_data_type="S3Prefix", s3_input_mode="File", ) inputs.append(pre_processor_script) processor = Processor( image_uri=get_model_monitor_container_uri(region), instance_count=instance_count, instance_type=instance_type, role=role, env=env, ) return processor.run(inputs=inputs, outputs=[outputs])
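# get_file_name and get_model_monitor_container_uri are referenced above but not shown;
# plausible minimal implementations (assumptions, not the original helpers) could look like:
from sagemaker import image_uris

def get_file_name(path):
    """Return the trailing file name of an S3 URI or local path."""
    return path.rstrip("/").split("/")[-1]

def get_model_monitor_container_uri(region):
    """Resolve the prebuilt Model Monitor analyzer image URI for the given region."""
    return image_uris.retrieve(framework="model-monitor", region=region)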