def test_transform_step(sagemaker_session):
    # MODEL_NAME and BUCKET are module-level test constants ("gisele" / "my-bucket"),
    # which is why they appear literally in the expected request below.
    transformer = Transformer(
        model_name=MODEL_NAME,
        instance_count=1,
        instance_type="c4.4xlarge",
        sagemaker_session=sagemaker_session,
    )
    inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = TransformStep(
        name="MyTransformStep",
        depends_on=["TestStep"],
        transformer=transformer,
        display_name="TransformStep",
        description="TestDescription",
        inputs=inputs,
        cache_config=cache_config,
    )
    # Dependencies added via add_depends_on are merged with those passed to the constructor.
    step.add_depends_on(["SecondTestStep"])
    assert step.to_request() == {
        "Name": "MyTransformStep",
        "Type": "Transform",
        "Description": "TestDescription",
        "DisplayName": "TransformStep",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": {
            "ModelName": "gisele",
            "TransformInput": {
                "DataSource": {
                    "S3DataSource": {
                        "S3DataType": "S3Prefix",
                        "S3Uri": "s3://my-bucket/transform_manifest",
                    }
                }
            },
            "TransformOutput": {"S3OutputPath": None},
            "TransformResources": {
                "InstanceCount": 1,
                "InstanceType": "c4.4xlarge",
            },
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert step.properties.TransformJobName.expr == {
        "Get": "Steps.MyTransformStep.TransformJobName"
    }
def test_transform_step(sagemaker_session):
    transformer = Transformer(
        model_name=MODEL_NAME,
        instance_count=1,
        instance_type="c4.4xlarge",
        sagemaker_session=sagemaker_session,
    )
    inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
    step = TransformStep(
        name="MyTransformStep",
        transformer=transformer,
        inputs=inputs,
    )
    assert step.to_request() == {
        "Name": "MyTransformStep",
        "Type": "Transform",
        "Arguments": {
            "ModelName": "gisele",
            "TransformInput": {
                "DataSource": {
                    "S3DataSource": {
                        "S3DataType": "S3Prefix",
                        "S3Uri": "s3://my-bucket/transform_manifest",
                    }
                }
            },
            "TransformOutput": {"S3OutputPath": None},
            "TransformResources": {
                "InstanceCount": 1,
                "InstanceType": "c4.4xlarge",
            },
        },
    }
    assert step.properties.TransformJobName.expr == {
        "Get": "Steps.MyTransformStep.TransformJobName"
    }
def test_transform_step_with_transformer(pipeline_session):
    model_name = ParameterString("ModelName")
    transformer = Transformer(
        model_name=model_name,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{pipeline_session.default_bucket()}/Transform",
        sagemaker_session=pipeline_session,
    )
    transform_inputs = TransformInput(
        data=f"s3://{pipeline_session.default_bucket()}/batch-data",
    )

    # Calling transform() under a PipelineSession does not start a transform job;
    # it captures the request arguments and emits a UserWarning.
    with warnings.catch_warnings(record=True) as w:
        step_args = transformer.transform(
            data=transform_inputs.data,
            data_type=transform_inputs.data_type,
            content_type=transform_inputs.content_type,
            compression_type=transform_inputs.compression_type,
            split_type=transform_inputs.split_type,
            input_filter=transform_inputs.input_filter,
            output_filter=transform_inputs.output_filter,
            join_source=transform_inputs.join_source,
            model_client_config=transform_inputs.model_client_config,
        )
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Running within a PipelineSession" in str(w[-1].message)

    with warnings.catch_warnings(record=True) as w:
        step = TransformStep(
            name="MyTransformStep",
            step_args=step_args,
        )
        assert len(w) == 0

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        parameters=[model_name],
        sagemaker_session=pipeline_session,
    )
    step_args["ModelName"] = model_name.expr
    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyTransformStep",
        "Type": "Transform",
        "Arguments": step_args,
    }
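# --- Hedged sketch: module-level setup the three unit tests above assume. ---
# Constant values are inferred from the asserted request bodies ("gisele", "my-bucket");
# import paths follow recent SageMaker Python SDK layouts; the fixture bodies are
# illustrative stand-ins, not the project's actual conftest.
import json
import warnings
from unittest.mock import MagicMock

import pytest

from sagemaker.inputs import TransformInput
from sagemaker.transformer import Transformer
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import CacheConfig, TransformStep

MODEL_NAME = "gisele"
BUCKET = "my-bucket"


@pytest.fixture
def sagemaker_session():
    # Illustrative mock session; the real fixture wires up mocked boto clients.
    session = MagicMock()
    session.default_bucket.return_value = BUCKET
    return session


@pytest.fixture
def pipeline_session():
    # Illustrative: a PipelineSession captures job arguments instead of launching jobs.
    return PipelineSession(boto_session=MagicMock(region_name="us-west-2"))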
def __init__(
    self,
    name: str,
    estimator: EstimatorBase,
    model_data,
    model_inputs,
    instance_count,
    instance_type,
    transform_inputs,
    # model arguments
    image_uri=None,
    predictor_cls=None,
    env=None,
    # transformer arguments
    strategy=None,
    assemble_with=None,
    output_path=None,
    output_kms_key=None,
    accept=None,
    max_concurrent_transforms=None,
    max_payload=None,
    tags=None,
    volume_kms_key=None,
    depends_on: List[str] = None,
    **kwargs,
):
    """Construct the steps required for a Transformer step collection.

    This is an estimator-centric step collection. It models what happens in
    workflows when invoking the `transform()` method on an estimator instance:
    first, if custom model artifacts are required, a `_RepackModelStep` is
    included; second, a `CreateModelStep` is added, with the model data passed
    in from a training step or other training job output; finally, a
    `TransformStep`. If repacking the model artifacts is not necessary, only
    the `CreateModelStep` and `TransformStep` are in the step collection.

    Args:
        name (str): The name of the Transform Step.
        estimator (EstimatorBase): The estimator instance.
        model_data: The S3 location of the model artifacts, typically the
            output of a training step.
        model_inputs: Inputs for the `CreateModelStep`.
        instance_count (int): The number of EC2 instances to use.
        instance_type (str): The type of EC2 instance to use.
        transform_inputs (TransformInput): Inputs for the `TransformStep`.
        strategy (str): The strategy used to decide how to batch records in a
            single request (default: None). Valid values: 'MultiRecord' and
            'SingleRecord'.
        assemble_with (str): How the output is assembled (default: None).
            Valid values: 'Line' or 'None'.
        output_path (str): The S3 location for saving the transform result.
            If not specified, results are stored to a default bucket.
        output_kms_key (str): Optional. A KMS key ID for encrypting the
            transform output (default: None).
        accept (str): The accept header passed by the client to the inference
            endpoint. If it is supported by the endpoint, it will be the
            format of the batch transform output.
        env (dict): The environment variables to be set for use during the
            transform job (default: None).
        depends_on (List[str]): The list of step names the first step in the
            collection depends on (default: None).
    """
    steps = []
    if "entry_point" in kwargs:
        # Custom inference code requires repacking the model artifacts first.
        entry_point = kwargs["entry_point"]
        source_dir = kwargs.get("source_dir")
        dependencies = kwargs.get("dependencies")
        repack_model_step = _RepackModelStep(
            name=f"{name}RepackModel",
            depends_on=depends_on,
            estimator=estimator,
            model_data=model_data,
            entry_point=entry_point,
            source_dir=source_dir,
            dependencies=dependencies,
        )
        steps.append(repack_model_step)
        model_data = repack_model_step.properties.ModelArtifacts.S3ModelArtifacts

    def predict_wrapper(endpoint, session):
        return Predictor(endpoint, session)

    predictor_cls = predictor_cls or predict_wrapper

    model = Model(
        image_uri=image_uri or estimator.training_image_uri(),
        model_data=model_data,
        predictor_cls=predictor_cls,
        vpc_config=None,
        sagemaker_session=estimator.sagemaker_session,
        role=estimator.role,
        **kwargs,
    )
    model_step = CreateModelStep(
        name=f"{name}CreateModelStep",
        model=model,
        inputs=model_inputs,
    )
    if "entry_point" not in kwargs and depends_on:
        # The CreateModelStep is the first step in the collection, so it carries
        # the collection-level dependencies.
        model_step.add_depends_on(depends_on)
    steps.append(model_step)

    transformer = Transformer(
        model_name=model_step.properties.ModelName,
        instance_count=instance_count,
        instance_type=instance_type,
        strategy=strategy,
        assemble_with=assemble_with,
        output_path=output_path,
        output_kms_key=output_kms_key,
        accept=accept,
        max_concurrent_transforms=max_concurrent_transforms,
        max_payload=max_payload,
        env=env,
        tags=tags,
        base_transform_job_name=name,
        volume_kms_key=volume_kms_key,
        sagemaker_session=estimator.sagemaker_session,
    )
    transform_step = TransformStep(
        name=f"{name}TransformStep",
        transformer=transformer,
        inputs=transform_inputs,
    )
    steps.append(transform_step)

    self.steps = steps
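# --- Hedged sketch: how this step collection is typically used. ---
# The constructor above appears to belong to the EstimatorTransformer step collection
# in sagemaker.workflow.step_collections; the class name, estimator configuration, and
# S3 URIs below are illustrative assumptions, not taken from the excerpt.
from sagemaker.estimator import Estimator
from sagemaker.inputs import CreateModelInput, TransformInput
from sagemaker.workflow.step_collections import EstimatorTransformer

estimator = Estimator(
    image_uri="<training-image-uri>",   # hypothetical
    role="<execution-role-arn>",        # hypothetical
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

transform_collection = EstimatorTransformer(
    name="MyBatchScoring",
    estimator=estimator,
    model_data="s3://my-bucket/model/model.tar.gz",  # e.g. a training step's ModelArtifacts
    model_inputs=CreateModelInput(instance_type="ml.m5.large"),
    instance_count=1,
    instance_type="ml.m5.xlarge",
    transform_inputs=TransformInput(data="s3://my-bucket/batch-data"),
)

# transform_collection.steps expands to [CreateModelStep, TransformStep]
# (plus a leading _RepackModelStep when an entry_point is passed via kwargs),
# and the collection can be added to a Pipeline's `steps` list directly.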
def test_end_to_end_pipeline_successful_execution(
    sagemaker_session, region_name, role, pipeline_name, wait=False
):
    model_package_group_name = f"{pipeline_name}ModelPackageGroup"
    data_path = os.path.join(DATA_DIR, "workflow")
    default_bucket = sagemaker_session.default_bucket()

    # download the input data
    local_input_path = os.path.join(data_path, "abalone-dataset.csv")
    s3 = sagemaker_session.boto_session.resource("s3")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset.csv", local_input_path
    )

    # upload the input data to our bucket
    base_uri = f"s3://{default_bucket}/{pipeline_name}"
    with open(local_input_path) as data:
        body = data.read()
        input_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset.csv",
            sagemaker_session=sagemaker_session,
        )

    # download the batch transform data
    local_batch_path = os.path.join(data_path, "abalone-dataset-batch")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset-batch", local_batch_path
    )

    # upload the batch transform data
    with open(local_batch_path) as data:
        body = data.read()
        batch_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset-batch",
            sagemaker_session=sagemaker_session,
        )

    # define parameters
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
    input_data = ParameterString(
        name="InputData",
        default_value=input_data_uri,
    )
    batch_data = ParameterString(
        name="BatchData",
        default_value=batch_data_uri,
    )

    # define processing step
    framework_version = "0.23-1"
    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{pipeline_name}-process",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    step_process = ProcessingStep(
        name="AbaloneProcess",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(data_path, "abalone/preprocessing.py"),
    )

    # define training step
    model_path = f"s3://{default_bucket}/{pipeline_name}Train"
    image_uri = image_uris.retrieve(
        framework="xgboost",
        region=region_name,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        sagemaker_session=sagemaker_session,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="AbaloneTrain",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # define evaluation step
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{pipeline_name}-eval",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )
    step_eval = ProcessingStep(
        name="AbaloneEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(data_path, "abalone/evaluation.py"),
        property_files=[evaluation_report],
    )

    # define create model step
    model = Model(
        image_uri=image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_create_model = CreateModelStep(
        name="AbaloneCreateModel",
        model=model,
        inputs=inputs,
    )

    # define transform step
    transformer = Transformer(
        model_name=step_create_model.properties.ModelName,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{default_bucket}/{pipeline_name}Transform",
        sagemaker_session=sagemaker_session,
    )
    step_transform = TransformStep(
        name="AbaloneTransform",
        transformer=transformer,
        inputs=TransformInput(data=batch_data),
    )

    # define register model step
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # define condition step
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step_name=step_eval.name,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value",
        ),
        right=20.0,
    )
    step_cond = ConditionStep(
        name="AbaloneMSECond",
        conditions=[cond_lte],
        if_steps=[step_register, step_create_model, step_transform],
        else_steps=[],
    )

    # define pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
            batch_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )

    pipeline.create(role)
    execution = pipeline.start()
    execution_arn = execution.arn

    if wait:
        execution.wait()

    return execution_arn