def get_pipeline(region,
                 sagemaker_project_arn=None,
                 role=None,
                 default_bucket='',
                 pipeline_name='end-to-end-ml-sagemaker-pipeline',
                 model_package_group_name='end-to-end-ml-sm-model-package-group',
                 base_job_prefix='endtoendmlsm') -> Pipeline:
    """
    Gets the SM Pipeline.

    :param region: The AWS region in which the pipeline is created and run.
    :param role: The execution role.
    :param default_bucket: The bucket where pipeline artifacts are stored.
    :param pipeline_name: The name of the SageMaker pipeline.
    :param model_package_group_name: The model package group used to register the model.
    :param base_job_prefix: The prefix where pipeline artifacts are stored.
    :return: A Pipeline instance.
    """
    bucket_name = default_bucket
    prefix = 'endtoendmlsm'
    sagemaker_session = get_session(region, bucket_name)

    # ---------------------
    # Processing parameters
    # ---------------------
    # The path to the raw data.
    raw_data_path = 's3://gianpo-public/endtoendml/data/raw/predmain_raw_data_header.csv'
    raw_data_path_param = ParameterString(name="raw_data_path",
                                          default_value=raw_data_path)
    # The output path to the training data.
    train_data_path = 's3://{0}/{1}/data/preprocessed/train/'.format(
        bucket_name, prefix)
    train_data_path_param = ParameterString(name="train_data_path",
                                            default_value=train_data_path)
    # The output path to the validation data.
    val_data_path = 's3://{0}/{1}/data/preprocessed/val/'.format(
        bucket_name, prefix)
    val_data_path_param = ParameterString(name="val_data_path",
                                          default_value=val_data_path)
    # The output path to the featurizer model.
    model_path = 's3://{0}/{1}/output/sklearn/'.format(bucket_name, prefix)
    model_path_param = ParameterString(name="model_path",
                                       default_value=model_path)
    # The instance type for the processing job.
    processing_instance_type_param = ParameterString(
        name="processing_instance_type", default_value='ml.m5.large')
    # The instance count for the processing job.
    processing_instance_count_param = ParameterInteger(
        name="processing_instance_count", default_value=1)
    # The train/test split ratio parameter.
    train_test_split_ratio_param = ParameterString(
        name="train_test_split_ratio", default_value='0.2')

    # -------------------
    # Training parameters
    # -------------------
    # XGB hyperparameters.
    max_depth_param = ParameterString(name="max_depth", default_value='3')
    eta_param = ParameterString(name="eta", default_value='0.1')
    gamma_param = ParameterString(name="gamma", default_value='0')
    min_child_weight_param = ParameterString(name="min_child_weight",
                                             default_value='1')
    objective_param = ParameterString(name="objective",
                                      default_value='binary:logistic')
    num_round_param = ParameterString(name="num_round", default_value='10')
    eval_metric_param = ParameterString(name="eval_metric",
                                        default_value='auc')
    # The instance type for the training job.
    training_instance_type_param = ParameterString(
        name="training_instance_type", default_value='ml.m5.xlarge')
    # The instance count for the training job.
    training_instance_count_param = ParameterInteger(
        name="training_instance_count", default_value=1)
    # The training output path for the model.
    output_path = 's3://{0}/{1}/output/'.format(bucket_name, prefix)
    output_path_param = ParameterString(name="output_path",
                                        default_value=output_path)

    # --------------------------
    # Register model parameters
    # --------------------------
    # The default instance type for deployment.
    deploy_instance_type_param = ParameterString(
        name="deploy_instance_type", default_value='ml.m5.2xlarge')
    # The approval status for models added to the registry.
    model_approval_status_param = ParameterString(
        name="model_approval_status", default_value='PendingManualApproval')

    # --------------------------
    # Processing Step
    # --------------------------
    sklearn_processor = SKLearnProcessor(
        role=role,
        instance_type=processing_instance_type_param,
        instance_count=processing_instance_count_param,
        framework_version='0.20.0')
    inputs = [
        ProcessingInput(input_name='raw_data',
                        source=raw_data_path_param,
                        destination='/opt/ml/processing/input')
    ]
    outputs = [
        ProcessingOutput(output_name='train_data',
                         source='/opt/ml/processing/train',
                         destination=train_data_path_param),
        ProcessingOutput(output_name='val_data',
                         source='/opt/ml/processing/val',
                         destination=val_data_path_param),
        ProcessingOutput(output_name='model',
                         source='/opt/ml/processing/model',
                         destination=model_path_param)
    ]
    code_path = os.path.join(BASE_DIR, 'dataprep/preprocess.py')
    processing_step = ProcessingStep(name='Processing',
                                     code=code_path,
                                     processor=sklearn_processor,
                                     inputs=inputs,
                                     outputs=outputs,
                                     job_arguments=[
                                         '--train-test-split-ratio',
                                         train_test_split_ratio_param
                                     ])

    # --------------------------
    # Training Step
    # --------------------------
    hyperparameters = {
        "max_depth": max_depth_param,
        "eta": eta_param,
        "gamma": gamma_param,
        "min_child_weight": min_child_weight_param,
        "silent": 0,
        "objective": objective_param,
        "num_round": num_round_param,
        "eval_metric": eval_metric_param
    }
    entry_point = 'train.py'
    source_dir = os.path.join(BASE_DIR, 'train/')
    code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)

    estimator = XGBoost(entry_point=entry_point,
                        source_dir=source_dir,
                        output_path=output_path_param,
                        code_location=code_location,
                        hyperparameters=hyperparameters,
                        instance_type=training_instance_type_param,
                        instance_count=training_instance_count_param,
                        framework_version="0.90-2",
                        py_version="py3",
                        role=role)

    training_step = TrainingStep(
        name='Training',
        estimator=estimator,
        inputs={
            'train':
            TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.
                Outputs['train_data'].S3Output.S3Uri,
                content_type='text/csv'),
            'validation':
            TrainingInput(
                s3_data=processing_step.properties.ProcessingOutputConfig.
                Outputs['val_data'].S3Output.S3Uri,
                content_type='text/csv')
        })

    # --------------------------
    # Register Model Step
    # --------------------------
    code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)
    sklearn_model = SKLearnModel(
        name='end-to-end-ml-sm-skl-model-{0}'.format(str(int(time.time()))),
        model_data=processing_step.properties.ProcessingOutputConfig.
        Outputs['model'].S3Output.S3Uri,
        entry_point='inference.py',
        source_dir=os.path.join(BASE_DIR, 'deploy/sklearn/'),
        code_location=code_location,
        role=role,
        sagemaker_session=sagemaker_session,
        framework_version='0.20.0',
        py_version='py3')

    code_location = 's3://{0}/{1}/code'.format(bucket_name, prefix)
    xgboost_model = XGBoostModel(
        name='end-to-end-ml-sm-xgb-model-{0}'.format(str(int(time.time()))),
        model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        entry_point='inference.py',
        source_dir=os.path.join(BASE_DIR, 'deploy/xgboost/'),
        code_location=code_location,
        framework_version='0.90-2',
        py_version='py3',
        role=role,
        sagemaker_session=sagemaker_session)

    pipeline_model_name = 'end-to-end-ml-sm-xgb-skl-pipeline-{0}'.format(
        str(int(time.time())))
    pipeline_model = PipelineModel(name=pipeline_model_name,
                                   role=role,
                                   models=[sklearn_model, xgboost_model],
                                   sagemaker_session=sagemaker_session)

    register_model_step = RegisterModel(
        name='RegisterModel',
        content_types=['text/csv'],
        response_types=['application/json', 'text/csv'],
        inference_instances=[deploy_instance_type_param, 'ml.m5.large'],
        transform_instances=['ml.c5.4xlarge'],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status_param,
        model=pipeline_model)

    # --------------------------
    # Pipeline
    # --------------------------
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            raw_data_path_param, train_data_path_param, val_data_path_param,
            model_path_param, processing_instance_type_param,
            processing_instance_count_param, train_test_split_ratio_param,
            max_depth_param, eta_param, gamma_param, min_child_weight_param,
            objective_param, num_round_param, eval_metric_param,
            training_instance_type_param, training_instance_count_param,
            output_path_param, deploy_instance_type_param,
            model_approval_status_param
        ],
        steps=[processing_step, training_step, register_model_step],
        sagemaker_session=sagemaker_session,
    )

    response = pipeline.upsert(role_arn=role)
    print(response["PipelineArn"])
    return pipeline
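# A minimal usage sketch for get_pipeline() above; the region, role ARN, and
# bucket are placeholders, not values from the original project. get_pipeline()
# already upserts the pipeline, so only an execution needs to be started here.
if __name__ == "__main__":
    pipeline = get_pipeline(
        region="eu-west-1",
        role="arn:aws:iam::123456789012:role/SageMakerExecutionRole",
        default_bucket="my-sagemaker-bucket",
    )
    # Any pipeline parameter can be overridden per execution, e.g. the split ratio.
    execution = pipeline.start(parameters={"train_test_split_ratio": "0.3"})
    print(execution.arn)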
def run_model_monitor_job_processor(region, instance_type, role, data_capture_path, preprocessor_path, postprocessor_path, statistics_path, constraints_path, reports_path): data_capture_sub_path = data_capture_path[data_capture_path. rfind('datacapture/'):] data_capture_sub_path = data_capture_sub_path[data_capture_sub_path. find('/') + 1:] processing_output_paths = reports_path + '/' + data_capture_sub_path input_1 = ProcessingInput( input_name='input_1', source=data_capture_path, destination='/opt/ml/processing/input/endpoint/' + data_capture_sub_path, s3_data_type='S3Prefix', s3_input_mode='File') baseline = ProcessingInput(input_name='baseline', source=statistics_path, destination='/opt/ml/processing/baseline/stats', s3_data_type='S3Prefix', s3_input_mode='File') constraints = ProcessingInput( input_name='constraints', source=constraints_path, destination='/opt/ml/processing/baseline/constraints', s3_data_type='S3Prefix', s3_input_mode='File') post_processor_script = ProcessingInput( input_name='post_processor_script', source=postprocessor_path, destination='/opt/ml/processing/code/postprocessing', s3_data_type='S3Prefix', s3_input_mode='File') pre_processor_script = ProcessingInput( input_name='pre_processor_script', source=preprocessor_path, destination='/opt/ml/processing/code/preprocessing', s3_data_type='S3Prefix', s3_input_mode='File') outputs = ProcessingOutput(output_name='result', source='/opt/ml/processing/output', destination=processing_output_paths, s3_upload_mode='Continuous') processor = Processor( image_uri=get_model_monitor_container_uri(region), instance_count=1, instance_type=instance_type, role=role, env={ 'baseline_constraints': '/opt/ml/processing/baseline/constraints/constraints.json', 'baseline_statistics': '/opt/ml/processing/baseline/stats/statistics.json', 'dataset_format': '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}', 'dataset_source': '/opt/ml/processing/input/endpoint', 'output_path': '/opt/ml/processing/output', 'post_analytics_processor_script': '/opt/ml/processing/code/postprocessing/postprocessor.py', 'publish_cloudwatch_metrics': 'Disabled', 'record_preprocessor_script': '/opt/ml/processing/code/preprocessing/preprocessor.py' }) return processor.run(inputs=[ input_1, baseline, constraints, post_processor_script, pre_processor_script ], outputs=[outputs])
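# Hypothetical invocation of run_model_monitor_job_processor() defined above.
# All S3 URIs and the instance type are illustrative placeholders; the call
# blocks until the Model Monitor analysis container finishes and writes its
# report under reports_path.
run_model_monitor_job_processor(
    region="us-east-1",
    instance_type="ml.m5.xlarge",
    role="arn:aws:iam::123456789012:role/SageMakerExecutionRole",
    data_capture_path="s3://my-bucket/endpoint/datacapture/my-endpoint/AllTraffic/2021/01/01/10",
    preprocessor_path="s3://my-bucket/monitoring/preprocessor.py",
    postprocessor_path="s3://my-bucket/monitoring/postprocessor.py",
    statistics_path="s3://my-bucket/baseline/statistics.json",
    constraints_path="s3://my-bucket/baseline/constraints.json",
    reports_path="s3://my-bucket/monitoring/reports",
)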
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001' processor = ScriptProcessor( command=['python3'], image_uri='sagemaker-delta-sharing-processing-local', role=role, instance_count=1, instance_type='local') processor.run(code='processing_script.py', inputs=[ ProcessingInput(source='./profile/', destination='/opt/ml/processing/profile/') ], outputs=[ ProcessingOutput(output_name='delta_lake_processed_data', source='/opt/ml/processing/processed_data/') ]) preprocessing_job_description = processor.jobs[-1].describe() output_config = preprocessing_job_description['ProcessingOutputConfig'] print(output_config) for output in output_config['Outputs']: if output['OutputName'] == 'delta_lake_processed_data': delta_lake_processed_data_file = output['S3Output']['S3Uri'] bucket = delta_lake_processed_data_file.split("/")[:3][2] output_file_name = '/'.join( delta_lake_processed_data_file.split("/") [3:]) + "/total_cases_per_location.csv"
def processing_output(): return ProcessingOutput( source="/opt/ml/processing/spark-events/", destination=SPARK_EVENT_LOGS_S3_URI, s3_upload_mode="Continuous", )
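# A sketch of how the ProcessingOutput built by processing_output() might be
# attached to a Spark processing job so that event logs stream to S3 while the
# job runs (s3_upload_mode="Continuous"). The PySparkProcessor settings, role
# ARN, and entry script are assumptions, not part of the original snippet.
from sagemaker.spark.processing import PySparkProcessor

spark_processor = PySparkProcessor(
    base_job_name="spark-event-logs-demo",
    framework_version="2.4",
    role="arn:aws:iam::123456789012:role/SageMakerExecutionRole",
    instance_count=2,
    instance_type="ml.m5.xlarge",
)
spark_processor.run(
    submit_app="./code/preprocess.py",
    outputs=[processing_output()],
    logs=False,
)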
def test_one_step_ingestion_pipeline( sagemaker_session, feature_store_session, feature_definitions, role, pipeline_name ): instance_count = ParameterInteger(name="InstanceCount", default_value=1) instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge") input_name = "features.csv" input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv") input_data_uri = os.path.join( "s3://", sagemaker_session.default_bucket(), "py-sdk-ingestion-test-input/features.csv", ) with open(input_file_path, "r") as data: body = data.read() S3Uploader.upload_string_as_file_body( body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session, ) inputs = [ ProcessingInput( input_name=input_name, source=input_data_uri, destination="/opt/ml/processing/features.csv", ) ] feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}" feature_group = FeatureGroup( name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=feature_store_session, ) ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input( input_name, input_data_uri, s3_content_type="csv", s3_has_header=True, ) outputs = [ ProcessingOutput( output_name=output_name, app_managed=True, feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name), ) ] output_content_type = "CSV" output_config = {output_name: {"content_type": output_content_type}} job_argument = [f"--output-config '{json.dumps(output_config)}'"] temp_flow_path = "./ingestion.flow" with cleanup_feature_group(feature_group): json.dump(ingestion_only_flow, open(temp_flow_path, "w")) data_wrangler_processor = DataWranglerProcessor( role=role, data_wrangler_flow_source=temp_flow_path, instance_count=instance_count, instance_type=instance_type, sagemaker_session=sagemaker_session, max_runtime_in_seconds=86400, ) data_wrangler_step = ProcessingStep( name="ingestion-step", processor=data_wrangler_processor, inputs=inputs, outputs=outputs, job_arguments=job_argument, ) pipeline = Pipeline( name=pipeline_name, parameters=[instance_count, instance_type], steps=[data_wrangler_step], sagemaker_session=sagemaker_session, ) try: response = pipeline.create(role) create_arn = response["PipelineArn"] offline_store_s3_uri = os.path.join( "s3://", sagemaker_session.default_bucket(), feature_group_name ) feature_group.create( s3_uri=offline_store_s3_uri, record_identifier_name="f11", event_time_feature_name="f10", role_arn=role, enable_online_store=False, ) _wait_for_feature_group_create(feature_group) execution = pipeline.start() response = execution.describe() assert response["PipelineArn"] == create_arn try: execution.wait(delay=60, max_attempts=10) except WaiterError: pass execution_steps = execution.list_steps() assert len(execution_steps) == 1 assert execution_steps[0]["StepName"] == "ingestion-step" assert execution_steps[0]["StepStatus"] == "Succeeded" athena_query = feature_group.athena_query() with timeout(minutes=10): athena_query.run( query_string=f'SELECT * FROM "{athena_query.table_name}"', output_location=f"{offline_store_s3_uri}/query_results", ) athena_query.wait() assert "SUCCEEDED" == athena_query.get_query_execution().get("QueryExecution").get( "Status" ).get("State") df = athena_query.as_dataframe() assert pd.read_csv(input_file_path).shape[0] == df.shape[0] finally: try: pipeline.delete() except Exception as e: print(f"Delete pipeline failed with error: {e}") os.remove(temp_flow_path)
def test_workflow_with_clarify( data_config, data_bias_config, model_config, model_predicted_label_config, pipeline_name, role, sagemaker_session, ): instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") instance_count = ParameterInteger(name="InstanceCount", default_value=1) analysis_config = data_config.get_config() analysis_config.update(data_bias_config.get_config()) ( probability_threshold, predictor_config, ) = model_predicted_label_config.get_predictor_config() predictor_config.update(model_config.get_predictor_config()) analysis_config["methods"] = {"post_training_bias": {"methods": "all"}} analysis_config["predictor"] = predictor_config analysis_config["probability_threshold"] = probability_threshold analysis_config["methods"]["report"] = { "name": "report", "title": "Analysis Report" } with tempfile.TemporaryDirectory() as tmpdirname: analysis_config_file = os.path.join(tmpdirname, "analysis_config.json") with open(analysis_config_file, "w") as f: json.dump(analysis_config, f) config_input = ProcessingInput( input_name="analysis_config", source=analysis_config_file, destination="/opt/ml/processing/input/config", s3_data_type="S3Prefix", s3_input_mode="File", s3_compression_type="None", ) data_input = ProcessingInput( input_name="dataset", source=data_config.s3_data_input_path, destination="/opt/ml/processing/input/data", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type=data_config.s3_data_distribution_type, s3_compression_type=data_config.s3_compression_type, ) result_output = ProcessingOutput( source="/opt/ml/processing/output", destination=data_config.s3_output_path, output_name="analysis_result", s3_upload_mode="EndOfJob", ) processor = SageMakerClarifyProcessor( role="SageMakerRole", instance_count=instance_count, instance_type=instance_type, sagemaker_session=sagemaker_session, ) property_file = PropertyFile( name="BiasOutput", output_name="analysis_result", path="analysis.json", ) step_process = ProcessingStep( name="my-process", processor=processor, inputs=[data_input, config_input], outputs=[result_output], property_files=[property_file], ) cond_left = JsonGet( step=step_process, property_file="BiasOutput", json_path= "post_training_bias_metrics.facets.F1[0].metrics[0].value", ) step_condition = ConditionStep( name="bias-condition", conditions=[ConditionLessThanOrEqualTo(left=cond_left, right=1)], if_steps=[], else_steps=[], ) pipeline = Pipeline( name=pipeline_name, parameters=[instance_type, instance_count], steps=[step_process, step_condition], sagemaker_session=sagemaker_session, ) try: response = pipeline.create(role) create_arn = response["PipelineArn"] execution = pipeline.start(parameters={}) response = execution.describe() assert response["PipelineArn"] == create_arn try: execution.wait(delay=30, max_attempts=60) except WaiterError: pass execution_steps = execution.list_steps() assert len(execution_steps) == 2 assert execution_steps[1]["StepName"] == "my-process" assert execution_steps[1]["StepStatus"] == "Succeeded" assert execution_steps[0]["StepName"] == "bias-condition" assert execution_steps[0]["StepStatus"] == "Succeeded" assert execution_steps[0]["Metadata"]["Condition"][ "Outcome"] == "True" finally: try: pipeline.delete() except Exception: pass
def get_pipeline( region, role=None, default_bucket=None, model_package_group_name="CustomerChurnPackageGroup", # Choose any name pipeline_name="CustomerChurnDemo-p-ewf8t7lvhivm", # You can find your pipeline name in the Studio UI (project -> Pipelines -> name) base_job_prefix="CustomerChurn", # Choose any name ): """Gets a SageMaker ML Pipeline instance working with on CustomerChurn data. Args: region: AWS region to create and run the pipeline. role: IAM role to create and run steps and pipeline. default_bucket: the bucket to use for storing the artifacts Returns: an instance of a pipeline """ sagemaker_session = get_session(region, default_bucket) if role is None: role = sagemaker.session.get_execution_role(sagemaker_session) # Parameters for pipeline execution processing_instance_count = ParameterInteger( name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge") training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge") model_approval_status = ParameterString( name="ModelApprovalStatus", default_value= "PendingManualApproval", # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval. ) input_data = ParameterString( name="InputDataUrl", default_value= f"s3://EXAMPLE-BUCKET/PATH/TO/RawData.csv", # Change this to point to the s3 location of your raw input data. ) # Processing step for feature engineering sklearn_processor = SKLearnProcessor( framework_version="0.23-1", instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name= f"{base_job_prefix}/sklearn-CustomerChurn-preprocess", # choose any name sagemaker_session=sagemaker_session, role=role, ) step_process = ProcessingStep( name="CustomerChurnProcess", # choose any name processor=sklearn_processor, outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), ], code=os.path.join(BASE_DIR, "preprocess.py"), job_arguments=["--input-data", input_data], ) # Training step for generating model artifacts model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/CustomerChurnTrain" image_uri = sagemaker.image_uris.retrieve( framework= "xgboost", # we are using the Sagemaker built in xgboost algorithm region=region, version="1.0-1", py_version="py3", instance_type=training_instance_type, ) xgb_train = Estimator( image_uri=image_uri, instance_type=training_instance_type, instance_count=1, output_path=model_path, base_job_name=f"{base_job_prefix}/CustomerChurn-train", sagemaker_session=sagemaker_session, role=role, ) xgb_train.set_hyperparameters( objective="binary:logistic", num_round=50, max_depth=5, eta=0.2, gamma=4, min_child_weight=6, subsample=0.7, silent=0, ) step_train = TrainingStep( name="CustomerChurnTrain", estimator=xgb_train, inputs={ "train": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig. Outputs["train"].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig. 
Outputs["validation"].S3Output.S3Uri, content_type="text/csv", ), }, ) # Processing step for evaluation script_eval = ScriptProcessor( image_uri=image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-CustomerChurn-eval", sagemaker_session=sagemaker_session, role=role, ) evaluation_report = PropertyFile( name="EvaluationReport", output_name="evaluation", path="evaluation.json", ) step_eval = ProcessingStep( name="CustomerChurnEval", processor=script_eval, inputs=[ ProcessingInput( source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig. Outputs["test"].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(BASE_DIR, "evaluate.py"), property_files=[evaluation_report], ) # Register model step that will be conditionally executed model_metrics = ModelMetrics(model_statistics=MetricsSource( s3_uri="{}/evaluation.json".format( step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0] ["S3Output"]["S3Uri"]), content_type="application/json", )) # Register model step that will be conditionally executed step_register = RegisterModel( name="CustomerChurnRegisterModel", estimator=xgb_train, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=model_metrics, ) # Condition step for evaluating model quality and branching execution cond_lte = ConditionGreaterThanOrEqualTo( # You can change the condition here left=JsonGet( step=step_eval, property_file=evaluation_report, json_path= "binary_classification_metrics.accuracy.value", # This should follow the structure of your report_dict defined in the evaluate.py file. ), right=0.8, # You can change the threshold here ) step_cond = ConditionStep( name="CustomerChurnAccuracyCond", conditions=[cond_lte], if_steps=[step_register], else_steps=[], ) # Pipeline instance pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, input_data, ], steps=[step_process, step_train, step_eval, step_cond], sagemaker_session=sagemaker_session, ) return pipeline
def test_three_step_definition( sagemaker_session, region_name, role, script_dir, pipeline_name, athena_dataset_definition, ): framework_version = "0.20.0" instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") instance_count = ParameterInteger(name="InstanceCount", default_value=1) output_prefix = ParameterString(name="OutputPrefix", default_value="output") input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" sklearn_processor = SKLearnProcessor( framework_version=framework_version, instance_type=instance_type, instance_count=instance_count, base_job_name="test-sklearn", sagemaker_session=sagemaker_session, role=role, ) step_process = ProcessingStep( name="my-process", display_name="ProcessingStep", description="description for Processing step", processor=sklearn_processor, inputs=[ ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), ProcessingInput(dataset_definition=athena_dataset_definition), ], outputs=[ ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), ProcessingOutput( output_name="test_data", source="/opt/ml/processing/test", destination=Join( on="/", values=[ "s3:/", sagemaker_session.default_bucket(), "test-sklearn", output_prefix, ExecutionVariables.PIPELINE_EXECUTION_ID, ], ), ), ], code=os.path.join(script_dir, "preprocessing.py"), ) sklearn_train = SKLearn( framework_version=framework_version, entry_point=os.path.join(script_dir, "train.py"), instance_type=instance_type, sagemaker_session=sagemaker_session, role=role, ) step_train = TrainingStep( name="my-train", display_name="TrainingStep", description="description for Training step", estimator=sklearn_train, inputs=TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train_data" ].S3Output.S3Uri ), ) model = Model( image_uri=sklearn_train.image_uri, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, sagemaker_session=sagemaker_session, role=role, ) model_inputs = CreateModelInput( instance_type="ml.m5.large", accelerator_type="ml.eia1.medium", ) step_model = CreateModelStep( name="my-model", display_name="ModelStep", description="description for Model step", model=model, inputs=model_inputs, ) pipeline = Pipeline( name=pipeline_name, parameters=[instance_type, instance_count, output_prefix], steps=[step_process, step_train, step_model], sagemaker_session=sagemaker_session, ) definition = json.loads(pipeline.definition()) assert definition["Version"] == "2020-12-01" assert set(tuple(param.items()) for param in definition["Parameters"]) == set( [ tuple( { "Name": "InstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge", }.items() ), tuple({"Name": "InstanceCount", "Type": "Integer", "DefaultValue": 1}.items()), tuple( { "Name": "OutputPrefix", "Type": "String", "DefaultValue": "output", }.items() ), ] ) steps = definition["Steps"] assert len(steps) == 3 names_and_types = [] display_names_and_desc = [] processing_args = {} training_args = {} for step in steps: names_and_types.append((step["Name"], step["Type"])) display_names_and_desc.append((step["DisplayName"], step["Description"])) if step["Type"] == "Processing": processing_args = step["Arguments"] if step["Type"] == "Training": training_args = step["Arguments"] if step["Type"] == "Model": model_args = step["Arguments"] assert set(names_and_types) == set( [ ("my-process", "Processing"), ("my-train", "Training"), ("my-model", "Model"), ] ) assert set(display_names_and_desc) == set( [ ("ProcessingStep", 
"description for Processing step"), ("TrainingStep", "description for Training step"), ("ModelStep", "description for Model step"), ] ) assert processing_args["ProcessingResources"]["ClusterConfig"] == { "InstanceType": {"Get": "Parameters.InstanceType"}, "InstanceCount": {"Get": "Parameters.InstanceCount"}, "VolumeSizeInGB": 30, } assert training_args["ResourceConfig"] == { "InstanceCount": 1, "InstanceType": {"Get": "Parameters.InstanceType"}, "VolumeSizeInGB": 30, } assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri" } assert model_args["PrimaryContainer"]["ModelDataUrl"] == { "Get": "Steps.my-train.ModelArtifacts.S3ModelArtifacts" } try: response = pipeline.create(role) create_arn = response["PipelineArn"] assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn, ) finally: try: pipeline.delete() except Exception: pass
def test_sklearn_with_all_parameters(exists_mock, isfile_mock, botocore_resolver, sklearn_version, sagemaker_session): botocore_resolver.return_value.construct_endpoint.return_value = { "hostname": ECR_HOSTNAME } processor = SKLearnProcessor( role=ROLE, framework_version=sklearn_version, instance_type="ml.m4.xlarge", instance_count=1, volume_size_in_gb=100, volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key", output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key", max_runtime_in_seconds=3600, base_job_name="my_sklearn_processor", env={"my_env_variable": "my_env_variable_value"}, tags=[{ "Key": "my-tag", "Value": "my-tag-value" }], network_config=NetworkConfig( subnets=["my_subnet_id"], security_group_ids=["my_security_group_id"], enable_network_isolation=True, encrypt_inter_container_traffic=True, ), sagemaker_session=sagemaker_session, ) processor.run( code="/local/path/to/processing_code.py", inputs=[ ProcessingInput( source="s3://path/to/my/dataset/census.csv", destination="/container/path/", input_name="my_dataset", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type="FullyReplicated", s3_compression_type="None", ) ], outputs=[ ProcessingOutput( source="/container/path/", destination="s3://uri/", output_name="my_output", s3_upload_mode="EndOfJob", ) ], arguments=["--drop-columns", "'SelfEmployed'"], wait=True, logs=False, job_name="my_job_name", experiment_config={"ExperimentName": "AnExperiment"}, ) expected_args = _get_expected_args_all_parameters( processor._current_job_name) sklearn_image_uri = ( "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3" ).format(sklearn_version) expected_args["app_specification"]["ImageUri"] = sklearn_image_uri sagemaker_session.process.assert_called_with(**expected_args)
def test_processor_with_all_parameters(sagemaker_session): processor = Processor( role=ROLE, image_uri=CUSTOM_IMAGE_URI, instance_count=1, instance_type="ml.m4.xlarge", sagemaker_session=sagemaker_session, entrypoint=[ "python3", "/opt/ml/processing/input/code/processing_code.py" ], volume_size_in_gb=100, volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key", output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key", max_runtime_in_seconds=3600, base_job_name="processor_base_name", env={"my_env_variable": "my_env_variable_value"}, tags=[{ "Key": "my-tag", "Value": "my-tag-value" }], network_config=NetworkConfig( subnets=["my_subnet_id"], security_group_ids=["my_security_group_id"], enable_network_isolation=True, encrypt_inter_container_traffic=True, ), ) processor.run( inputs=[ ProcessingInput( source="s3://path/to/my/dataset/census.csv", destination="/container/path/", input_name="my_dataset", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type="FullyReplicated", s3_compression_type="None", ) ], outputs=[ ProcessingOutput( source="/container/path/", destination="s3://uri/", output_name="my_output", s3_upload_mode="EndOfJob", ) ], arguments=["--drop-columns", "'SelfEmployed'"], wait=True, logs=False, job_name="my_job_name", experiment_config={"ExperimentName": "AnExperiment"}, ) expected_args = _get_expected_args_all_parameters( processor._current_job_name) # Drop the "code" input from expected values. expected_args["inputs"] = [expected_args["inputs"][0]] sagemaker_session.process.assert_called_with(**expected_args)
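# Both unit tests above assert against sagemaker_session.process(...), which only
# works when the session is a mock; a minimal fixture sketch under that assumption
# (REGION is a placeholder, not taken from the original test module).
import pytest
from unittest.mock import MagicMock, Mock

REGION = "us-west-2"

@pytest.fixture()
def sagemaker_session():
    boto_mock = Mock(name="boto_session", region_name=REGION)
    return MagicMock(
        name="sagemaker_session",
        boto_session=boto_mock,
        boto_region_name=REGION,
        config=None,
        local_mode=False,
    )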
def getter(self, attr: str) -> Dict[str, Any]: data = { 'tfrecord_processing': { 'endpoint': ['python3', 'criteo_ads_data/run_processing.py'], 'inputs': [ ProcessingInput( source='s3://criteo-ads-data/prod/train_csv', destination='/opt/ml/processing/input', s3_data_distribution_type='ShardedByS3Key', ) ], 'outputs': [ ProcessingOutput( source='/opt/ml/processing/output', destination= 's3://criteo-ads-data/prod/train_tfrecord_gz', ) ], 'arguments': [ '--input_path=/opt/ml/processing/input', '--output_path=/opt/ml/processing/output', ], 'sm_config': SagemakerProcessingConfig( project_name=self.project_name, env=self.env, region_name=self.region_name, current_time=self.current_time, sm_instance_type='ml.c5.2xlarge', sm_instance_count=20, sm_volumesize=100, max_run=1 * 60 * 60, ) }, 'layer_processing': { 'endpoint': ['python3', 'criteo_ads_data/run_processing_layer.py'], 'inputs': [ ProcessingInput( source= 's3://criteo-ads-data/prod/train_tfrecord_gz/train', destination='/opt/ml/processing/input', s3_data_distribution_type='FullyReplicated', ) ], 'outputs': [ ProcessingOutput( source='/opt/ml/processing/output', destination='s3://criteo-ads-data/prod/proc_layer', ) ], 'arguments': [ '--input_path=/opt/ml/processing/input', '--output_path=/opt/ml/processing/output', ], 'sm_config': SagemakerProcessingConfig( project_name=self.project_name, env=self.env, region_name=self.region_name, current_time=self.current_time, sm_instance_type='ml.c5.9xlarge', sm_instance_count=1, sm_volumesize=100, max_run=24 * 60 * 60, ) }, 'estimator': { 'sm_input': { 'train': TrainingInput( s3_data= 's3://criteo-ads-data/prod/train_tfrecord_100000_gz/train', distribution='FullyReplicated', ), 'test': TrainingInput( s3_data= 's3://criteo-ads-data/prod/train_tfrecord_100000_gz/test', distribution='FullyReplicated', ), 'layer': TrainingInput( s3_data='s3://criteo-ads-data/prod/proc_layer_100000', distribution='FullyReplicated', ), }, 'shared_hyperparameters': { 'tf_logs_path': self.tf_logs_path, 'batch_size': 512, }, 'sm_config': SagemakerTrainingConfig( project_name=self.project_name, env=self.env, region_name=self.region_name, current_time=self.current_time, sm_instance_type='ml.c5.2xlarge', sm_instance_count=1, sm_volumesize=300, max_run=1 * 24 * 60 * 60, ) }, 'hparam_tuning': { 'objective_metric_name': 'validation:loss', 'metric_definitions': [ { 'Name': 'train:loss', 'Regex': '.*loss: ([0-9\\.]+) - auc: [0-9\\.]+.*' }, { 'Name': 'train:auc', 'Regex': '.*loss: [0-9\\.]+ - auc: ([0-9\\.]+).*' }, { 'Name': 'validation:loss', 'Regex': '.*step - loss: [0-9\\.]+ - auc: [0-9\\.]+ - val_loss: ([0-9\\.]+) - val_auc: [0-9\\.]+.*' }, { 'Name': 'validation:auc', 'Regex': '.*step - loss: [0-9\\.]+ - auc: [0-9\\.]+ - val_loss: [0-9\\.]+ - val_auc: ([0-9\\.]+).*' }, ], 'hyperparameter_ranges': { 'epochs': IntegerParameter(1, 50), 'batch_size': CategoricalParameter([64, 128, 256, 512]) }, 'objective_type': 'Minimize', 'max_jobs': 5, 'max_parallel_jobs': 5, }, } return data.get(attr)
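# A hypothetical way the dictionary returned by getter() could be consumed when
# launching the TFRecord preprocessing job; `config` (an instance of the class
# defining getter()) and `script_processor` are assumed to be built elsewhere
# and are not part of the original code.
cfg = config.getter("tfrecord_processing")
script_processor.run(
    code=cfg["endpoint"][1],   # 'criteo_ads_data/run_processing.py'
    inputs=cfg["inputs"],
    outputs=cfg["outputs"],
    arguments=cfg["arguments"],
)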
def test_processing_step_with_placeholders(sklearn_processor_fixture, sagemaker_session, sfn_client, sfn_role_arn, sagemaker_role_arn): region = boto3.session.Session().region_name input_data = f"s3://sagemaker-sample-data-{region}/processing/census/census-income.csv" input_s3 = sagemaker_session.upload_data( path=os.path.join(DATA_DIR, 'sklearn_processing'), bucket=sagemaker_session.default_bucket(), key_prefix='integ-test-data/sklearn_processing/code') output_s3 = f"s3://{sagemaker_session.default_bucket()}/integ-test-data/sklearn_processing" inputs = [ ProcessingInput(source=input_data, destination='/opt/ml/processing/input', input_name='input-1'), ProcessingInput(source=input_s3 + '/preprocessor.py', destination='/opt/ml/processing/input/code', input_name='code'), ] outputs = [ ProcessingOutput(source='/opt/ml/processing/train', destination=output_s3 + '/train_data', output_name='train_data'), ProcessingOutput(source='/opt/ml/processing/test', destination=output_s3 + '/test_data', output_name='test_data'), ] # Build workflow definition execution_input = ExecutionInput( schema={ 'image_uri': str, 'instance_count': int, 'entrypoint': str, 'role': str, 'volume_size_in_gb': int, 'max_runtime_in_seconds': int, 'container_arguments': [str], }) parameters = { 'AppSpecification': { 'ContainerEntrypoint': execution_input['entrypoint'], 'ImageUri': execution_input['image_uri'] }, 'ProcessingResources': { 'ClusterConfig': { 'InstanceCount': execution_input['instance_count'], 'VolumeSizeInGB': execution_input['volume_size_in_gb'] } }, 'RoleArn': execution_input['role'], 'StoppingCondition': { 'MaxRuntimeInSeconds': execution_input['max_runtime_in_seconds'] } } job_name = generate_job_name() processing_step = ProcessingStep( 'create_processing_job_step', processor=sklearn_processor_fixture, job_name=job_name, inputs=inputs, outputs=outputs, container_arguments=execution_input['container_arguments'], container_entrypoint=execution_input['entrypoint'], parameters=parameters) processing_step.add_retry(SAGEMAKER_RETRY_STRATEGY) workflow_graph = Chain([processing_step]) with timeout(minutes=DEFAULT_TIMEOUT_MINUTES): workflow = create_workflow_and_check_definition( workflow_graph=workflow_graph, workflow_name=unique_name_from_base( "integ-test-processing-step-workflow"), sfn_client=sfn_client, sfn_role_arn=sfn_role_arn) execution_input = { 'image_uri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3', 'instance_count': 1, 'entrypoint': ['python3', '/opt/ml/processing/input/code/preprocessor.py'], 'role': sagemaker_role_arn, 'volume_size_in_gb': 30, 'max_runtime_in_seconds': 500, 'container_arguments': ['--train-test-split-ratio', '0.2'] } # Execute workflow execution = workflow.execute(inputs=execution_input) execution_output = execution.get_output(wait=True) # Check workflow output assert execution_output.get("ProcessingJobStatus") == "Completed" # Cleanup state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
def test_processing_step(sklearn_processor_fixture, sagemaker_session, sfn_client, sfn_role_arn): region = boto3.session.Session().region_name input_data = 's3://sagemaker-sample-data-{}/processing/census/census-income.csv'.format( region) input_s3 = sagemaker_session.upload_data( path=os.path.join(DATA_DIR, 'sklearn_processing'), bucket=sagemaker_session.default_bucket(), key_prefix='integ-test-data/sklearn_processing/code') output_s3 = 's3://' + sagemaker_session.default_bucket( ) + '/integ-test-data/sklearn_processing' inputs = [ ProcessingInput(source=input_data, destination='/opt/ml/processing/input', input_name='input-1'), ProcessingInput(source=input_s3 + '/preprocessor.py', destination='/opt/ml/processing/input/code', input_name='code'), ] outputs = [ ProcessingOutput(source='/opt/ml/processing/train', destination=output_s3 + '/train_data', output_name='train_data'), ProcessingOutput(source='/opt/ml/processing/test', destination=output_s3 + '/test_data', output_name='test_data'), ] job_name = generate_job_name() processing_step = ProcessingStep( 'create_processing_job_step', processor=sklearn_processor_fixture, job_name=job_name, inputs=inputs, outputs=outputs, container_arguments=['--train-test-split-ratio', '0.2'], container_entrypoint=[ 'python3', '/opt/ml/processing/input/code/preprocessor.py' ], ) processing_step.add_retry(SAGEMAKER_RETRY_STRATEGY) workflow_graph = Chain([processing_step]) with timeout(minutes=DEFAULT_TIMEOUT_MINUTES): # Create workflow and check definition workflow = create_workflow_and_check_definition( workflow_graph=workflow_graph, workflow_name=unique_name_from_base( "integ-test-processing-step-workflow"), sfn_client=sfn_client, sfn_role_arn=sfn_role_arn) # Execute workflow execution = workflow.execute() execution_output = execution.get_output(wait=True) # Check workflow output assert execution_output.get("ProcessingJobStatus") == "Completed" # Cleanup state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
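# A plausible sketch of the sklearn_processor_fixture shared by the two Step
# Functions integration tests above; the framework version and instance settings
# are assumptions.
import pytest
from sagemaker.sklearn.processing import SKLearnProcessor

@pytest.fixture(scope="module")
def sklearn_processor_fixture(sagemaker_session, sagemaker_role_arn):
    return SKLearnProcessor(
        framework_version="0.20.0",
        role=sagemaker_role_arn,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        sagemaker_session=sagemaker_session,
        max_runtime_in_seconds=300,
    )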
def runProcessing(
    self,
    entrypoint=None,
    command=None,
    env=None,
    code=None,
    arguments=None,
    inputs=None,
    outputs=None,
    instance_type=constants.DEFAULT_INSTANCE_TYPE_TRAINING,
    instance_count=constants.DEFAULT_INSTANCE_COUNT,
    role_name=constants.DEFAULT_IAM_ROLE,
    volume_size=constants.DEFAULT_VOLUME_SIZE,
    max_run_mins=constants.DEFAULT_MAX_RUN,
    tags=None,
    input_distribution="FullyReplicated",
    dependencies=None,
):
    logger.info(
        f"===== Running a processing job {self.task_name} entrypoint={entrypoint} "
        f"command={command} code={code} arguments={arguments}... =====")
    job_name = self._getJobName()

    # Use None defaults and copy caller-provided collections: mutable default
    # arguments are created once at definition time, so appending to them would
    # leak state across calls.
    env = dict(env) if env else dict()
    inputs = list(inputs) if inputs else list()
    outputs = list(outputs) if outputs else list()
    tags = dict(tags) if tags else dict()
    dependencies = list(dependencies) if dependencies else list()

    # ## Outputs
    # state - continuously updated
    state_path = "/opt/ml/processing/state"
    outputs.append(
        ProcessingOutput(state_path, self.stateS3Uri, "state", "Continuous"))
    env["SSM_STATE"] = state_path

    # output - copied by end of job
    output_path = "/opt/ml/processing/output"
    output_s3_uri = sagemaker.s3.s3_path_join(self.baseTaskS3Uri, job_name,
                                              "output")
    outputs.append(
        ProcessingOutput(output_path, output_s3_uri, "output", "EndOfJob"))
    env["SSM_OUTPUT"] = output_path

    # ## Inputs
    # prev state
    bucket, prefix = sagemaker.s3.parse_s3_url(self.stateS3Uri)
    if self.smSession.list_s3_files(bucket, prefix):
        prev_state_path = "/opt/ml/processing/state_prev"
        inputs.append(
            ProcessingInput(
                self.stateS3Uri,
                prev_state_path,
                "state_prev",
                s3_data_distribution_type="FullyReplicated",
            ))

    # dependencies
    # append the internal dependencies
    dependencies.extend(self.internalDependencies)
    for dep in dependencies:
        dep = os.path.abspath(dep)
        basename = os.path.basename(dep)
        local_path = f"/opt/ml/processing/input/code/{basename}"
        inputs.append(
            ProcessingInput(
                dep,
                local_path,
                "DEP_" + basename,
                s3_data_distribution_type="FullyReplicated",
            ))

    # input data
    if self.inputS3Uri:
        data_path = "/opt/ml/processing/data"
        inputs.append(
            ProcessingInput(
                self.inputS3Uri,
                data_path,
                "data",
                s3_data_distribution_type=input_distribution,
            ))
        env["SM_CHANNEL_DATA"] = data_path

    tags["SimpleSagemakerTask"] = self.task_name
    tags["SimpleSagemakerVersion"] = VERSION
    tags = [{"Key": k, "Value": v} for k, v in tags.items()]

    additional_args = dict()
    if code:
        processor_class = ScriptProcessor
        additional_args["command"] = command
    else:
        assert (
            not command
        ), "Command can't be given where code isn't given (for the `Processor` class)"
        processor_class = Processor
        additional_args["entrypoint"] = entrypoint

    processor = processor_class(
        role=role_name,
        image_uri=self.image_uri,
        instance_count=instance_count,
        instance_type=instance_type,
        volume_size_in_gb=volume_size,
        max_runtime_in_seconds=max_run_mins * 60,
        sagemaker_session=self.smSession,
        tags=tags,
        env=env,
        **additional_args,
    )
    if code:
        processor.run(
            code=code,
            inputs=inputs,
            outputs=outputs,
            arguments=arguments,
            job_name=job_name,
        )
    else:
        processor.run(
            inputs=inputs,
            outputs=outputs,
            arguments=arguments,
            job_name=job_name,
        )
    processing_job_description = self.smSession.describe_processing_job(
        job_name)

    self.estimators.append(processor)
    self.jobNames.append(job_name)
    self.descriptions.append(processing_job_description)
    # print(processing_job_description)
    # if "Completed" != processing_job_description["ProcessingJobStatus"]:
    #     logger.error(
    #         f"Task failed with status: {processing_job_description['ProcessingJobStatus']}"
    #     )
    return job_name
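# Hypothetical call to runProcessing() above; `task` stands in for an instance
# of the class that defines it, and the script, environment, and instance
# settings are placeholders.
job_name = task.runProcessing(
    code="my_processing_script.py",
    command=["python3"],
    env={"MY_FLAG": "1"},
    arguments=["--train-test-split-ratio", "0.2"],
    instance_type="ml.m5.xlarge",
    instance_count=1,
    max_run_mins=60,
)
print(f"Started processing job: {job_name}")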
instance_type='local', role=role) print('Starting processing job.') print( 'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.' ) processor.run(code='processing_script.py', inputs=[ ProcessingInput( source='./dependencies/', destination='/opt/ml/processing/dependencies/'), ProcessingInput(source='./input_data/', destination='/opt/ml/processing/input_data/') ], outputs=[ ProcessingOutput(output_name='tokenized_words_data', source='/opt/ml/processing/processed_data/') ], arguments=['job-type', 'word-tokenize']) preprocessing_job_description = processor.jobs[-1].describe() output_config = preprocessing_job_description['ProcessingOutputConfig'] print(output_config) for output in output_config['Outputs']: if output['OutputName'] == 'tokenized_words_data': tokenized_words_data_file = output['S3Output']['S3Uri'] print('Output file is located on: {}'.format(tokenized_words_data_file))
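# Once the local-mode job above completes, the tokenized output can be pulled
# down for inspection; a sketch assuming sagemaker.s3.S3Downloader and a local
# ./output_data directory.
import os
from sagemaker.s3 import S3Downloader

S3Downloader.download(s3_uri=tokenized_words_data_file, local_path="./output_data")
print(os.listdir("./output_data"))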
def define_inference_pipeline( sm_role, workflow_execution_role, inference_pipeline_name, return_yaml=True, dump_yaml_file="templates/sagemaker_inference_pipeline.yaml", ): """ Return YAML definition of the training pipeline, which consists of multiple Amazon StepFunction steps sm_role: ARN of the SageMaker execution role workflow_execution_role: ARN of the StepFunction execution role return_yaml: Return YAML representation or not, if False, it returns an instance of `stepfunctions.workflow.WorkflowObject` dump_yaml_file: If not None, a YAML file will be generated at this file location """ # Pass required parameters dynamically for each execution using placeholders. execution_input = ExecutionInput( schema={ "InputDataURL": str, "PreprocessingJobName": str, "InferenceJobName": str, "ProcModelS3": str, "PreprocessingCodeURL": str, "InferenceCodeURL": str, "ModelS3": str, "PreprocessedTrainDataURL": str, "PreprocessedTestDataURL": str, "OutputPathURL": str, }) """ Create Preprocessing Model from model artifact. """ # sagemaker_session = sagemaker.Session() sklearn_processor = SKLearnProcessor( framework_version="0.20.0", role=sm_role, instance_type="ml.m5.xlarge", instance_count=1, max_runtime_in_seconds=1200, ) # Create ProcessingInputs and ProcessingOutputs objects for Inputs and # Outputs respectively for the SageMaker Processing Job inputs = [ ProcessingInput( source=execution_input["InputDataURL"], destination="/opt/ml/processing/input", input_name="input-1", ), ProcessingInput( source=execution_input["PreprocessingCodeURL"], destination="/opt/ml/processing/input/code", input_name="code", ), ProcessingInput( source=execution_input["ProcModelS3"], destination="/opt/ml/processing/model", input_name="proc_model", ), ] outputs = [ ProcessingOutput( source="/opt/ml/processing/test", destination=execution_input["PreprocessedTestDataURL"], output_name="test_data", ), ] processing_step = ProcessingStep( "SageMaker pre-processing step", processor=sklearn_processor, job_name=execution_input["PreprocessingJobName"], inputs=inputs, outputs=outputs, container_arguments=["--mode", "infer"], container_entrypoint=[ "python3", "/opt/ml/processing/input/code/preprocessing.py", ], ) """ Create inference with sklearn processing step. Inputs are the preprocessed data S3 URL, the inference code S3 URL, and the model S3 URL. Output is the inferred data. """ sklearn_processor2 = SKLearnProcessor( framework_version="0.20.0", role=sm_role, instance_type="ml.m5.xlarge", instance_count=1, max_runtime_in_seconds=1200, ) inputs = [ ProcessingInput( source=execution_input["PreprocessedTestDataURL"], destination="/opt/ml/processing/input", input_name="input-1", ), ProcessingInput( source=execution_input["InferenceCodeURL"], destination="/opt/ml/processing/input/code", input_name="code", ), ProcessingInput( source=execution_input["ModelS3"], destination="/opt/ml/processing/model", input_name="model", ), ] outputs = [ ProcessingOutput( source="/opt/ml/processing/test", destination=execution_input["OutputPathURL"], output_name="test_data", ), ] inference_step = ProcessingStep( "SageMaker inference step", processor=sklearn_processor2, job_name=execution_input["InferenceJobName"], inputs=inputs, outputs=outputs, container_entrypoint=[ "python3", "/opt/ml/processing/input/code/inference.py", ], ) # Create Fail state to mark the workflow failed in case any of the steps fail. 
failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail( "ML Workflow failed", cause="SageMakerProcessingJobFailed") # Add the Error handling in the workflow catch_state_processing = stepfunctions.steps.states.Catch( error_equals=["States.TaskFailed"], next_step=failed_state_sagemaker_processing_failure, ) processing_step.add_catch(catch_state_processing) inference_step.add_catch(catch_state_processing) # Create the Workflow workflow_graph = Chain([processing_step, inference_step]) inference_pipeline = Workflow( name=inference_pipeline_name, definition=workflow_graph, role=workflow_execution_role, ) return inference_pipeline
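# A hedged example of materializing and running the Step Functions workflow
# returned by define_inference_pipeline(); all ARNs and S3 URLs are placeholders.
inference_pipeline = define_inference_pipeline(
    sm_role="arn:aws:iam::123456789012:role/SageMakerExecutionRole",
    workflow_execution_role="arn:aws:iam::123456789012:role/StepFunctionsWorkflowExecutionRole",
    inference_pipeline_name="my-inference-pipeline",
)
inference_pipeline.create()
execution = inference_pipeline.execute(
    inputs={
        "InputDataURL": "s3://my-bucket/input/data.csv",
        "PreprocessingJobName": "preprocess-job-001",
        "InferenceJobName": "inference-job-001",
        "ProcModelS3": "s3://my-bucket/models/proc_model.tar.gz",
        "PreprocessingCodeURL": "s3://my-bucket/code/preprocessing.py",
        "InferenceCodeURL": "s3://my-bucket/code/inference.py",
        "ModelS3": "s3://my-bucket/models/model.tar.gz",
        "PreprocessedTrainDataURL": "s3://my-bucket/preprocessed/train",
        "PreprocessedTestDataURL": "s3://my-bucket/preprocessed/test",
        "OutputPathURL": "s3://my-bucket/inference/output",
    })
execution.get_output(wait=True)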
def _run( self, data_config, analysis_config, wait, logs, job_name, kms_key, experiment_config, ): """Runs a ProcessingJob with the Sagemaker Clarify container and an analysis config. Args: data_config (:class:`~sagemaker.clarify.DataConfig`): Config of the input/output data. analysis_config (dict): Config following the analysis_config.json format. wait (bool): Whether the call should wait until the job completes (default: True). logs (bool): Whether to show the logs produced by the job. Only meaningful when ``wait`` is True (default: True). job_name (str): Processing job name. kms_key (str): The ARN of the KMS key that is used to encrypt the user code file (default: None). experiment_config (dict[str, str]): Experiment management configuration. Dictionary contains three optional keys: 'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'. """ analysis_config["methods"]["report"] = { "name": "report", "title": "Analysis Report" } with tempfile.TemporaryDirectory() as tmpdirname: analysis_config_file = os.path.join(tmpdirname, "analysis_config.json") with open(analysis_config_file, "w") as f: json.dump(analysis_config, f) s3_analysis_config_file = _upload_analysis_config( analysis_config_file, data_config.s3_output_path, self.sagemaker_session, kms_key, ) config_input = ProcessingInput( input_name="analysis_config", source=s3_analysis_config_file, destination=self._CLARIFY_CONFIG_INPUT, s3_data_type="S3Prefix", s3_input_mode="File", s3_compression_type="None", ) data_input = ProcessingInput( input_name="dataset", source=data_config.s3_data_input_path, destination=self._CLARIFY_DATA_INPUT, s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type=data_config. s3_data_distribution_type, s3_compression_type=data_config.s3_compression_type, ) result_output = ProcessingOutput( source=self._CLARIFY_OUTPUT, destination=data_config.s3_output_path, output_name="analysis_result", s3_upload_mode="EndOfJob", ) super().run( inputs=[data_input, config_input], outputs=[result_output], wait=wait, logs=logs, job_name=job_name, kms_key=kms_key, experiment_config=experiment_config, )
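# The private _run() helper above is normally reached through the public
# SageMakerClarifyProcessor methods; a sketch assuming run_post_training_bias()
# and config objects (data_config, bias_config, model_config, predictions_config)
# built elsewhere.
clarify_processor = SageMakerClarifyProcessor(
    role="arn:aws:iam::123456789012:role/SageMakerExecutionRole",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=sagemaker_session,
)
clarify_processor.run_post_training_bias(
    data_config=data_config,
    data_bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    methods="all",
)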
def test_steps_with_map_params_pipeline( sagemaker_session, role, script_dir, pipeline_name, region_name, athena_dataset_definition, ): instance_count = ParameterInteger(name="InstanceCount", default_value=2) framework_version = "0.20.0" instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") output_prefix = ParameterString(name="OutputPrefix", default_value="output") input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" sklearn_processor = SKLearnProcessor( framework_version=framework_version, instance_type=instance_type, instance_count=instance_count, base_job_name="test-sklearn", sagemaker_session=sagemaker_session, role=role, ) step_process = ProcessingStep( name="my-process", display_name="ProcessingStep", description="description for Processing step", processor=sklearn_processor, inputs=[ ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), ProcessingInput(dataset_definition=athena_dataset_definition), ], outputs=[ ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), ProcessingOutput( output_name="test_data", source="/opt/ml/processing/test", destination=Join( on="/", values=[ "s3:/", sagemaker_session.default_bucket(), "test-sklearn", output_prefix, ExecutionVariables.PIPELINE_EXECUTION_ID, ], ), ), ], code=os.path.join(script_dir, "preprocessing.py"), ) sklearn_train = SKLearn( framework_version=framework_version, entry_point=os.path.join(script_dir, "train.py"), instance_type=instance_type, sagemaker_session=sagemaker_session, role=role, hyperparameters={ "batch-size": 500, "epochs": 5, }, ) step_train = TrainingStep( name="my-train", display_name="TrainingStep", description="description for Training step", estimator=sklearn_train, inputs=TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train_data" ].S3Output.S3Uri ), ) model = Model( image_uri=sklearn_train.image_uri, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, sagemaker_session=sagemaker_session, role=role, ) model_inputs = CreateModelInput( instance_type="ml.m5.large", accelerator_type="ml.eia1.medium", ) step_model = CreateModelStep( name="my-model", display_name="ModelStep", description="description for Model step", model=model, inputs=model_inputs, ) # Condition step for evaluating model quality and branching execution cond_lte = ConditionGreaterThanOrEqualTo( left=step_train.properties.HyperParameters["batch-size"], right=6.0, ) step_cond = ConditionStep( name="CustomerChurnAccuracyCond", conditions=[cond_lte], if_steps=[], else_steps=[step_model], ) pipeline = Pipeline( name=pipeline_name, parameters=[instance_type, instance_count, output_prefix], steps=[step_process, step_train, step_cond], sagemaker_session=sagemaker_session, ) definition = json.loads(pipeline.definition()) assert definition["Version"] == "2020-12-01" steps = definition["Steps"] assert len(steps) == 3 training_args = {} condition_args = {} for step in steps: if step["Type"] == "Training": training_args = step["Arguments"] if step["Type"] == "Condition": condition_args = step["Arguments"] assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri" } assert condition_args["Conditions"][0]["LeftValue"] == { "Get": "Steps.my-train.HyperParameters['batch-size']" } try: response = pipeline.create(role) create_arn = response["PipelineArn"] assert re.match( 
rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn, ) finally: try: pipeline.delete() except Exception: pass
def test_tuning_multi_algos( sagemaker_session, role, cpu_instance_type, pipeline_name, region_name, script_dir, athena_dataset_definition, ): base_dir = os.path.join(DATA_DIR, "pytorch_mnist") entry_point = os.path.join(base_dir, "mnist.py") input_path = sagemaker_session.upload_data( path=os.path.join(base_dir, "training"), key_prefix="integ-test-data/pytorch_mnist/training", ) instance_count = ParameterInteger(name="InstanceCount", default_value=1) instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" sklearn_processor = SKLearnProcessor( framework_version="0.20.0", instance_type=instance_type, instance_count=instance_count, base_job_name="test-sklearn", sagemaker_session=sagemaker_session, role=role, ) property_file = PropertyFile(name="DataAttributes", output_name="attributes", path="attributes.json") step_process = ProcessingStep( name="my-process", display_name="ProcessingStep", description="description for Processing step", processor=sklearn_processor, inputs=[ ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), ProcessingInput(dataset_definition=athena_dataset_definition), ], outputs=[ ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), ProcessingOutput(output_name="attributes", source="/opt/ml/processing/attributes.json"), ], property_files=[property_file], code=os.path.join(script_dir, "preprocessing.py"), ) static_hp_1 = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") json_get_hp = JsonGet(step_name=step_process.name, property_file=property_file, json_path="train_size") pytorch_estimator = PyTorch( entry_point=entry_point, role=role, framework_version="1.5.0", py_version="py3", instance_count=instance_count, instance_type=instance_type, sagemaker_session=sagemaker_session, enable_sagemaker_metrics=True, max_retry_attempts=3, hyperparameters={ "static-hp": static_hp_1, "train_size": json_get_hp }, ) min_batch_size = ParameterString(name="MinBatchSize", default_value="64") max_batch_size = json_get_hp tuner = HyperparameterTuner.create( estimator_dict={ "estimator-1": pytorch_estimator, "estimator-2": pytorch_estimator, }, objective_metric_name_dict={ "estimator-1": "test:acc", "estimator-2": "test:acc", }, hyperparameter_ranges_dict={ "estimator-1": { "batch-size": IntegerParameter(min_batch_size, max_batch_size) }, "estimator-2": { "batch-size": IntegerParameter(min_batch_size, max_batch_size) }, }, metric_definitions_dict={ "estimator-1": [{ "Name": "test:acc", "Regex": "Overall test accuracy: (.*?);" }], "estimator-2": [{ "Name": "test:acc", "Regex": "Overall test accuracy: (.*?);" }], }, ) inputs = { "estimator-1": TrainingInput(s3_data=input_path), "estimator-2": TrainingInput(s3_data=input_path), } step_tune = TuningStep( name="my-tuning-step", tuner=tuner, inputs=inputs, ) pipeline = Pipeline( name=pipeline_name, parameters=[ instance_count, instance_type, min_batch_size, max_batch_size ], steps=[step_process, step_tune], sagemaker_session=sagemaker_session, ) try: response = pipeline.create(role) create_arn = response["PipelineArn"] assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn, ) execution = pipeline.start(parameters={}) assert re.match( rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", execution.arn, ) finally: try: pipeline.delete() except Exception: pass
def test_end_to_end_pipeline_successful_execution( sagemaker_session, region_name, role, pipeline_name, wait=False ): model_package_group_name = f"{pipeline_name}ModelPackageGroup" data_path = os.path.join(DATA_DIR, "workflow") default_bucket = sagemaker_session.default_bucket() # download the input data local_input_path = os.path.join(data_path, "abalone-dataset.csv") s3 = sagemaker_session.boto_session.resource("s3") s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file( "dataset/abalone-dataset.csv", local_input_path ) # # upload the input data to our bucket base_uri = f"s3://{default_bucket}/{pipeline_name}" with open(local_input_path) as data: body = data.read() input_data_uri = S3Uploader.upload_string_as_file_body( body=body, desired_s3_uri=f"{base_uri}/abalone-dataset.csv", sagemaker_session=sagemaker_session, ) # download batch transform data local_batch_path = os.path.join(data_path, "abalone-dataset-batch") s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file( "dataset/abalone-dataset-batch", local_batch_path ) # upload the batch transform data with open(local_batch_path) as data: body = data.read() batch_data_uri = S3Uploader.upload_string_as_file_body( body=body, desired_s3_uri=f"{base_uri}/abalone-dataset-batch", sagemaker_session=sagemaker_session, ) # define parameters processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString( name="ProcessingInstanceType", default_value="ml.m5.xlarge" ) training_instance_type = ParameterString( name="TrainingInstanceType", default_value="ml.m5.xlarge" ) model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved") input_data = ParameterString( name="InputData", default_value=input_data_uri, ) batch_data = ParameterString( name="BatchData", default_value=batch_data_uri, ) # define processing step framework_version = "0.23-1" sklearn_processor = SKLearnProcessor( framework_version=framework_version, instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name=f"{pipeline_name}-process", role=role, sagemaker_session=sagemaker_session, ) step_process = ProcessingStep( name="AbaloneProcess", processor=sklearn_processor, inputs=[ ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), ], outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), ], code=os.path.join(data_path, "abalone/preprocessing.py"), ) # define training step model_path = f"s3://{default_bucket}/{pipeline_name}Train" image_uri = image_uris.retrieve( framework="xgboost", region=region_name, version="1.0-1", py_version="py3", instance_type=training_instance_type, ) xgb_train = Estimator( image_uri=image_uri, instance_type=training_instance_type, instance_count=1, output_path=model_path, role=role, sagemaker_session=sagemaker_session, ) xgb_train.set_hyperparameters( objective="reg:linear", num_round=50, max_depth=5, eta=0.2, gamma=4, min_child_weight=6, subsample=0.7, silent=0, ) step_train = TrainingStep( name="AbaloneTrain", estimator=xgb_train, inputs={ "train": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ 
"validation" ].S3Output.S3Uri, content_type="text/csv", ), }, ) # define evaluation step script_eval = ScriptProcessor( image_uri=image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{pipeline_name}-eval", role=role, sagemaker_session=sagemaker_session, ) evaluation_report = PropertyFile( name="EvaluationReport", output_name="evaluation", path="evaluation.json" ) step_eval = ProcessingStep( name="AbaloneEval", processor=script_eval, inputs=[ ProcessingInput( source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(data_path, "abalone/evaluation.py"), property_files=[evaluation_report], ) # define create model step model = Model( image_uri=image_uri, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, sagemaker_session=sagemaker_session, role=role, ) inputs = CreateModelInput( instance_type="ml.m5.large", accelerator_type="ml.eia1.medium", ) step_create_model = CreateModelStep( name="AbaloneCreateModel", model=model, inputs=inputs, ) # define transform step transformer = Transformer( model_name=step_create_model.properties.ModelName, instance_type="ml.m5.xlarge", instance_count=1, output_path=f"s3://{default_bucket}/{pipeline_name}Transform", sagemaker_session=sagemaker_session, ) step_transform = TransformStep( name="AbaloneTransform", transformer=transformer, inputs=TransformInput(data=batch_data), ) # define register model step model_metrics = ModelMetrics( model_statistics=MetricsSource( s3_uri="{}/evaluation.json".format( step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] ), content_type="application/json", ) ) step_register = RegisterModel( name="AbaloneRegisterModel", estimator=xgb_train, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.xlarge"], transform_instances=["ml.m5.xlarge"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=model_metrics, ) # define condition step cond_lte = ConditionLessThanOrEqualTo( left=JsonGet( step_name=step_eval.name, property_file=evaluation_report, json_path="regression_metrics.mse.value", ), right=20.0, ) step_cond = ConditionStep( name="AbaloneMSECond", conditions=[cond_lte], if_steps=[step_register, step_create_model, step_transform], else_steps=[], ) # define pipeline pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, input_data, batch_data, ], steps=[step_process, step_train, step_eval, step_cond], sagemaker_session=sagemaker_session, ) pipeline.create(role) execution = pipeline.start() execution_arn = execution.arn if wait: execution.wait() return execution_arn
def run_model_monitor_job_processor( region, instance_type, role, data_capture_path, statistics_path, constraints_path, reports_path, instance_count=1, preprocessor_path=None, postprocessor_path=None, publish_cloudwatch_metrics="Disabled", ): data_capture_sub_path = data_capture_path[data_capture_path. rfind("datacapture/"):] data_capture_sub_path = data_capture_sub_path[data_capture_sub_path. find("/") + 1:] processing_output_paths = reports_path + "/" + data_capture_sub_path input_1 = ProcessingInput( input_name="input_1", source=data_capture_path, destination="/opt/ml/processing/input/endpoint/" + data_capture_sub_path, s3_data_type="S3Prefix", s3_input_mode="File", ) baseline = ProcessingInput( input_name="baseline", source=statistics_path, destination="/opt/ml/processing/baseline/stats", s3_data_type="S3Prefix", s3_input_mode="File", ) constraints = ProcessingInput( input_name="constraints", source=constraints_path, destination="/opt/ml/processing/baseline/constraints", s3_data_type="S3Prefix", s3_input_mode="File", ) outputs = ProcessingOutput( output_name="result", source="/opt/ml/processing/output", destination=processing_output_paths, s3_upload_mode="Continuous", ) env = { "baseline_constraints": "/opt/ml/processing/baseline/constraints/" + get_file_name(constraints_path), "baseline_statistics": "/opt/ml/processing/baseline/stats/" + get_file_name(statistics_path), "dataset_format": '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}', "dataset_source": "/opt/ml/processing/input/endpoint", "output_path": "/opt/ml/processing/output", "publish_cloudwatch_metrics": publish_cloudwatch_metrics, } inputs = [input_1, baseline, constraints] if postprocessor_path: env["post_analytics_processor_script"] = "/opt/ml/processing/code/postprocessing/" + get_file_name( postprocessor_path) post_processor_script = ProcessingInput( input_name="post_processor_script", source=postprocessor_path, destination="/opt/ml/processing/code/postprocessing", s3_data_type="S3Prefix", s3_input_mode="File", ) inputs.append(post_processor_script) if preprocessor_path: env["record_preprocessor_script"] = "/opt/ml/processing/code/preprocessing/" + get_file_name( preprocessor_path) pre_processor_script = ProcessingInput( input_name="pre_processor_script", source=preprocessor_path, destination="/opt/ml/processing/code/preprocessing", s3_data_type="S3Prefix", s3_input_mode="File", ) inputs.append(pre_processor_script) processor = Processor( image_uri=get_model_monitor_container_uri(region), instance_count=instance_count, instance_type=instance_type, role=role, env=env, ) return processor.run(inputs=inputs, outputs=[outputs])
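# A hedged usage sketch for run_model_monitor_job_processor; every S3 URI and
# the role ARN below are placeholders, not real resources.
run_model_monitor_job_processor(
    region="us-east-1",
    instance_type="ml.m5.xlarge",
    role="arn:aws:iam::111122223333:role/SageMakerRole",  # hypothetical
    data_capture_path="s3://my-bucket/my-endpoint/datacapture/2021/01/01/00",
    statistics_path="s3://my-bucket/baseline/statistics.json",
    constraints_path="s3://my-bucket/baseline/constraints.json",
    reports_path="s3://my-bucket/monitoring-reports",
)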
def test_local_processing_script_processor(sagemaker_local_session, sklearn_image_uri): input_file_path = os.path.join(DATA_DIR, "dummy_input.txt") script_processor = ScriptProcessor( role="SageMakerRole", image_uri=sklearn_image_uri, command=["python3"], instance_count=1, instance_type="local", volume_size_in_gb=30, volume_kms_key=None, max_runtime_in_seconds=3600, base_job_name="test-script-processor", env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"}, tags=[{ "Key": "dummy-tag", "Value": "dummy-tag-value" }], sagemaker_session=sagemaker_local_session, ) script_processor.run( code=os.path.join(DATA_DIR, "dummy_script.py"), inputs=[ ProcessingInput( source=input_file_path, destination="/opt/ml/processing/input/container/path/", input_name="dummy_input", s3_data_type="S3Prefix", s3_input_mode="File", s3_data_distribution_type="FullyReplicated", s3_compression_type="None", ) ], outputs=[ ProcessingOutput( source="/opt/ml/processing/output/container/path/", output_name="dummy_output", s3_upload_mode="EndOfJob", ) ], arguments=["-v"], wait=True, logs=True, ) job_description = script_processor.latest_job.describe() assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input" assert job_description["ProcessingInputs"][1]["InputName"] == "code" assert job_description["ProcessingJobName"].startswith( "test-script-processor") assert job_description["ProcessingJobStatus"] == "Completed" assert job_description["ProcessingOutputConfig"]["Outputs"][0][ "OutputName"] == "dummy_output" assert job_description["ProcessingResources"]["ClusterConfig"][ "InstanceCount"] == 1 assert job_description["ProcessingResources"]["ClusterConfig"][ "InstanceType"] == "local" assert job_description["ProcessingResources"]["ClusterConfig"][ "VolumeSizeInGB"] == 30 assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"] assert job_description["AppSpecification"]["ContainerEntrypoint"] == [ "python3", "/opt/ml/processing/input/code/dummy_script.py", ] assert job_description["AppSpecification"]["ImageUri"] == sklearn_image_uri assert job_description["Environment"] == { "DUMMY_ENVIRONMENT_VARIABLE": "dummy-value" }
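# The sagemaker_local_session fixture used above boils down to a LocalSession;
# a minimal sketch of constructing one outside of pytest.
from sagemaker.local import LocalSession

sagemaker_local_session = LocalSession()
sagemaker_local_session.config = {"local": {"local_code": True}}  # run code from the local disk, not S3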
def get_pipeline( region, role=None, default_bucket=None, model_package_group_name="sagemaker-group-insurance", pipeline_name="sagemaker-pipeline-insurance", base_job_prefix="sagemaker-featurestore-insurance", ): """Gets a SageMaker ML Pipeline instance working with on WIP data. Args: region: AWS region to create and run the pipeline. role: IAM role to create and run steps and pipeline. default_bucket: the bucket to use for storing the artifacts Returns: an instance of a pipeline """ sagemaker_session = get_session(region, default_bucket) if role is None: role = sagemaker.session.get_execution_role(sagemaker_session) # parameters for pipeline execution processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString( name="ProcessingInstanceType", default_value="ml.m5.xlarge" ) training_instance_type = ParameterString( name="TrainingInstanceType", default_value="ml.m5.xlarge" ) model_approval_status = ParameterString( name="ModelApprovalStatus", default_value="Approved" ) # processing step for feature engineering sklearn_processor = SKLearnProcessor( framework_version="0.23-1", instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name=f"{base_job_prefix}/sklearn-insurance-preprocess", sagemaker_session=sagemaker_session, role=role, ) step_process = ProcessingStep( name="PreprocessInsuranceData", processor=sklearn_processor, outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), ], code=os.path.join(BASE_DIR, "preprocess.py"), job_arguments=["--input_dataset_1", "41214", "--input_dataset_2", "41215",], ) ''' # feature store step feature_path = 's3://' + default_bucket+'/'+base_job_prefix + '/features' image_uri = sagemaker.image_uris.retrieve( framework="xgboost", region=region, version="1.0-1", py_version="py3", instance_type=training_instance_type, ) feature_processor = ScriptProcessor( image_uri=image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-insurance-feature-store", sagemaker_session=sagemaker_session, role=role, ) step_feature = ProcessingStep( name="FeatureStoreInsuranceData", processor=feature_processor, outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/training_input"), ], code=os.path.join(BASE_DIR, "feature_store.py"), job_arguments=["feature_s3_url", feature_path, "--feature_group_name", "sagemaker-featurestore-insurance"], ) ''' # training step for generating model artifacts model_path = 's3://' + default_bucket+'/'+base_job_prefix + '/features' image_uri = sagemaker.image_uris.retrieve( framework="xgboost", region=region, version="1.0-1", py_version="py3", instance_type=training_instance_type, ) xgb_train = Estimator( image_uri=image_uri, instance_type=training_instance_type, instance_count=1, output_path=model_path, base_job_name=f"{base_job_prefix}/insurance-train", sagemaker_session=sagemaker_session, role=role, ) xgb_train.set_hyperparameters(objective = "reg:tweedie", num_round = 50) step_train = TrainingStep( name="TrainAbaloneModel", estimator=xgb_train, inputs={ "train": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( 
s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, content_type="text/csv", ), }, ) # processing step for evaluation script_eval = ScriptProcessor( image_uri=image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-wip-eval", sagemaker_session=sagemaker_session, role=role, ) evaluation_report = PropertyFile( name="WipEvaluationReport", output_name="evaluation", path="evaluation.json", ) step_eval = ProcessingStep( name="EvaluateWipModel", processor=script_eval, inputs=[ ProcessingInput( source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(BASE_DIR, "evaluate.py"), property_files=[evaluation_report], ) # register model step that will be conditionally executed model_metrics = ModelMetrics( model_statistics=MetricsSource( s3_uri="{}/evaluation.json".format( step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] ), content_type="application/json" ) ) step_register = RegisterModel( name="register-insurance-model", estimator=xgb_train, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=model_metrics, ) # condition step for evaluating model quality and branching execution cond_lte = ConditionLessThanOrEqualTo( left=JsonGet( step=step_eval, property_file=evaluation_report, json_path="regression_metrics.mse.value" ), right=6.0, ) step_cond = ConditionStep( name="CheckMSEWipEvaluation", conditions=[cond_lte], if_steps=[], else_steps=[step_register], ) pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, ], steps=[step_process, step_train, step_eval, step_cond], sagemaker_session=sagemaker_session, ) return pipeline
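# Typical driver code for the insurance pipeline factory above; the region,
# bucket, and role ARN are illustrative placeholders.
role_arn = "arn:aws:iam::111122223333:role/SageMakerPipelineRole"  # hypothetical
pipeline = get_pipeline(
    region="eu-west-1",
    role=role_arn,
    default_bucket="my-sagemaker-bucket",
)
pipeline.upsert(role_arn=role_arn)  # create or update the pipeline definition
execution = pipeline.start()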
def test_processing_step_creation(sklearn_processor): inputs = [ ProcessingInput(source='dataset.csv', destination='/opt/ml/processing/input') ] outputs = [ ProcessingOutput(source='/opt/ml/processing/output/train'), ProcessingOutput(source='/opt/ml/processing/output/validation'), ProcessingOutput(source='/opt/ml/processing/output/test') ] step = ProcessingStep('Feature Transformation', sklearn_processor, 'MyProcessingJob', inputs=inputs, outputs=outputs) assert step.to_dict() == { 'Type': 'Task', 'Parameters': { 'AppSpecification': { 'ImageUri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3' }, 'ProcessingInputs': [{ 'InputName': None, 'S3Input': { 'LocalPath': '/opt/ml/processing/input', 'S3CompressionType': 'None', 'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3Uri': 'dataset.csv' } }], 'ProcessingOutputConfig': { 'Outputs': [{ 'OutputName': None, 'S3Output': { 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob', 'S3Uri': None } }, { 'OutputName': None, 'S3Output': { 'LocalPath': '/opt/ml/processing/output/validation', 'S3UploadMode': 'EndOfJob', 'S3Uri': None } }, { 'OutputName': None, 'S3Output': { 'LocalPath': '/opt/ml/processing/output/test', 'S3UploadMode': 'EndOfJob', 'S3Uri': None } }] }, 'ProcessingResources': { 'ClusterConfig': { 'InstanceCount': 1, 'InstanceType': 'ml.m5.xlarge', 'VolumeSizeInGB': 30 } }, 'ProcessingJobName': 'MyProcessingJob', 'RoleArn': EXECUTION_ROLE }, 'Resource': 'arn:aws:states:::sagemaker:createProcessingJob.sync', 'End': True }
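# The test above exercises the Step Functions Data Science SDK
# (stepfunctions.steps.ProcessingStep), not sagemaker.workflow. A sketch of
# wiring such a step into an executable state machine; the workflow name and
# role ARN are placeholders.
from stepfunctions.steps import Chain
from stepfunctions.workflow import Workflow

workflow = Workflow(
    name="MyProcessingWorkflow",
    definition=Chain([step]),  # `step` as built in the test above
    role="arn:aws:iam::111122223333:role/StepFunctionsWorkflowRole",  # hypothetical
)
workflow.create()    # registers the state machine
workflow.execute()   # starts an execution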
ProcessingInput( source=input_data, destination="/opt/ml/processing/input", input_name="input-1", ), ProcessingInput( source=input_code, destination="/opt/ml/processing/input/code", input_name="code", ), ] outputs = [ ProcessingOutput( source="/opt/ml/processing/train", destination="{}/{}".format(output_data, "train_data"), output_name="train_data", ), ProcessingOutput( source="/opt/ml/processing/test", destination="{}/{}".format(output_data, "test_data"), output_name="test_data", ), ] processor = SKLearnProcessor( framework_version="0.20.0", role=role.role_arn, instance_type="ml.m5.xlarge", instance_count=1, )
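# Inside the processing container, the inputs and outputs above map to the
# local `destination`/`source` paths. A sketch of what the preprocessing
# script might do with them; the CSV file name and split ratio are made up.
import pandas as pd

df = pd.read_csv("/opt/ml/processing/input/dataset.csv")           # from input-1
train = df.sample(frac=0.8, random_state=0)
test = df.drop(train.index)
train.to_csv("/opt/ml/processing/train/train.csv", index=False)    # picked up as train_data
test.to_csv("/opt/ml/processing/test/test.csv", index=False)       # picked up as test_data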
def get_pipeline( region, sagemaker_session, role=None, default_bucket=None, model_package_group_name="sts-sklearn-grp", pipeline_name="stsPipeline", base_job_prefix="sts", ) -> Pipeline: """Gets a SageMaker ML Pipeline instance working with on sts data. Args: region: AWS region to create and run the pipeline. role: IAM role to create and run steps and pipeline. default_bucket: the bucket to use for storing the artifacts Returns: an instance of a pipeline """ """ Instance types allowed: ml.r5.12xlarge, ml.m5.4xlarge, ml.p2.xlarge, ml.m4.16xlarge, ml.r5.24xlarge, ml.t3.xlarge, ml.r5.16xlarge, ml.m5.large, ml.p3.16xlarge, ml.p2.16xlarge, ml.c4.2xlarge, ml.c5.2xlarge, ml.c4.4xlarge, ml.c5.4xlarge, ml.c4.8xlarge, ml.c5.9xlarge, ml.c5.xlarge, ml.c4.xlarge, ml.t3.2xlarge, ml.t3.medium, ml.c5.18xlarge, ml.r5.2xlarge, ml.p3.2xlarge, ml.m5.xlarge, ml.m4.10xlarge, ml.r5.4xlarge, ml.m5.12xlarge, ml.m4.xlarge, ml.t3.large, ml.m5.24xlarge, ml.m4.2xlarge, ml.m5.2xlarge, ml.p2.8xlarge, ml.r5.8xlarge, ml.r5.xlarge, ml.r5.large, ml.p3.8xlarge, ml.m4.4xlarge see https://aws.amazon.com/blogs/machine-learning/right-sizing-resources-and-avoiding-unnecessary-costs-in-amazon-sagemaker/ """ sagemaker_session = get_session(region, default_bucket) if role is None: role = sagemaker.session.get_execution_role(sagemaker_session) # parameters for pipeline execution processing_instance_count = ParameterInteger( name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge") # as of free tier of 50 hours of m4.xlarge or m5.xlarge instances training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge") model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved") # preprocess # preprocess input data input_data = ParameterString( name="InputDataUrl", default_value=f"s3://sts-datwit-dataset/stsmsrpc.txt", ) # processing step for feature engineering sklearn_processor = SKLearnProcessor( framework_version="0.23-1", instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name=f"{base_job_prefix}/sklearn-sts-preprocess", sagemaker_session=sagemaker_session, role=role, ) step_preprocess = ProcessingStep( name="PreprocessSTSData", processor=sklearn_processor, outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), ], code=os.path.join(BASE_DIR, "preprocess.py"), job_arguments=["--input-data", input_data], ) # training step for generating model artifacts model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/stsTrain" image_uri = sagemaker.image_uris.retrieve( framework="sklearn", region=region, version="0.23-1", py_version="py3", instance_type=training_instance_type, ) sklearn_estimator = SKLearn( entry_point='training.py', source_dir=BASE_DIR, instance_type=training_instance_type, instance_count=1, output_path=model_path, framework_version="0.23-1", py_version="py3", base_job_name=f"{base_job_prefix}/sts-train", sagemaker_session=sagemaker_session, role=role, ) step_train = TrainingStep( name="TrainSTSModel", estimator=sklearn_estimator, inputs={ "train": TrainingInput( s3_data=step_preprocess.properties.ProcessingOutputConfig. 
Outputs["train"].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=step_preprocess.properties.ProcessingOutputConfig. Outputs["validation"].S3Output.S3Uri, content_type="text/csv", ), }, ) # processing step for evaluation script_eval = ScriptProcessor( image_uri=image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-sts-eval", sagemaker_session=sagemaker_session, role=role, ) evaluation_report = PropertyFile( name="stsEvaluationReport", output_name="evaluation", path="evaluation.json", ) step_eval = ProcessingStep( name="EvaluateSTSModel", processor=script_eval, inputs=[ ProcessingInput( source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination="/opt/ml/processing/model", ), ProcessingInput( source=step_preprocess.properties.ProcessingOutputConfig. Outputs["test"].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(BASE_DIR, "evaluate.py"), property_files=[evaluation_report], ) # setup model quality monitoring baseline data script_process_baseline_data = ScriptProcessor( image_uri=image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/baseline", sagemaker_session=sagemaker_session, role=role, ) step_proccess_baseline_data = ProcessingStep( name="SetupMonitoringData", processor=script_process_baseline_data, inputs=[ ProcessingInput( source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination="/opt/ml/processing/model", ), ProcessingInput( source=step_preprocess.properties.ProcessingOutputConfig. Outputs["validation"].S3Output.S3Uri, destination="/opt/ml/processing/validation", ), ], outputs=[ ProcessingOutput(output_name="validate", source="/opt/ml/processing/validate"), ], code=os.path.join(BASE_DIR, "baseline.py")) # --- # register model step that will be conditionally executed model_metrics = ModelMetrics( model_statistics=MetricsSource(s3_uri="{}/evaluation.json".format( step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0] ["S3Output"]["S3Uri"]), content_type="application/json")) step_register = RegisterModel( name="RegisterSTSModel", estimator=sklearn_estimator, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.m5.xlarge"], transform_instances=["ml.m5.xlarge"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=model_metrics, ) # condition step for evaluating model quality and branching execution cond_lte = ConditionLessThanOrEqualTo( left=JsonGet(step=step_eval, property_file=evaluation_report, json_path="regression_metrics.mse.value"), right=6.0, ) step_cond = ConditionStep( name="CheckMSESTSEvaluation", conditions=[cond_lte], if_steps=[step_register, step_proccess_baseline_data], # if_steps=[step_register], else_steps=[], ) # pipeline instance pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, input_data, ], steps=[step_preprocess, step_train, step_eval, step_cond], sagemaker_session=sagemaker_session, ) return pipeline
def get_pipeline( region, project_name=None, model_package_group_name="AbalonePackageGroup", pipeline_name="AbalonePipeline", base_job_prefix="Abalone", ): """Gets a SageMaker ML Pipeline instance working with on abalone data. Args: region: AWS region to create and run the pipeline. processing_role: IAM role to create and run processing steps training_role: IAM role to create and run training steps data_bucket: the bucket to use for storing the artifacts Returns: an instance of a pipeline """ # Dynamically load environmental SSM parameters - provide the list of the variables to load from SSM parameter store ssm_parameters = [ {"VariableName":"DataBucketName", "ParameterName":"data-bucket-name"}, {"VariableName":"ModelBucketName", "ParameterName":"model-bucket-name"}, {"VariableName":"S3KmsKeyId", "ParameterName":"kms-s3-key-arn"}, {"VariableName":"EbsKmsKeyArn", "ParameterName":"kms-ebs-key-arn"}, ] env_data = get_environment(project_name=project_name, ssm_params=ssm_parameters) print(f"Environment data:\n{json.dumps(env_data, indent=2)}") security_group_ids = env_data["SecurityGroups"] subnets = env_data["SubnetIds"] processing_role = env_data["ExecutionRole"] training_role = env_data["ExecutionRole"] data_bucket = env_data["DataBucketName"] model_bucket = env_data["ModelBucketName"] ebs_kms_id = env_data["EbsKmsKeyArn"] s3_kms_id = env_data["S3KmsKeyId"] sagemaker_session = get_session(region, data_bucket) if processing_role is None: processing_role = sagemaker.session.get_execution_role(sagemaker_session) if training_role is None: training_role = sagemaker.session.get_execution_role(sagemaker_session) if model_bucket is None: model_bucket = sagemaker_session.default_bucket() print(f"Creating the pipeline '{pipeline_name}':") print(f"Parameters:{region}\n{security_group_ids}\n{subnets}\n{processing_role}\n\ {training_role}\n{data_bucket}\n{model_bucket}\n{model_package_group_name}\n\ {pipeline_name}\n{base_job_prefix}") # parameters for pipeline execution processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) processing_instance_type = ParameterString( name="ProcessingInstanceType", default_value="ml.m5.xlarge" ) training_instance_type = ParameterString( name="TrainingInstanceType", default_value="ml.m5.xlarge" ) model_approval_status = ParameterString( name="ModelApprovalStatus", default_value="PendingManualApproval" ) input_data = ParameterString( name="InputDataUrl", default_value=f"s3://{sagemaker_session.default_bucket()}/datasets/abalone-dataset.csv", ) # configure network for encryption, network isolation and VPC configuration # Since the preprocessor job takes the data from S3, enable_network_isolation must be set to False # see https://github.com/aws/amazon-sagemaker-examples/issues/1689 network_config = NetworkConfig( enable_network_isolation=False, security_group_ids=security_group_ids, subnets=subnets, encrypt_inter_container_traffic=True) # processing step for feature engineering sklearn_processor = SKLearnProcessor( framework_version="0.23-1", instance_type=processing_instance_type, instance_count=processing_instance_count, base_job_name=f"{base_job_prefix}/sklearn-abalone-preprocess", sagemaker_session=sagemaker_session, role=processing_role, network_config=network_config, volume_kms_key=ebs_kms_id, output_kms_key=s3_kms_id ) step_process = ProcessingStep( name="PreprocessAbaloneData", processor=sklearn_processor, outputs=[ ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), 
ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), ], code=os.path.join(BASE_DIR, "preprocess.py"), job_arguments=["--input-data", input_data], ) # training step for generating model artifacts model_path = f"s3://{model_bucket}/{base_job_prefix}/AbaloneTrain" image_uri = sagemaker.image_uris.retrieve( framework="xgboost", region=region, version="1.0-1", py_version="py3", instance_type=training_instance_type, ) xgb_train = Estimator( image_uri=image_uri, instance_type=training_instance_type, instance_count=1, output_path=model_path, base_job_name=f"{base_job_prefix}/abalone-train", sagemaker_session=sagemaker_session, role=training_role, subnets=network_config.subnets, security_group_ids=network_config.security_group_ids, encrypt_inter_container_traffic=True, enable_network_isolation=False, volume_kms_key=ebs_kms_id, output_kms_key=s3_kms_id ) xgb_train.set_hyperparameters( objective="reg:linear", num_round=50, max_depth=5, eta=0.2, gamma=4, min_child_weight=6, subsample=0.7, silent=0, ) step_train = TrainingStep( name="TrainAbaloneModel", estimator=xgb_train, inputs={ "train": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "train" ].S3Output.S3Uri, content_type="text/csv", ), "validation": TrainingInput( s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ "validation" ].S3Output.S3Uri, content_type="text/csv", ), }, ) # processing step for evaluation script_eval = ScriptProcessor( image_uri=image_uri, command=["python3"], instance_type=processing_instance_type, instance_count=1, base_job_name=f"{base_job_prefix}/script-abalone-eval", sagemaker_session=sagemaker_session, role=processing_role, network_config=network_config, volume_kms_key=ebs_kms_id, output_kms_key=s3_kms_id ) evaluation_report = PropertyFile( name="AbaloneEvaluationReport", output_name="evaluation", path="evaluation.json", ) step_eval = ProcessingStep( name="EvaluateAbaloneModel", processor=script_eval, inputs=[ ProcessingInput( source=step_train.properties.ModelArtifacts.S3ModelArtifacts, destination="/opt/ml/processing/model", ), ProcessingInput( source=step_process.properties.ProcessingOutputConfig.Outputs[ "test" ].S3Output.S3Uri, destination="/opt/ml/processing/test", ), ], outputs=[ ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"), ], code=os.path.join(BASE_DIR, "evaluate.py"), property_files=[evaluation_report], ) # register model step that will be conditionally executed model_metrics = ModelMetrics( model_statistics=MetricsSource( s3_uri="{}/evaluation.json".format( step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] ), content_type="application/json" ) ) """ There is a bug in RegisterModel implementation The RegisterModel step is implemented in the SDK as two steps, a _RepackModelStep and a _RegisterModelStep. The _RepackModelStep runs a SKLearn training step in order to repack the model.tar.gz to include any custom inference code in the archive. The _RegisterModelStep then registers the repacked model. 
The problem is that the _RepackModelStep does not propagate the VPC configuration from the Estimator object: https://github.com/aws/sagemaker-python-sdk/blob/cdb633b3ab02398c3b77f5ecd2c03cdf41049c78/src/sagemaker/workflow/_utils.py#L88 This causes an AccessDenied exception because the repacker cannot access the S3 bucket (any access that does not go through the VPC endpoint is blocked by the bucket policy). An issue is open against the SageMaker Python SDK: https://github.com/aws/sagemaker-python-sdk/issues/2302 """ vpc_config = { "Subnets": network_config.subnets, "SecurityGroupIds": network_config.security_group_ids } step_register = RegisterModel( name="RegisterAbaloneModel", estimator=xgb_train, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, content_types=["text/csv"], response_types=["text/csv"], inference_instances=["ml.t2.medium", "ml.m5.large"], transform_instances=["ml.m5.large"], model_package_group_name=model_package_group_name, approval_status=model_approval_status, model_metrics=model_metrics, vpc_config_override=vpc_config ) # condition step for evaluating model quality and branching execution cond_lte = ConditionLessThanOrEqualTo( left=JsonGet( step=step_eval, property_file=evaluation_report, json_path="regression_metrics.mse.value" ), right=6.0, ) step_cond = ConditionStep( name="CheckMSEAbaloneEvaluation", conditions=[cond_lte], if_steps=[step_register], else_steps=[], ) # pipeline instance pipeline = Pipeline( name=pipeline_name, parameters=[ processing_instance_type, processing_instance_count, training_instance_type, model_approval_status, input_data, ], steps=[step_process, step_train, step_eval, step_cond], sagemaker_session=sagemaker_session, ) return pipeline
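# The vpc_config_override dict above follows the VpcConfig shape expected by
# the CreateTrainingJob/CreateModel APIs; sagemaker.vpc_utils builds the same
# structure. A sketch, assuming the helper keeps its historical
# to_dict(subnets, security_group_ids) signature.
from sagemaker import vpc_utils

vpc_config = vpc_utils.to_dict(
    subnets=network_config.subnets,
    security_group_ids=network_config.security_group_ids,
)
# -> {"Subnets": [...], "SecurityGroupIds": [...]}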