def test_script_processor_with_all_parameters(exists_mock, isfile_mock,
                                              sagemaker_session):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{
            "Key": "my-tag",
            "Value": "my-tag-value"
        }],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(
        processor._current_job_name)

    sagemaker_session.process.assert_called_with(**expected_args)
    assert "my_job_name" in processor._current_job_name
Example #2
def test_script_processor_with_no_inputs_or_outputs(sagemaker_session,
                                                    image_uri,
                                                    cpu_instance_type):
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=image_uri,
        command=["python3"],
        instance_count=1,
        instance_type=cpu_instance_type,
        volume_size_in_gb=100,
        volume_kms_key=None,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor-with-no-inputs-or-outputs",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{
            "Key": "dummy-tag",
            "Value": "dummy-tag-value"
        }],
        sagemaker_session=sagemaker_session,
    )

    script_processor.run(code=os.path.join(DATA_DIR, "dummy_script.py"),
                         arguments=["-v"],
                         wait=True,
                         logs=True)

    job_description = script_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "code"

    assert job_description["ProcessingJobName"].startswith(
        "test-script-processor-with-no-inputs")

    assert job_description["ProcessingJobStatus"] == "Completed"

    assert job_description["ProcessingResources"] == {
        "ClusterConfig": {
            "InstanceCount": 1,
            "InstanceType": "ml.m4.xlarge",
            "VolumeSizeInGB": 100
        }
    }

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri

    assert job_description["Environment"] == {
        "DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"
    }

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {
        "MaxRuntimeInSeconds": 3600
    }
Example #3
def script_processor(sagemaker_session):
    return ScriptProcessor(
        role=ROLE,
        image_uri="012345678901.dkr.ecr.us-west-2.amazonaws.com/my-custom-image-uri",
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{
            "Key": "my-tag",
            "Value": "my-tag-value"
        }],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )
def test_script_processor_with_all_parameters_via_run_args(
    exists_mock, isfile_mock, sagemaker_session
):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    run_args = processor.get_run_args(
        code="/local/path/to/processing_code.py",
        inputs=_get_data_inputs_all_parameters(),
        outputs=_get_data_outputs_all_parameters(),
        arguments=["--drop-columns", "'SelfEmployed'"],
    )

    processor.run(
        code=run_args.code,
        inputs=run_args.inputs,
        outputs=run_args.outputs,
        arguments=run_args.arguments,
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)

    sagemaker_session.process.assert_called_with(**expected_args)
    assert "my_job_name" in processor._current_job_name
def lambda_handler(event, context):
    timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    local_filename, headers = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv')
    os.rename(local_filename, '/tmp/abalone.csv')

    # Environment variables
    bucket = os.environ['bucket']
    role = os.environ['role']
    sagemaker_session = sagemaker.Session()
    spark_repository_uri = os.environ['spark_repository_uri']
    # Prefix constants
    prefix = 'sagemaker/spark-preprocess-demo/' + timestamp_prefix 
    input_prefix = prefix + '/input/raw/abalone'
    input_preprocessed_prefix = prefix + '/input/preprocessed/abalone'
    mleap_model_prefix = prefix + '/mleap-model'
    # Store the value of the execution timestamp
    client = boto3.client('s3')
    client.put_object(Body=timestamp_prefix.encode('ascii'),
                        Bucket=bucket,
                        Key='execution.txt')
    # Upload data so it's present for training and inference
    
    print(sagemaker_session.upload_data(path='/tmp/abalone.csv', bucket=bucket, key_prefix=input_prefix))
    
    spark_processor = ScriptProcessor(base_job_name='spark-preprocessor',
                                  image_uri=spark_repository_uri,
                                  command=['/opt/program/submit'],
                                  role=role,
                                  instance_count=2,
                                  instance_type='ml.r5.xlarge',
                                  max_runtime_in_seconds=1200,
                                  env={'mode': 'python'})

    spark_processor.run(code=f's3://{bucket}/sparkdemo/preprocess.py',
                    arguments=['s3_input_bucket', bucket,
                              's3_input_key_prefix', input_prefix,
                              's3_output_bucket', bucket,
                              's3_output_key_prefix', input_preprocessed_prefix,
                              's3_model_bucket', bucket,
                              's3_mleap_model_prefix', mleap_model_prefix],
                    logs=True)
    
    event['s3_output_path'] = f's3://{bucket}/sagemaker/spark-preprocess-demo/{timestamp_prefix}/xgboost_model'
    event['train_data'] = f's3://{bucket}/sagemaker/spark-preprocess-demo/{timestamp_prefix}/input/preprocessed/abalone/train/part'
    event['validation_data'] = f's3://{bucket}/sagemaker/spark-preprocess-demo/{timestamp_prefix}/input/preprocessed/abalone/validation/part'
    event['training_job'] = f'{timestamp_prefix}-job' 
    return event
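
# A minimal companion sketch, assuming a downstream Lambda reads back the
# execution timestamp that lambda_handler wrote to 'execution.txt' (the helper
# name and the reuse of the 'bucket' environment variable are assumptions):
def read_execution_timestamp():
    s3_client = boto3.client('s3')
    obj = s3_client.get_object(Bucket=os.environ['bucket'], Key='execution.txt')
    return obj['Body'].read().decode('ascii')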
Example #6
def test_processing_step_normalizes_args(mock_normalize_args,
                                         sagemaker_session):
    processor = ScriptProcessor(
        role=ROLE,
        image_uri="012345678901.dkr.ecr.us-west-2.amazonaws.com/my-custom-image-uri",
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{
            "Key": "my-tag",
            "Value": "my-tag-value"
        }],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    inputs = [
        ProcessingInput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    outputs = [
        ProcessingOutput(
            source=f"s3://{BUCKET}/processing_manifest",
            destination="processing_manifest",
        )
    ]
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        code="foo.py",
        inputs=inputs,
        outputs=outputs,
        job_arguments=["arg1", "arg2"],
        cache_config=cache_config,
    )
    mock_normalize_args.return_value = [step.inputs, step.outputs]
    step.to_request()
    mock_normalize_args.assert_called_with(
        arguments=step.job_arguments,
        inputs=step.inputs,
        outputs=step.outputs,
        code=step.code,
    )
Example #7
def _get_script_processor(sagemaker_session):
    return ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        sagemaker_session=sagemaker_session,
    )
Example #8
def test_processing_step_with_script_processor(pipeline_session, processing_input, network_config):
    processor = ScriptProcessor(
        role=sagemaker.get_execution_role(),
        image_uri=IMAGE_URI,
        command=["python3"],
        instance_type=INSTANCE_TYPE,
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="volume-kms-key",
        output_kms_key="output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=network_config,
        sagemaker_session=pipeline_session,
    )

    step_args = processor.run(
        inputs=processing_input, code=DUMMY_S3_SCRIPT_PATH, job_name="my-processing-job"
    )

    step = ProcessingStep(
        name="MyProcessingStep",
        step_args=step_args,
    )

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )

    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": step_args,
    }
def test_processing_job_inputs_and_output_config(sagemaker_session, image_uri,
                                                 cpu_instance_type,
                                                 output_kms_key):
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=image_uri,
        command=["python3"],
        instance_count=1,
        instance_type=cpu_instance_type,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{
            "Key": "dummy-tag",
            "Value": "dummy-tag-value"
        }],
        sagemaker_session=sagemaker_session,
    )

    script_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=_get_processing_inputs_with_all_parameters(
            sagemaker_session.default_bucket()),
        outputs=_get_processing_outputs_with_all_parameters(),
        arguments=["-v"],
        wait=False,
    )

    job_description = script_processor.latest_job.describe()
    expected_inputs_and_outputs = _get_processing_job_inputs_and_outputs(
        sagemaker_session.default_bucket(), output_kms_key)
    assert (job_description["ProcessingInputs"][:-1] ==
            expected_inputs_and_outputs["ProcessingInputs"])
    assert (job_description["ProcessingOutputConfig"] ==
            expected_inputs_and_outputs["ProcessingOutputConfig"])
Example #10
def define_data_pipeline(
    sm_role,
    workflow_execution_role,
    data_pipeline_name,
    return_yaml=True,
    dump_yaml_file="templates/sagemaker_data_pipeline.yaml",
):
    """
    Return YAML definition of the training pipeline, which consists of multiple
    Amazon StepFunction steps

    sm_role:                    ARN of the SageMaker execution role
    workflow_execution_role:    ARN of the StepFunction execution role
    return_yaml:                Return YAML representation or not, if False,
                                it returns an instance of
                                    `stepfunctions.workflow.WorkflowObject`
    dump_yaml_file:             If not None, a YAML file will be generated at
                                    this file location

    """

    # Pass required parameters dynamically for each execution using placeholders.
    execution_input = ExecutionInput(
        schema={
            "PreprocessingJobName": str,
            "PreprocessingCodeURL": str,
            "PreprocessedOutputDataURL": str,
            "S3InputPath": str,
            "S3OutputPath": str,
        }
    )

    """
    Data pre-processing and feature engineering
    """
    # processor = PySparkProcessor(
    region = "ap-southeast-1"
    image = "sagemaker-spark-processing"
    img_uri = f"759080221371.dkr.ecr.{region}.amazonaws.com/{image}:2.4-cpu"
    processor = ScriptProcessor(
        image_uri=img_uri,
        role=sm_role,
        instance_count=16,
        instance_type="ml.m5.2xlarge",
        command=["/opt/program/submit"],
        max_runtime_in_seconds=3600,
        env={"mode": "python"},
    )

    # Create ProcessingInputs and ProcessingOutputs objects for Inputs and
    # Outputs respectively for the SageMaker Processing Job
    inputs = [
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=execution_input["PreprocessedOutputDataURL"],
            output_name="processed_data",
        ),
    ]

    processing_step = MLMAXProcessingStep(
        "SageMaker pre-processing step",
        processor=processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=inputs,
        outputs=outputs,
        environment={
            "S3InputPath": execution_input["S3InputPath"],
            "S3OutputPath": execution_input["S3OutputPath"],
        },
        container_entrypoint=[
            "smspark-submit",
            "/opt/ml/processing/input/code/preprocessing.py",
        ],
    )

    # Create Fail state to mark the workflow failed in case any of the steps fail.
    failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
        "ML Workflow failed", cause="SageMakerProcessingJobFailed"
    )

    # Add the Error handling in the workflow
    catch_state_processing = stepfunctions.steps.states.Catch(
        error_equals=["States.TaskFailed"],
        next_step=failed_state_sagemaker_processing_failure,
    )
    processing_step.add_catch(catch_state_processing)

    # Create the Workflow
    workflow_graph = Chain([processing_step])
    data_pipeline = Workflow(
        name=data_pipeline_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    return data_pipeline
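
# A minimal usage sketch, assuming the Step Functions Data Science SDK Workflow
# API (create()/execute()); the role ARNs, bucket, and S3 paths below are placeholders:
if __name__ == "__main__":
    data_pipeline = define_data_pipeline(
        sm_role="arn:aws:iam::111111111111:role/sagemaker-execution-role",
        workflow_execution_role="arn:aws:iam::111111111111:role/stepfunctions-execution-role",
        data_pipeline_name="mlmax-data-pipeline",
    )
    data_pipeline.create()  # registers the state machine in Step Functions
    data_pipeline.execute(
        inputs={
            "PreprocessingJobName": "mlmax-preprocess-job",
            "PreprocessingCodeURL": "s3://my-bucket/code/preprocessing.py",
            "PreprocessedOutputDataURL": "s3://my-bucket/output/preprocessed",
            "S3InputPath": "s3://my-bucket/input/raw",
            "S3OutputPath": "s3://my-bucket/output/processed",
        }
    )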
########################################################################################################################

from sagemaker.local import LocalSession
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
import boto3

s3 = boto3.client('s3')
sagemaker_session = LocalSession()
sagemaker_session.config = {'local': {'local_code': True}}

# For local training a dummy role will be sufficient
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

processor = ScriptProcessor(
    command=['python3'],
    image_uri='sagemaker-delta-sharing-processing-local',
    role=role,
    instance_count=1,
    instance_type='local')

processor.run(code='processing_script.py',
              inputs=[
                  ProcessingInput(source='./profile/',
                                  destination='/opt/ml/processing/profile/')
              ],
              outputs=[
                  ProcessingOutput(output_name='delta_lake_processed_data',
                                   source='/opt/ml/processing/processed_data/')
              ])

preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']
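
# A short follow-up sketch: each entry under 'Outputs' in the describe() response
# carries the resolved S3 destination of the corresponding ProcessingOutput.
for output in output_config['Outputs']:
    if output['OutputName'] == 'delta_lake_processed_data':
        print('Processed data uploaded to:', output['S3Output']['S3Uri'])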
Example #12
def get_pipeline(
    region,
    sagemaker_session,
    role=None,
    default_bucket=None,
    model_package_group_name="sts-sklearn-grp",
    pipeline_name="stsPipeline",
    base_job_prefix="sts",
) -> Pipeline:
    """Gets a SageMaker ML Pipeline instance working with on sts data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    """
        Instance types allowed:
        
        ml.r5.12xlarge, ml.m5.4xlarge, ml.p2.xlarge, ml.m4.16xlarge, ml.r5.24xlarge, 
        ml.t3.xlarge, ml.r5.16xlarge, ml.m5.large, ml.p3.16xlarge, ml.p2.16xlarge, 
        ml.c4.2xlarge, ml.c5.2xlarge, ml.c4.4xlarge, ml.c5.4xlarge, ml.c4.8xlarge, 
        ml.c5.9xlarge, ml.c5.xlarge, ml.c4.xlarge, ml.t3.2xlarge, ml.t3.medium, 
        ml.c5.18xlarge, ml.r5.2xlarge, ml.p3.2xlarge, ml.m5.xlarge, ml.m4.10xlarge, 
        ml.r5.4xlarge, ml.m5.12xlarge, ml.m4.xlarge, ml.t3.large, ml.m5.24xlarge, 
        ml.m4.2xlarge, ml.m5.2xlarge, ml.p2.8xlarge, ml.r5.8xlarge, ml.r5.xlarge, 
        ml.r5.large, ml.p3.8xlarge, ml.m4.4xlarge

        see
        https://aws.amazon.com/blogs/machine-learning/right-sizing-resources-and-avoiding-unnecessary-costs-in-amazon-sagemaker/
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                               default_value="ml.m5.xlarge")

    # the free tier covers 50 hours of m4.xlarge or m5.xlarge instances
    training_instance_type = ParameterString(name="TrainingInstanceType",
                                             default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(name="ModelApprovalStatus",
                                            default_value="Approved")

    # preprocess

    # preprocess input data
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=f"s3://sts-datwit-dataset/stsmsrpc.txt",
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-sts-preprocess",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    step_preprocess = ProcessingStep(
        name="PreprocessSTSData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation",
                             source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test",
                             source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/stsTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="sklearn",
        region=region,
        version="0.23-1",
        py_version="py3",
        instance_type=training_instance_type,
    )

    sklearn_estimator = SKLearn(
        entry_point='training.py',
        source_dir=BASE_DIR,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        framework_version="0.23-1",
        py_version="py3",
        base_job_name=f"{base_job_prefix}/sts-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    step_train = TrainingStep(
        name="TrainSTSModel",
        estimator=sklearn_estimator,
        inputs={
            "train":
            TrainingInput(
                s3_data=step_preprocess.properties.ProcessingOutputConfig.
                Outputs["train"].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation":
            TrainingInput(
                s3_data=step_preprocess.properties.ProcessingOutputConfig.
                Outputs["validation"].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-sts-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="stsEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateSTSModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_preprocess.properties.ProcessingOutputConfig.
                Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation",
                             source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # setup model quality monitoring baseline data
    script_process_baseline_data = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/baseline",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    step_process_baseline_data = ProcessingStep(
        name="SetupMonitoringData",
        processor=script_process_baseline_data,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_preprocess.properties.ProcessingOutputConfig.
                Outputs["validation"].S3Output.S3Uri,
                destination="/opt/ml/processing/validation",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="validate",
                             source="/opt/ml/processing/validate"),
        ],
        code=os.path.join(BASE_DIR, "baseline.py"))
    # ---

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )

    step_register = RegisterModel(
        name="RegisterSTSModel",
        estimator=sklearn_estimator,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(step=step_eval,
                     property_file=evaluation_report,
                     json_path="regression_metrics.mse.value"),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSESTSEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register, step_process_baseline_data],
        # if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_preprocess, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
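
# A minimal usage sketch, assuming the standard SageMaker Pipelines flow of
# upserting the definition and starting an execution (region and role ARN are placeholders):
if __name__ == "__main__":
    sts_pipeline = get_pipeline(
        region="us-east-1",
        sagemaker_session=None,  # get_pipeline recreates the session via get_session()
        role="arn:aws:iam::111111111111:role/sagemaker-execution-role",
    )
    sts_pipeline.upsert(role_arn="arn:aws:iam::111111111111:role/sagemaker-execution-role")
    execution = sts_pipeline.start()
    execution.wait()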
Example #13
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep, JsonGet

from config import read_conf

conf = read_conf("../config_bitti.ini")

# data augmentation step
# the data set that was collected is small, just around fifty images;
# furthermore, all the magazines are placed with the same top-down view
# with labels horizontally aligned. We are going to change that.
magazine_augmentor = ScriptProcessor(
    image_uri=str(conf.processing_turicreate_uri),
    command=["python3"],
    instance_type=conf.processing_instance_type,
    instance_count=conf.processing_instance_count,
    #env={"RotationAngle": "15"},  # TODO: add configurability
    base_job_name="script-magazine-augmentation",
    role=conf.role)

step_data_augmentation = ProcessingStep(
    name="DataAugmentation",
    processor=magazine_augmentor,
    inputs=[
        ProcessingInput(source=conf.input_data,
                        destination="/opt/ml/processing/input")
    ],
    outputs=[
        ProcessingOutput(output_name="augmented",
                         source="/opt/ml/processing/output")
    ],
)

def get_pipeline(
    region,
    security_group_ids,
    subnets,
    processing_role=None,
    training_role=None,
    data_bucket=None,
    model_bucket=None,
    model_package_group_name="AbalonePackageGroup",
    pipeline_name="AbalonePipeline",
    base_job_prefix="Abalone",

):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        processing_role: IAM role to create and run processing steps
        training_role: IAM role to create and run training steps
        data_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, data_bucket)

    if processing_role is None:
        processing_role = sagemaker.session.get_execution_role(sagemaker_session)
    if training_role is None:
        training_role = sagemaker.session.get_execution_role(sagemaker_session)
    if model_bucket is None:
        model_bucket = sagemaker_session.default_bucket()

    print(f"Creating the pipeline '{pipeline_name}':")
    print(f"Parameters:{region}\n{security_group_ids}\n{subnets}\n{processing_role}\n\
    {training_role}\n{data_bucket}\n{model_bucket}\n{model_package_group_name}\n\
    {pipeline_name}\n{base_job_prefix}")

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval"
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=f"s3://{sagemaker_session.default_bucket()}/datasets/abalone-dataset.csv",
    )

    # configure network for encryption, network isolation and VPC configuration
    # Since the preprocessor job takes the data from S3, enable_network_isolation must be set to False
    # see https://github.com/aws/amazon-sagemaker-examples/issues/1689
    network_config = NetworkConfig(
        enable_network_isolation=False, 
        security_group_ids=security_group_ids.split(","),
        subnets=subnets.split(","),
        encrypt_inter_container_traffic=True)
    
    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-abalone-preprocess",
        sagemaker_session=sagemaker_session,
        role=processing_role,
        network_config=network_config
    )
    
    step_process = ProcessingStep(
        name="PreprocessAbaloneData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{model_bucket}/{base_job_prefix}/AbaloneTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/abalone-train",
        sagemaker_session=sagemaker_session,
        role=training_role,
        subnets=network_config.subnets,
        security_group_ids=network_config.security_group_ids,
        encrypt_inter_container_traffic=True,
        enable_network_isolation=False
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    
    step_train = TrainingStep(
        name="TrainAbaloneModel",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-abalone-eval",
        sagemaker_session=sagemaker_session,
        role=processing_role,
        network_config=network_config
    )
    
    evaluation_report = PropertyFile(
        name="AbaloneEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateAbaloneModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json"
        )
    )

    """
    There is a bug in RegisterModel implementation
    The RegisterModel step is implemented in the SDK as two steps, a _RepackModelStep and a _RegisterModelStep. 
    The _RepackModelStep runs a SKLearn training step in order to repack the model.tar.gz to include any custom inference code in the archive. 
    The _RegisterModelStep then registers the repacked model.
    
    The problem is that the _RepackModelStep does not propagate VPC configuration from the Estimator object:
    https://github.com/aws/sagemaker-python-sdk/blob/cdb633b3ab02398c3b77f5ecd2c03cdf41049c78/src/sagemaker/workflow/_utils.py#L88

    This cause the AccessDenied exception because repacker cannot access S3 bucket (all access which is not via VPC endpoint is bloked by the bucket policy)
    
    The issue is opened against SageMaker python SDK: https://github.com/aws/sagemaker-python-sdk/issues/2302
    """

    vpc_config = {
        "Subnets": network_config.subnets,
        "SecurityGroupIds": network_config.security_group_ids,
    }

    step_register = RegisterModel(
        name="RegisterAbaloneModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
        vpc_config_override=vpc_config
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value"
        ),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSEAbaloneEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
#   3. Open terminal and run the following commands:
#       docker build -t sagemaker-scikit-learn-processing-local container/.
########################################################################################################################

from sagemaker.local import LocalSession
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

sagemaker_session = LocalSession()
sagemaker_session.config = {'local': {'local_code': True}}

# For local training a dummy role will be sufficient
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

processor = ScriptProcessor(
    command=['python3'],
    image_uri='sagemaker-scikit-learn-processing-local',
    role=role,
    instance_count=1,
    instance_type='local')

processor.run(code='processing_script.py',
              inputs=[
                  ProcessingInput(source='./input_data/',
                                  destination='/opt/ml/processing/input_data/')
              ],
              outputs=[
                  ProcessingOutput(output_name='word_count_data',
                                   source='/opt/ml/processing/processed_data/')
              ],
              arguments=['job-type', 'word-count'])

preprocessing_job_description = processor.jobs[-1].describe()
Example #16
def test_byo_container_with_script_processor(sagemaker_session):
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        command=["python3"],
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    with patch("os.path.isfile", return_value=True):
        script_processor.run(
            code="/local/path/to/sklearn_transformer.py",
            inputs=[
                ProcessingInput(source="/local/path/to/my/dataset/census.csv",
                                destination="/data/")
            ],
            experiment_config={"ExperimentName": "AnExperiment"},
        )

    expected_args = {
        "inputs": [
            {
                "InputName": "input-1",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/data/",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            },
            {
                "InputName": "code",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/opt/ml/processing/input/code",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            },
        ],
        "output_config": {
            "Outputs": []
        },
        "job_name":
        script_processor._current_job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition":
        None,
        "app_specification": {
            "ImageUri":
            CUSTOM_IMAGE_URI,
            "ContainerEntrypoint": [
                "python3",
                "/opt/ml/processing/input/code/sklearn_transformer.py",
            ],
        },
        "environment":
        None,
        "network_config":
        None,
        "role_arn":
        ROLE,
        "tags":
        None,
        "experiment_config": {
            "ExperimentName": "AnExperiment"
        },
    }
    sagemaker_session.process.assert_called_with(**expected_args)
Example #17
## locations and vars
BUCKET = sagemaker.Session().default_bucket()
INPUT_FOLDER = 'stock-data-raw-csv'
OUTPUT_FOLDER = 'DEMO-xgboost-as-a-built-in-algo'
ROLE_ARN = sagemaker.get_execution_role()
## image uri code
ACCOUNT_ID = boto3.client('sts').get_caller_identity().get('Account')
REGION = boto3.Session().region_name
ECR_REPOSITORY = 'sagemaker-processing-container'
TAG = ':latest'
IMAGE_URI = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(ACCOUNT_ID, REGION,
                                                    ECR_REPOSITORY + TAG)
## call processing job
script_processor = ScriptProcessor(command=['python3'],
                                   image_uri=IMAGE_URI,
                                   role=ROLE_ARN,
                                   instance_count=1,
                                   instance_type='ml.m5.xlarge')

script_processor.run(
    code='train_val_test_split.py',
    inputs=[
        ProcessingInput(source=f's3://{BUCKET}/{INPUT_FOLDER}/',
                        destination='/opt/ml/processing/input')
    ],
    outputs=[
        ProcessingOutput(source='/opt/ml/processing/output/train',
                         destination=f's3://{BUCKET}/{OUTPUT_FOLDER}/train'),
        ProcessingOutput(
            source='/opt/ml/processing/output/validation',
            destination=f's3://{BUCKET}/{OUTPUT_FOLDER}/validation'),
    ])

def get_pipeline(
        region,
        role=None,
        default_bucket=None,
        model_package_group_name="CustomerChurnPackageGroup",  # Choose any name
        pipeline_name="CustomerChurnDemo-p-ewf8t7lvhivm",  # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
        base_job_prefix="CustomerChurn",  # Choose any name
):
    """Gets a SageMaker ML Pipeline instance working with on CustomerChurn data.
    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts
    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Parameters for pipeline execution
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                               default_value="ml.m5.xlarge")
    training_instance_type = ParameterString(name="TrainingInstanceType",
                                             default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus",
        default_value="PendingManualApproval",  # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval.
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value="s3://sm-pipelines-demo-data-123456789/churn.txt",  # Change this to point to the s3 location of your raw input data.
    )

    # Processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-CustomerChurn-preprocess",  # choose any name
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="CustomerChurnProcess",  # choose any name
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation",
                             source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test",
                             source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/CustomerChurnTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",  # we are using the SageMaker built-in XGBoost algorithm
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/CustomerChurn-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="binary:logistic",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="CustomerChurnTrain",
        estimator=xgb_train,
        inputs={
            "train":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["train"].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["validation"].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # Processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-CustomerChurn-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="CustomerChurnEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.
                Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation",
                             source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # Register model step that will be conditionally executed
    model_metrics = ModelMetrics(model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]
            ["S3Output"]["S3Uri"]),
        content_type="application/json",
    ))

    # Register model step that will be conditionally executed
    step_register = RegisterModel(
        name="CustomerChurnRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # Condition step for evaluating model quality and branching execution
    cond_gte = ConditionGreaterThanOrEqualTo(  # You can change the condition here
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path="binary_classification_metrics.accuracy.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),
        right=0.8,  # You can change the threshold here
    )
    step_cond = ConditionStep(
        name="CustomerChurnAccuracyCond",
        conditions=[cond_gte],
        if_steps=[step_register],
        else_steps=[],
    )

    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
Example #19
def test_end_to_end_pipeline_successful_execution(
    sagemaker_session, region_name, role, pipeline_name, wait=False
):
    model_package_group_name = f"{pipeline_name}ModelPackageGroup"
    data_path = os.path.join(DATA_DIR, "workflow")
    default_bucket = sagemaker_session.default_bucket()

    # download the input data
    local_input_path = os.path.join(data_path, "abalone-dataset.csv")
    s3 = sagemaker_session.boto_session.resource("s3")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset.csv", local_input_path
    )

    # upload the input data to our bucket
    base_uri = f"s3://{default_bucket}/{pipeline_name}"
    with open(local_input_path) as data:
        body = data.read()
        input_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset.csv",
            sagemaker_session=sagemaker_session,
        )

    # download batch transform data
    local_batch_path = os.path.join(data_path, "abalone-dataset-batch")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset-batch", local_batch_path
    )

    # upload the batch transform data
    with open(local_batch_path) as data:
        body = data.read()
        batch_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset-batch",
            sagemaker_session=sagemaker_session,
        )

    # define parameters
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
    input_data = ParameterString(
        name="InputData",
        default_value=input_data_uri,
    )
    batch_data = ParameterString(
        name="BatchData",
        default_value=batch_data_uri,
    )

    # define processing step
    framework_version = "0.23-1"
    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{pipeline_name}-process",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    step_process = ProcessingStep(
        name="AbaloneProcess",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(data_path, "abalone/preprocessing.py"),
    )

    # define training step
    model_path = f"s3://{default_bucket}/{pipeline_name}Train"
    image_uri = image_uris.retrieve(
        framework="xgboost",
        region=region_name,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        sagemaker_session=sagemaker_session,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="AbaloneTrain",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # define evaluation step
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{pipeline_name}-eval",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )
    step_eval = ProcessingStep(
        name="AbaloneEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(data_path, "abalone/evaluation.py"),
        property_files=[evaluation_report],
    )

    # define create model step
    model = Model(
        image_uri=image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_create_model = CreateModelStep(
        name="AbaloneCreateModel",
        model=model,
        inputs=inputs,
    )

    # define transform step
    transformer = Transformer(
        model_name=step_create_model.properties.ModelName,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{default_bucket}/{pipeline_name}Transform",
        sagemaker_session=sagemaker_session,
    )
    step_transform = TransformStep(
        name="AbaloneTransform",
        transformer=transformer,
        inputs=TransformInput(data=batch_data),
    )

    # define register model step
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # define condition step
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step_name=step_eval.name,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value",
        ),
        right=20.0,
    )

    step_cond = ConditionStep(
        name="AbaloneMSECond",
        conditions=[cond_lte],
        if_steps=[step_register, step_create_model, step_transform],
        else_steps=[],
    )

    # define pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
            batch_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )

    pipeline.create(role)
    execution = pipeline.start()
    execution_arn = execution.arn

    if wait:
        execution.wait()

    return execution_arn
Example #20
def test_script_processor(sagemaker_session, image_uri, cpu_instance_type,
                          output_kms_key):
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri=image_uri,
        command=["python3"],
        instance_count=1,
        instance_type=cpu_instance_type,
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-script-processor",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{
            "Key": "dummy-tag",
            "Value": "dummy-tag-value"
        }],
        sagemaker_session=sagemaker_session,
    )

    script_processor.run(
        code=os.path.join(DATA_DIR, "dummy_script.py"),
        inputs=[
            ProcessingInput(
                source=input_file_path,
                destination="/opt/ml/processing/input/container/path/",
                input_name="dummy_input",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/output/container/path/",
                output_name="dummy_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["-v"],
        wait=True,
        logs=True,
    )

    job_description = script_processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "dummy_input"

    assert job_description["ProcessingInputs"][1]["InputName"] == "code"

    assert job_description["ProcessingJobName"].startswith(
        "test-script-processor")

    assert job_description["ProcessingJobStatus"] == "Completed"

    assert job_description["ProcessingOutputConfig"][
        "KmsKeyId"] == output_kms_key
    assert job_description["ProcessingOutputConfig"]["Outputs"][0][
        "OutputName"] == "dummy_output"

    assert job_description["ProcessingResources"]["ClusterConfig"][
        "InstanceCount"] == 1
    assert (job_description["ProcessingResources"]["ClusterConfig"]
            ["InstanceType"] == cpu_instance_type)
    assert job_description["ProcessingResources"]["ClusterConfig"][
        "VolumeSizeInGB"] == 100

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri

    assert job_description["Environment"] == {
        "DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"
    }

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {
        "MaxRuntimeInSeconds": 3600
    }
Example #21
Key concepts:
- By default, the code "processor.py" is placed under "/opt/ml/processing/input/code".
- Put input files under "/opt/ml/processing/input/data" so the script can read all of them without any filtering.
- Put output files under "/opt/ml/processing/output" (see the commented sketch after this docstring).
- The Step Functions Data Science SDK can be used to chain this processing job with a subsequent training job.
"""

from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

image_uri = "342474125894.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-processor:latest"
role = "arn:aws:iam::342474125894:role/service-role/AmazonSageMaker-ExecutionRole-20190405T234154"

script_processor = ScriptProcessor(
    role=role,
    image_uri=image_uri,
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.xlarge",
)


script_processor.run(
    code="s3://sagemaker-ap-southeast-1-342474125894/riotinto/preprocessing/code/processor.py",
    inputs=[
        ProcessingInput(
            # Include data files, Can be s3 or local path
            source="s3://sagemaker-ap-southeast-1-342474125894/riotinto/preprocessing/input",
            destination="/opt/ml/processing/input/data",
            input_name="parquet",
            s3_data_type="S3Prefix",
        ),
Example #22
def get_pipeline(
    region,
    sagemaker_project_arn=None,
    role=None,
    default_bucket=None,
    model_package_group_name="restatePackageGroup",  # Choose any name
    pipeline_name="restate-p-XXXXXXXXX",  # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
    base_job_prefix="restate",  # Choose any name
):
    """Gets a SageMaker ML Pipeline instance working with on RE data.
    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts
    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.2xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus",
        default_value="PendingManualApproval",  # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval.
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=f"",  # Change this to point to the s3 location of your raw input data.
    )

    data_sources = []
    # Sagemaker session
    sess = sagemaker_session

    # You can configure this with your own bucket name, e.g.
    # bucket = "my-bucket"
    bucket = sess.default_bucket()

    data_sources.append(
        ProcessingInput(
            input_name="restate-california",
            dataset_definition=DatasetDefinition(
                local_path="/opt/ml/processing/restate-california",
                data_distribution_type="FullyReplicated",
                # You can override below to point to other database or use different queries
                athena_dataset_definition=AthenaDatasetDefinition(
                    catalog="AwsDataCatalog",
                    database="restate",
                    query_string="SELECT * FROM restate.california_10",
                    output_s3_uri=f"s3://{bucket}/athena/",
                    output_format="PARQUET",
                ),
            ),
        )
    )

    print(f"Data Wrangler export storage bucket: {bucket}")

    # unique flow export ID
    flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}"
    flow_export_name = f"flow-{flow_export_id}"

    # Output name is auto-generated from the select node's ID + output name from the flow file.
    output_name = "99ae1ec3-dd5f-453c-bfae-721dac423cd7.default"

    s3_output_prefix = f"export-{flow_export_name}/output"
    s3_output_path = f"s3://{bucket}/{s3_output_prefix}"
    print(f"Flow S3 export result path: {s3_output_path}")

    processing_job_output = ProcessingOutput(
        output_name=output_name,
        source="/opt/ml/processing/output",
        destination=s3_output_path,
        s3_upload_mode="EndOfJob",
    )

    # name of the flow file which should exist in the current notebook working directory
    flow_file_name = "sagemaker-pipeline/restate-athena-california.flow"

    # Load .flow file from current notebook working directory
    #!echo "Loading flow file from current notebook working directory: $PWD"

    with open(flow_file_name) as f:
        flow = json.load(f)

    # Upload flow to S3
    s3_client = boto3.client("s3")
    s3_client.upload_file(
        flow_file_name,
        bucket,
        f"data_wrangler_flows/{flow_export_name}.flow",
        ExtraArgs={"ServerSideEncryption": "aws:kms"},
    )

    flow_s3_uri = f"s3://{bucket}/data_wrangler_flows/{flow_export_name}.flow"

    print(f"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}")

    ## Input - Flow: restate-athena-california.flow
    flow_input = ProcessingInput(
        source=flow_s3_uri,
        destination="/opt/ml/processing/flow",
        input_name="flow",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )

    # IAM role for executing the processing job.
    iam_role = role

    # Unique processing job name. Give a unique name every time you re-execute processing jobs
    processing_job_name = f"data-wrangler-flow-processing-{flow_export_id}"

    # Data Wrangler Container URL.
    container_uri = sagemaker.image_uris.retrieve(
        framework="data-wrangler",  # we are using the Sagemaker built in xgboost algorithm
        region=region,
    )

    # Processing Job Instance count and instance type.
    instance_count = 2
    instance_type = "ml.m5.4xlarge"

    # Size in GB of the EBS volume to use for storing data during processing
    volume_size_in_gb = 30

    # Content type for each output. Data Wrangler supports CSV as default and Parquet.
    output_content_type = "CSV"

    # Network Isolation mode; default is off
    enable_network_isolation = False

    # List of tags to be passed to the processing job
    user_tags = []

    # Output configuration used as processing job container arguments
    output_config = {output_name: {"content_type": output_content_type}}

    # KMS key for per object encryption; default is None
    kms_key = None

    processor = Processor(
        role=iam_role,
        image_uri=container_uri,
        instance_count=instance_count,
        instance_type=instance_type,
        volume_size_in_gb=volume_size_in_gb,
        network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),
        sagemaker_session=sess,
        output_kms_key=kms_key,
        tags=user_tags,
    )

    data_wrangler_step = ProcessingStep(
        name="DataWranglerProcess",
        processor=processor,
        inputs=[flow_input] + data_sources,
        outputs=[processing_job_output],
        job_arguments=[f"--output-config '{json.dumps(output_config)}'"],
    )

    # Processing step for feature engineering
    # this processor does not have awswrangler installed
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-restate-preprocess",  # choose any name
        sagemaker_session=sagemaker_session,
        role=role,
    )

    step_process = ProcessingStep(
        name="Preprocess",  # choose any name
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(
                source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[
                    output_name
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/data/raw-data-dir",
            )
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=[
            "--input-data",
            data_wrangler_step.properties.ProcessingOutputConfig.Outputs[
                output_name
            ].S3Output.S3Uri,
        ],
    )

    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain"
    model_bucket_key = f"{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain"
    cache_config = CacheConfig(enable_caching=True, expire_after="30d")

    xgb_image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",  # we are using the Sagemaker built in xgboost algorithm
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=xgb_image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/restate-xgb-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        #    #objective="binary:logistic",
        #    objective="reg:linear",
        num_round=50,
        #    max_depth=5,
        #    eta=0.2,
        #    gamma=4,
        #    min_child_weight=6,
        #    subsample=0.7,
        #    silent=0,
    )

    xgb_train.set_hyperparameters(grow_policy="lossguide")

    xgb_objective_metric_name = "validation:mse"
    xgb_hyperparameter_ranges = {
        "max_depth": IntegerParameter(2, 10, scaling_type="Linear"),
    }

    xgb_tuner_log = HyperparameterTuner(
        xgb_train,
        xgb_objective_metric_name,
        xgb_hyperparameter_ranges,
        max_jobs=3,
        max_parallel_jobs=3,
        strategy="Random",
        objective_type="Minimize",
    )

    xgb_step_tuning = TuningStep(
        name="XGBHPTune",
        tuner=xgb_tuner_log,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
        cache_config=cache_config,
    )

    # dtree_image_uri = '625467769535.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-decision-tree:latest'
    dtree_image_uri = sagemaker_session.sagemaker_client.describe_image_version(
        ImageName="restate-dtree"
    )["ContainerImage"]

    dtree_train = Estimator(
        image_uri=dtree_image_uri,
        role=role,
        instance_count=1,
        instance_type=training_instance_type,
        base_job_name=f"{base_job_prefix}/restate-dtree-train",
        output_path=model_path,
        sagemaker_session=sagemaker_session,
    )

    dtree_objective_metric_name = "validation:mse"
    dtree_metric_definitions = [{"Name": "validation:mse", "Regex": r"mse:(\S+)"}]

    dtree_hyperparameter_ranges = {
        "max_depth": IntegerParameter(10, 50, scaling_type="Linear"),
        "max_leaf_nodes": IntegerParameter(2, 12, scaling_type="Linear"),
    }

    dtree_tuner_log = HyperparameterTuner(
        dtree_train,
        dtree_objective_metric_name,
        dtree_hyperparameter_ranges,
        dtree_metric_definitions,
        max_jobs=3,
        max_parallel_jobs=3,
        strategy="Random",
        objective_type="Minimize",
    )

    dtree_step_tuning = TuningStep(
        name="DTreeHPTune",
        tuner=dtree_tuner_log,
        inputs={
            "training": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
        cache_config=cache_config,
    )

    dtree_script_eval = ScriptProcessor(
        image_uri=dtree_image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-dtree-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    dtree_evaluation_report = PropertyFile(
        name="EvaluationReportDTree",
        output_name="dtree_evaluation",
        path="dtree_evaluation.json",
    )

    dtree_step_eval = ProcessingStep(
        name="DTreeEval",
        processor=dtree_script_eval,
        inputs=[
            ProcessingInput(
                # source=dtree_step_train.properties.ModelArtifacts.S3ModelArtifacts,
                source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(
                output_name="dtree_evaluation", source="/opt/ml/processing/evaluation"
            ),
        ],
        code=os.path.join(BASE_DIR, "dtree_evaluate.py"),
        property_files=[dtree_evaluation_report],
    )

    xgb_script_eval = ScriptProcessor(
        image_uri=xgb_image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-xgb-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    xgb_evaluation_report = PropertyFile(
        name="EvaluationReportXGBoost",
        output_name="xgb_evaluation",
        path="xgb_evaluation.json",
    )

    xgb_step_eval = ProcessingStep(
        name="XGBEval",
        processor=xgb_script_eval,
        inputs=[
            ProcessingInput(
                source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="xgb_evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "xgb_evaluate.py"),
        property_files=[xgb_evaluation_report],
    )

    xgb_model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/xgb_evaluation.json".format(
                xgb_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )

    dtree_model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/dtree_evaluation.json".format(
                dtree_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][
                    "S3Uri"
                ]
            ),
            content_type="application/json",
        )
    )

    xgb_eval_metrics = JsonGet(
        step=xgb_step_eval,
        property_file=xgb_evaluation_report,
        json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
    )

    dtree_eval_metrics = JsonGet(
        step=dtree_step_eval,
        property_file=dtree_evaluation_report,
        json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
    )
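    # The json_path values above assume the evaluation scripts write a report shaped like
    # the hedged sketch below (hypothetical variable name; the real keys come from the
    # report_dict built in dtree_evaluate.py / xgb_evaluate.py), e.g.:
    #
    #     report_dict = {"regression_metrics": {"r2s": {"value": r2}}}
    #     with open("/opt/ml/processing/evaluation/xgb_evaluation.json", "w") as f:
    #         json.dump(report_dict, f)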

    # Register model step that will be conditionally executed
    dtree_step_register = RegisterModel(
        name="DTreeReg",
        estimator=dtree_train,
        model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=dtree_model_metrics,
    )

    # Register model step that will be conditionally executed
    xgb_step_register = RegisterModel(
        name="XGBReg",
        estimator=xgb_train,
        model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=xgb_model_metrics,
    )

    # Condition step for evaluating model quality and branching execution
    cond_gte = ConditionGreaterThanOrEqualTo(  # You can change the condition here
        left=JsonGet(
            step=dtree_step_eval,
            property_file=dtree_evaluation_report,
            json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),
        right=JsonGet(
            step=xgb_step_eval,
            property_file=xgb_evaluation_report,
            json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),  # Here the right-hand side is another metric (the XGBoost R2) rather than a fixed threshold
    )

    step_cond = ConditionStep(
        name="AccuracyCond",
        conditions=[cond_gte],
        if_steps=[dtree_step_register],
        else_steps=[xgb_step_register],
    )
    create_date = time.strftime("%Y-%m-%d-%H-%M-%S")

    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data
        ],
        pipeline_experiment_config=PipelineExperimentConfig(
            pipeline_name + "-" + create_date, "restate-{}".format(create_date)
        ),
        steps=[
            data_wrangler_step,
            step_process,
            dtree_step_tuning,
            xgb_step_tuning,
            dtree_step_eval,
            xgb_step_eval,
            step_cond,
        ],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
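# A minimal usage sketch (assumed region value; the upsert/start calls mirror the
# create/start pattern used elsewhere in these examples):
if __name__ == "__main__":
    pipeline = get_pipeline(region="us-east-1")  # assumed region; adjust to your account
    pipeline.upsert(role_arn=sagemaker.get_execution_role())
    execution = pipeline.start()
    execution.wait()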
Example #23
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="AbalonePackageGroup",
    pipeline_name="AbalonePipeline",
    base_job_prefix="Abalone",
):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval"
    )
    input_data = ParameterString(
        name="InputData",
        default_value="",  # Set this to the S3 URI of your raw input data.
    )

    # Create cache configuration
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    # Create SKLearn processor object
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=role,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name="credit-processing-job"
    )

    # Use the sklearn_processor in a Sagemaker pipelines ProcessingStep
    step_preprocess_data = ProcessingStep(
        name="PreprocessCreditData",
        processor=sklearn_processor,
        cache_config=cache_config,
        inputs=[
          ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),  
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/output/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/output/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/output/test"),
            ProcessingOutput(output_name="baseline_with_headers", source="/opt/ml/processing/output/baseline")
        ],
        code=os.path.join(BASE_DIR, "preprocessing.py"),
    )


    # Where to store the trained model
    model_path = f"s3://{default_bucket}/CreditTrain"

    # Fetch container to use for training
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.2-2",
        py_version="py3",
        instance_type=training_instance_type,
    )

    # Create XGBoost estimator object
    xgb_estimator = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        disable_profiler=True,
    )

    # Specify hyperparameters
    xgb_estimator.set_hyperparameters(max_depth=5,
                            eta=0.2,
                            gamma=4,
                            min_child_weight=6,
                            subsample=0.8,
                            objective='binary:logistic',
                            num_round=25)

    # Use the xgb_estimator in a SageMaker Pipelines TrainingStep.
    # NOTE how the input to the training job directly references the output of the previous step.
    step_train_model = TrainingStep(
        name="TrainCreditModel",
        estimator=xgb_estimator,
        cache_config=cache_config,
        inputs={
            "train": TrainingInput(
                s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv"
            ),
            "validation": TrainingInput(
                s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv"
            )
        },
    )

    # Create ScriptProcessor object.
    evaluate_model_processor = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name="script-credit-eval",
        role=role,
    )

    # Create a PropertyFile
    # We use a PropertyFile to be able to reference outputs from a processing step, for instance to use in a condition step, which we'll see later on.
    # For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html
    evaluation_report = PropertyFile(
        name="EvaluationReport",
        output_name="evaluation",
        path="evaluation.json"
    )
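    # The condition step later in this pipeline reads
    # "binary_classification_metrics.accuracy.value" through this PropertyFile, so
    # evaluation.py is assumed to write something like the following sketch
    # (hypothetical variable name) to /opt/ml/processing/evaluation/evaluation.json:
    #
    #     report_dict = {"binary_classification_metrics": {"accuracy": {"value": accuracy}}}
    #     with open("/opt/ml/processing/evaluation/evaluation.json", "w") as f:
    #         json.dump(report_dict, f)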

    # Use the evaluate_model_processor in a Sagemaker pipelines ProcessingStep. 
    step_evaluate_model = ProcessingStep(
        name="EvaluateCreditModel",
        processor=evaluate_model_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(
                source=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model"
            ),
            ProcessingInput(
                source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test"
            )
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluation.py"),
        property_files=[evaluation_report],
    )


    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_evaluate_model.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json"
        )
    )

    # Create a RegisterModel step, which registers your model with the SageMaker Model Registry.
    step_register_model = RegisterModel(
        name="RegisterCreditModel",
        estimator=xgb_estimator,
        model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics
    )


    # Create Processor object using the model monitor image
    baseline_processor = sagemaker.processing.Processor(
        base_job_name="credit-risk-baseline-processor",
        image_uri=sagemaker.image_uris.retrieve(framework='model-monitor', region=region),
        role=role,
        instance_count=1,
        instance_type=processing_instance_type,
        env = {
            "dataset_format": "{\"csv\": {\"header\": true} }",
            "dataset_source": "/opt/ml/processing/sm_input",
            "output_path": "/opt/ml/processing/sm_output",
            "publish_cloudwatch_metrics": "Disabled"
        }
    )

    # Create a Sagemaker Pipeline step, using the baseline_processor.
    step_create_data_baseline = ProcessingStep(
        name="CreateModelQualityBaseline",
        processor=baseline_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(
                source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "baseline_with_headers"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/sm_input",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/sm_output",
                destination="s3://{}/{}/baseline".format(default_bucket, base_job_prefix),
                output_name="baseline_result",
            )
        ],
    )



    # Create Condition
    cond_gte = ConditionGreaterThanOrEqualTo(
        left=JsonGet(
            step=step_evaluate_model,
            property_file=evaluation_report,
            json_path="binary_classification_metrics.accuracy.value"
        ),
        right=0.7
    )

    # Create a Sagemaker Pipelines ConditionStep, using the condition we just created.
    step_cond = ConditionStep(
        name="AccuracyCondition",
        conditions=[cond_gte],
        if_steps=[step_register_model],
        else_steps=[], 
    )

    from sagemaker.workflow.pipeline import Pipeline

    # Create a Sagemaker Pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type, 
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_preprocess_data, step_train_model, step_evaluate_model, step_create_data_baseline, step_cond],
    )
    
    return pipeline
Example #24
        's3://sagemaker-us-east-1-513905722774/sagemaker_examples/data/temp_audio_raw',
        destination="/opt/ml/processing/input/data"),
]
OUTPUTS = [
    ProcessingOutput(
        source="/opt/ml/processing/output/train",
        destination=
        's3://sagemaker-us-east-1-513905722774/sagemaker_examples/data/temp_audio_features/train'
    ),
    ProcessingOutput(
        source="/opt/ml/processing/output/test",
        destination=
        's3://sagemaker-us-east-1-513905722774/sagemaker_examples/data/temp_audio_features/test'
    ),
]

# PROCESSOR BUILD AND RUN
processor = ScriptProcessor(base_job_name=JOB_NAME,
                            tags=TAGS,
                            role=ROLE_SAGEMAKER,
                            instance_type=INSTANCE_TYPE,
                            instance_count=INSTANCE_COUNT,
                            image_uri=IMAGE_URI,
                            command=['python3'])

processor.run(
    code=SCRIPT,
    arguments=ARGUMENTS,
    inputs=INPUTS,
    outputs=OUTPUTS,
)
Example #25
def setup_workflow(project, purpose, workflow_execution_role, script_dir,
                   ecr_repository):
    """ to setup all needed for a step function with sagemaker.
    arg: 
        project: project name under sagemaker
        purpose: subproject
        workflow_execution_role: arn to execute step functions
        script_dir: processing file name, like a .py file
        ecr_repository: ecr repository name
    return:
        workflow: a stepfunctions.workflow.Workflow instance  
    example: 
        PROJECT = '[dpt-proj-2022]'
        PURPOSE = '[processing]'
        WORKFLOW_EXECUTION_ROLE = "arn:aws-cn:iam::[*********]:role/[**************]"
        SCRIPT_DIR = "[processing].py"
        ECR_REPOSITORY = '[ecr-2022]'
    """

    # SageMaker Session setup
    # ========================================================================================
    # SageMaker Session
    # ====================================
    account_id = boto3.client('sts').get_caller_identity().get('Account')
    role = sagemaker.get_execution_role()

    # Storage
    # ====================================
    session = sagemaker.Session()
    region = session.boto_region_name
    s3_output = session.default_bucket()

    # Code storage
    # ==================
    s3_prefix = '{}/{}'.format(project, purpose)
    s3_prefix_code = '{}/code'.format(s3_prefix)
    s3CodePath = 's3://{}/{}/code'.format(s3_output, s3_prefix)

    ## preprocess & prediction
    script_list = [script_dir]

    for script in script_list:
        session.upload_data(script,
                            bucket=session.default_bucket(),
                            key_prefix=s3_prefix_code)

    # ECR environment
    # ====================================
    uri_suffix = 'amazonaws.com.cn'
    tag = ':latest'
    ecr_repository_uri = '{}.dkr.ecr.{}.{}/{}'.format(account_id, region,
                                                      uri_suffix,
                                                      ecr_repository + tag)

    # SageMaker Experiments setup
    # ========================================================================================
    experiment = Experiment.create(
        experiment_name="{}-{}".format(project, int(time.time())),
        description="machine learning project",
        sagemaker_boto_client=boto3.client('sagemaker'))
    print(experiment)

    execution_input = ExecutionInput(schema={
        "ProcessingJobName": str,
        "ResultPath": str,
    })

    # setup script processor
    script_processor = ScriptProcessor(command=['python3'],
                                       image_uri=ecr_repository_uri,
                                       role=role,
                                       instance_count=1,
                                       instance_type='ml.m5.4xlarge')

    # Step
    # ========================================================================================

    optimizing_step = steps.ProcessingStep(
        "Processing Step",
        processor=script_processor,
        job_name=execution_input["ProcessingJobName"],
        inputs=[
            ProcessingInput(source=s3CodePath,
                            destination='/opt/ml/processing/input/code',
                            input_name='code')
        ],
        outputs=[
            ProcessingOutput(output_name=purpose,
                             destination=execution_input["ResultPath"],
                             source='/opt/ml/processing/{}'.format(purpose))
        ],
        container_entrypoint=[
            "python3", "/opt/ml/processing/input/code/" + script_dir
        ],
    )

    # Fail State
    # ========================================================================================
    failed_state = steps.states.Fail("Processing Workflow failed",
                                     cause="SageMakerProcessingJobFailed")

    catch_state_processing = steps.states.Catch(
        error_equals=["States.TaskFailed"], next_step=failed_state)

    # Create Workflow
    # ========================================================================================
    optimizing_step.add_catch(catch_state_processing)

    workflow_name = "workflow-{}-{}".format(project, purpose).upper()
    workflow_graph = steps.Chain([optimizing_step])

    workflow = Workflow(name=workflow_name,
                        definition=workflow_graph,
                        role=workflow_execution_role)

    workflow.create()
    return workflow
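# A minimal usage sketch (placeholder values copied from the docstring above; the S3
# result path is hypothetical): create the workflow, then start an execution with the
# inputs declared in the ExecutionInput schema.
if __name__ == "__main__":
    PROJECT = '[dpt-proj-2022]'
    PURPOSE = '[processing]'
    WORKFLOW_EXECUTION_ROLE = "arn:aws-cn:iam::[*********]:role/[**************]"
    SCRIPT_DIR = "[processing].py"
    ECR_REPOSITORY = '[ecr-2022]'

    workflow = setup_workflow(PROJECT, PURPOSE, WORKFLOW_EXECUTION_ROLE,
                              SCRIPT_DIR, ECR_REPOSITORY)
    execution = workflow.execute(inputs={
        "ProcessingJobName": "processing-{}".format(int(time.time())),  # must be unique per run
        "ResultPath": "s3://your-bucket/your-prefix/results",  # hypothetical output location
    })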
Example #26
def process(
    session: SagemakerSession,
    role,
    script,
    # script_source,
    inputs=None,
    outputs=None,
    dependencies=None,
    requirements=None,
    configuration_script=None,
    configuration_command=None,
    base_job_name=PROCESSING_JOB_NAME,
    job_name=None,
    image=Images.PROCESSING.tag,
    image_path=Images.PROCESSING.path,
    image_accounts=",".join(Images.PROCESSING.accounts),
    instance=PROCESSING_INSTANCE,
    volume_size=30,
    runtime_seconds=PROCESSING_RUNTIME_SECONDS,
    output_mount=OUTPUT_MOUNT,
    input_mount=INPUT_MOUNT,
    module_mount=MODULE_MOUNT,
    python='python3',
    wait=True,
    logs=True,
    arguments=None,
    tags=None,
    output_json=None,
    env=None
):
    iam = session.boto_session.client('iam')

    image_uri = ecr_ensure_image(
        image=Image(
            path=image_path,
            tag=image,
            accounts=image_accounts.split(",")
        ),
        session=session.boto_session
    )
    role = ensure_processing_role(iam=iam, role_name=role)
    if inputs is None:
        inputs = {}
    if outputs is None:
        outputs = {}
    if dependencies is None:
        dependencies = {}
    if tags is None:
        tags = {}
    else:
        tags = tags.copy()
    if arguments is None:
        arguments = {}
    else:
        arguments = arguments.copy()
    # if module_mount is not None and len(module_mount)> 0:
    #    command = ["PYTHONPATH={module_mount};${{PYTHONPATH}}".format(module_mount=module_mount), "python3"]
    # else:
    #    command = ['python3']
    command = ['sh']
    path_arguments = {}
    processing_inputs = []
    s3 = session.boto_session.client('s3')
    for name, source in inputs.items():
        processing_input, path_argument = make_processing_input(
            mount=input_mount,
            name=name,
            source=source.local,
            mode=source.mode,
            s3=s3
        )
        processing_inputs.append(processing_input)
        path_arguments[name] = path_argument
    for name, source in dependencies.items():
        processing_input, path_argument = make_processing_input(
            mount=module_mount,
            name=name,
            source=source,
            s3=s3
        )
        processing_inputs.append(processing_input)
        path_arguments[name] = path_argument

    script_remote = "{}/{}".format(module_mount, os.path.basename(script))
    processing_inputs.append(
        ProcessingInput(
            source=script,
            destination=module_mount,
            input_name="aws_sagemaker_remote_script",
            # s3_data_type='S3Prefix',
            s3_input_mode='File',
            # s3_data_distribution_type='FullyReplicated',
            # s3_compression_type='None'
        )
    )
    if env:
        env = env.copy()
    else:
        env = {}
    env.update({
        "AWS_SAGEMAKER_REMOTE_MODULE_MOUNT": module_mount,
        "AWS_SAGEMAKER_REMOTE_PYTHON": python,
        "AWS_SAGEMAKER_REMOTE_SCRIPT": script_remote,
        "IS_SAGEMAKER": "1"
    })

    if requirements:
        requirements_remote = "{}/requirements_txt/{}".format(
            module_mount, 'requirements.txt')
        env['AWS_SAGEMAKER_REMOTE_REQUIREMENTS'] = requirements_remote
        processing_inputs.append(
            ProcessingInput(
                source=requirements,
                destination="{}/requirements_txt".format(module_mount),
                input_name="aws_sagemaker_remote_requirements",
                s3_input_mode='File',
            )
        )

    if configuration_script:
        configuration_script_remote = "{}/{}".format(
            module_mount, os.path.basename(configuration_script))
        env['AWS_SAGEMAKER_REMOTE_CONFIGURATION_SCRIPT'] = configuration_script_remote
        processing_inputs.append(
            ProcessingInput(
                source=configuration_script,
                destination=module_mount,
                input_name="aws_sagemaker_remote_configuration_script",
                s3_input_mode='File'
            )
        )

    if configuration_command and len(configuration_command) > 0:
        env['AWS_SAGEMAKER_REMOTE_CONFIGURATION_COMMAND'] = configuration_command

    tags["Source"] = 'aws-sagemaker-remote'
    tags["BaseJobName"] = base_job_name
    tags = make_tags(tags)
    print("Tags: {}".format(tags))
    processor = ScriptProcessor(
        role,
        image_uri=image_uri,
        instance_count=1,
        instance_type=instance,
        command=command,
        volume_size_in_gb=volume_size,
        # volume_kms_key=None,
        # output_kms_key=None,
        max_runtime_in_seconds=runtime_seconds,
        base_job_name=base_job_name,
        sagemaker_session=session,
        env=env,
        tags=tags
        # network_config=None
    )
    processing_outputs = []
    for name, dest in outputs.items():
        # todo: move into PathArgument class
        if not ((not dest.remote) or dest.remote == 'default' or dest.remote.startswith('s3://')):
            raise ValueError("Argument [{}] must be either `default` or an S3 url (`s3://...`). Value given was [{}].".format(
                variable_to_argparse("{}_s3".format(name)), dest.remote))
        source = "{}/{}".format(output_mount, name)
        if dest.mode:
            assert dest.mode in ['EndOfJob', 'Continuous']
        processing_outputs.append(
            ProcessingOutput(
                source=source,
                destination=dest.remote if dest.remote and dest.remote != 'default' else None,
                output_name=name,
                s3_upload_mode=dest.mode or 'EndOfJob'
            ))
        path_arguments[name] = source

    ensure_eol(PROCESSING_SCRIPT)
    code = Path(PROCESSING_SCRIPT).as_uri()
    if job_name is None or len(str(job_name).strip()) == 0:
        job_name = None
    else:
        job_name = str(job_name).strip()

    arguments.update(path_arguments)
    processor.run(
        code=code,
        inputs=processing_inputs,
        outputs=processing_outputs,
        wait=False,
        logs=logs,
        job_name=job_name,
        arguments=sagemaker_arguments(vargs=arguments)
    )
    job = processor.latest_job
    if output_json:
        obj = job.describe()
        #print("Describe: {}".format(obj))
        os.makedirs(os.path.dirname(
            os.path.abspath(output_json)), exist_ok=True)
        with open(output_json, 'w') as f:
            json.dump(obj, f, default=json_converter, indent=4)

    if wait:
        job.wait(logs=logs)
    return processor
Example #27
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="TestPackageGroup",
    pipeline_name="TestPipeline",
    base_job_prefix="Test",
):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                               default_value="ml.m5.xlarge")
    training_instance_type = ParameterString(name="TrainingInstanceType",
                                             default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval")
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=
        f"s3://sagemaker-servicecatalog-seedcode-{region}/dataset/abalone-dataset.csv",
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-test-preprocess",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="PreprocessTestData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation",
                             source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test",
                             source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/TestTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/test-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="TrainTestModel",
        estimator=xgb_train,
        inputs={
            "train":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["train"].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["validation"].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-test-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="TestEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateTestModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.
                Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation",
                             source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]
            ["S3Output"]["S3Uri"]),
                                       content_type="application/json"))
    step_register = RegisterModel(
        name="RegisterTestModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(step=step_eval,
                     property_file=evaluation_report,
                     json_path="regression_metrics.mse.value"),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSETestEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
Example #28
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="sagemaker-group-insurance",
    pipeline_name="sagemaker-pipeline-insurance",
    base_job_prefix="sagemaker-featurestore-insurance",
):
    """Gets a SageMaker ML Pipeline instance working with on WIP data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="Approved"
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-insurance-preprocess",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="PreprocessInsuranceData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input_dataset_1", "41214", 
                       "--input_dataset_2", "41215",],
    )
    
    '''
    # feature store step
    feature_path = 's3://' + default_bucket+'/'+base_job_prefix + '/features'
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    feature_processor = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-insurance-feature-store",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_feature = ProcessingStep(
        name="FeatureStoreInsuranceData",
        processor=feature_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/training_input"),
        ],
        code=os.path.join(BASE_DIR, "feature_store.py"),
        job_arguments=["feature_s3_url", feature_path, "--feature_group_name", "sagemaker-featurestore-insurance"],
    )
    '''    

    # training step for generating model artifacts
    model_path = 's3://' + default_bucket+'/'+base_job_prefix + '/features'
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/insurance-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(objective="reg:tweedie", num_round=50)
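    # Referencing step_process.properties in the training inputs below creates a data
    # dependency, so this step consumes the "train"/"validation" outputs of the
    # processing step and is automatically ordered after it.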
    step_train = TrainingStep(
        name="TrainAbaloneModel",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-wip-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
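    # The PropertyFile binds the "evaluation" output of the evaluation step to
    # evaluation.json, so the JsonGet expression in the condition step below can
    # read metrics out of that file.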
    evaluation_report = PropertyFile(
        name="WipEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateWipModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
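    # step_eval.arguments resolves the processing job request at pipeline definition
    # time, which is how the S3 URI of evaluation.json is wired into the model metrics.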
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json"
        )
    )
    step_register = RegisterModel(
        name="register-insurance-model",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value"
        ),
        right=6.0,
    )
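    # if_steps is empty, so RegisterModel runs only when the condition is false,
    # i.e. when the reported MSE is greater than 6.0.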
    step_cond = ConditionStep(
        name="CheckMSEWipEvaluation",
        conditions=[cond_lte],
        if_steps=[],
        else_steps=[step_register],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
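

# A minimal usage sketch (not part of the original example); the region, role ARN,
# and bucket below are placeholders. Declared pipeline parameters can be overridden
# when an execution is started.
if __name__ == "__main__":
    pipeline = get_pipeline(
        region="us-west-2",
        role="arn:aws:iam::111122223333:role/SageMakerPipelineExecutionRole",
        default_bucket="my-insurance-artifact-bucket",
    )
    # Create or update the pipeline definition, then start an execution and wait for it.
    pipeline.upsert(role_arn="arn:aws:iam::111122223333:role/SageMakerPipelineExecutionRole")
    execution = pipeline.start(parameters={"ModelApprovalStatus": "PendingManualApproval"})
    execution.wait()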