Пример #1
0
def test_training_step(sagemaker_session):
    """Verify the request and job-name property of a cached ``TrainingStep``.

    An ``Estimator`` with a 500 ms system-monitor interval and no rules is
    wrapped in a ``TrainingStep`` with caching enabled; the step's
    ``to_request()`` output is compared with the full expected dictionary.
    """
    train_manifest_uri = f"s3://{BUCKET}/train_manifest"
    bucket_root_uri = f"s3://{BUCKET}/"

    training_estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="c4.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
    )
    step = TrainingStep(
        name="MyTrainingStep",
        estimator=training_estimator,
        inputs=TrainingInput(train_manifest_uri),
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )

    # Expected training-job arguments, kept separate from the step envelope.
    expected_arguments = {
        "AlgorithmSpecification": {
            "TrainingImage": IMAGE_URI,
            "TrainingInputMode": "File",
        },
        "InputDataConfig": [
            {
                "ChannelName": "training",
                "DataSource": {
                    "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri": train_manifest_uri,
                    }
                },
            }
        ],
        "OutputDataConfig": {"S3OutputPath": bucket_root_uri},
        "ResourceConfig": {
            "InstanceCount": 1,
            "InstanceType": "c4.4xlarge",
            "VolumeSizeInGB": 30,
        },
        "RoleArn": ROLE,
        "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
        "ProfilerConfig": {
            "ProfilingIntervalInMilliseconds": 500,
            "S3OutputPath": bucket_root_uri,
        },
    }
    expected_request = {
        "Name": "MyTrainingStep",
        "Type": "Training",
        "Arguments": expected_arguments,
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }

    assert step.to_request() == expected_request
    assert step.properties.TrainingJobName.expr == {
        "Get": "Steps.MyTrainingStep.TrainingJobName"
    }
Пример #2
0
def test_processing_step_normalizes_args_with_no_code(mock_normalize_args,
                                                      script_processor):
    """Check the normalize-args call made by a ``ProcessingStep`` without code.

    The step is built with no ``code`` argument; after ``to_request()`` the
    mocked normalizer must have been called with ``job_name=None`` and
    ``code=None`` alongside the step's own arguments, inputs and outputs.
    """
    manifest_uri = f"s3://{BUCKET}/processing_manifest"
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=script_processor,
        inputs=[
            ProcessingInput(source=manifest_uri, destination="processing_manifest"),
        ],
        outputs=[
            ProcessingOutput(source=manifest_uri, destination="processing_manifest"),
        ],
        job_arguments=["arg1", "arg2"],
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )

    # The mock has to hand back an (inputs, outputs) pair before the request
    # is built.
    mock_normalize_args.return_value = [step.inputs, step.outputs]
    step.to_request()

    expected_call_kwargs = {
        "job_name": None,
        "arguments": step.job_arguments,
        "inputs": step.inputs,
        "outputs": step.outputs,
        "code": None,
    }
    mock_normalize_args.assert_called_with(**expected_call_kwargs)
Пример #3
0
def test_processing_step(sagemaker_session):
    """Verify the request produced by a parameterized ``ProcessingStep``.

    The processor's instance type/count and the input's S3 source are
    pipeline parameters; the expected request must carry those same
    parameter objects through unchanged. The expected image URI and role are
    taken from ``IMAGE_URI`` and ``ROLE`` (the values the processor is built
    with) so the expectation cannot drift from the module constants.
    """
    processing_input_data_uri_parameter = ParameterString(
        name="ProcessingInputDataUri", default_value=f"s3://{BUCKET}/processing_manifest"
    )
    instance_type_parameter = ParameterString(name="InstanceType", default_value="ml.m4.4xlarge")
    instance_count_parameter = ParameterInteger(name="InstanceCount", default_value=1)
    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_parameter,
        instance_type=instance_type_parameter,
        sagemaker_session=sagemaker_session,
    )
    inputs = [
        ProcessingInput(
            source=processing_input_data_uri_parameter,
            destination="processing_manifest",
        )
    ]
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = ProcessingStep(
        name="MyProcessingStep",
        processor=processor,
        inputs=inputs,
        outputs=[],
        cache_config=cache_config,
    )
    assert step.to_request() == {
        "Name": "MyProcessingStep",
        "Type": "Processing",
        "Arguments": {
            # Mirrors the image_uri passed to the Processor above.
            "AppSpecification": {"ImageUri": IMAGE_URI},
            "ProcessingInputs": [
                {
                    "InputName": "input-1",
                    "AppManaged": False,
                    "S3Input": {
                        "LocalPath": "processing_manifest",
                        "S3CompressionType": "None",
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3InputMode": "File",
                        "S3Uri": processing_input_data_uri_parameter,
                    },
                }
            ],
            "ProcessingResources": {
                "ClusterConfig": {
                    "InstanceCount": instance_count_parameter,
                    "InstanceType": instance_type_parameter,
                    "VolumeSizeInGB": 30,
                }
            },
            # Mirrors the role passed to the Processor above.
            "RoleArn": ROLE,
        },
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
Пример #4
0
def test_processing_step_normalizes_args(mock_normalize_args,
                                         sagemaker_session):
    """Check the normalize-args call made by a ``ProcessingStep`` with code.

    A fully configured ``ScriptProcessor`` is wrapped in a step that supplies
    ``code="foo.py"``; after ``to_request()`` the mocked normalizer must have
    been called with the step's arguments, inputs, outputs and code.
    """
    manifest_uri = f"s3://{BUCKET}/processing_manifest"
    isolated_network = NetworkConfig(
        subnets=["my_subnet_id"],
        security_group_ids=["my_security_group_id"],
        enable_network_isolation=True,
        encrypt_inter_container_traffic=True,
    )
    script_processor = ScriptProcessor(
        role=ROLE,
        image_uri="012345678901.dkr.ecr.us-west-2.amazonaws.com/my-custom-image-uri",
        command=["python3"],
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=isolated_network,
        sagemaker_session=sagemaker_session,
    )

    step = ProcessingStep(
        name="MyProcessingStep",
        processor=script_processor,
        code="foo.py",
        inputs=[
            ProcessingInput(source=manifest_uri, destination="processing_manifest"),
        ],
        outputs=[
            ProcessingOutput(source=manifest_uri, destination="processing_manifest"),
        ],
        job_arguments=["arg1", "arg2"],
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )

    # The mock has to hand back an (inputs, outputs) pair before the request
    # is built.
    mock_normalize_args.return_value = [step.inputs, step.outputs]
    step.to_request()

    expected_call_kwargs = {
        "arguments": step.job_arguments,
        "inputs": step.inputs,
        "outputs": step.outputs,
        "code": step.code,
    }
    mock_normalize_args.assert_called_with(**expected_call_kwargs)
Пример #5
0
def test_processing_step_with_processor(pipeline_session, processing_input):
    """Build a ``ProcessingStep`` from ``step_args`` and check its definition.

    ``processor.run`` under the pipeline session is expected to emit exactly
    one ``UserWarning`` mentioning "Running within a PipelineSession", while
    constructing the step from the resulting ``step_args`` must emit none.
    The step is then checked inside the serialized pipeline definition.
    """
    processor = Processor(
        image_uri=IMAGE_URI,
        role=sagemaker.get_execution_role(),
        instance_count=1,
        instance_type=INSTANCE_TYPE,
        sagemaker_session=pipeline_session,
    )

    with warnings.catch_warnings(record=True) as run_warnings:
        step_args = processor.run(inputs=processing_input)
        assert len(run_warnings) == 1
        last_warning = run_warnings[-1]
        assert issubclass(last_warning.category, UserWarning)
        assert "Running within a PipelineSession" in str(last_warning.message)

    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )

    with warnings.catch_warnings(record=True) as step_warnings:
        step = ProcessingStep(
            name="MyProcessingStep",
            step_args=step_args,
            description="ProcessingStep description",
            display_name="MyProcessingStep",
            depends_on=["TestStep", "SecondTestStep"],
            cache_config=cache_config,
            property_files=[evaluation_report],
        )
        assert len(step_warnings) == 0

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )
    first_step_definition = json.loads(pipeline.definition())["Steps"][0]
    expected_property_file = {
        "FilePath": "evaluation.json",
        "OutputName": "evaluation",
        "PropertyFileName": "EvaluationReport",
    }
    assert first_step_definition == {
        "Name": "MyProcessingStep",
        "Description": "ProcessingStep description",
        "DisplayName": "MyProcessingStep",
        "Type": "Processing",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": step_args,
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
        "PropertyFiles": [expected_property_file],
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
Пример #6
0
def test_add_depends_on(sagemaker_session):
    """Check ``DependsOn`` for steps given as objects and added later by name.

    Step 1 has no dependencies, step 2 depends on the step-1 object, and
    step 3 depends on the step-1 object plus step 2's name added through
    ``add_depends_on``.
    """
    data_uri_param = ParameterString(
        name="ProcessingInputDataUri",
        default_value=f"s3://{BUCKET}/processing_manifest",
    )
    instance_type_param = ParameterString(
        name="InstanceType", default_value="ml.m4.4xlarge"
    )
    instance_count_param = ParameterInteger(name="InstanceCount", default_value=1)

    processor = Processor(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count_param,
        instance_type=instance_type_param,
        sagemaker_session=sagemaker_session,
    )
    # Keyword arguments common to all three steps; ``outputs`` is passed
    # separately so that each step receives its own fresh empty list.
    shared_kwargs = {
        "processor": processor,
        "inputs": [
            ProcessingInput(
                source=data_uri_param,
                destination="processing_manifest",
            )
        ],
        "cache_config": CacheConfig(enable_caching=True, expire_after="PT1H"),
    }

    first = ProcessingStep(name="MyProcessingStep-1", outputs=[], **shared_kwargs)
    second = ProcessingStep(
        name="MyProcessingStep-2", depends_on=[first], outputs=[], **shared_kwargs
    )
    third = ProcessingStep(
        name="MyProcessingStep-3", depends_on=[first], outputs=[], **shared_kwargs
    )
    third.add_depends_on([second.name])

    assert "DependsOn" not in first.to_request()
    assert second.to_request()["DependsOn"] == ["MyProcessingStep-1"]
    assert third.to_request()["DependsOn"] == [
        "MyProcessingStep-1",
        "MyProcessingStep-2",
    ]
Пример #7
0
def test_emr_step_with_one_step_config(sagemaker_session):
    """Verify the request and property expressions of an ``EMRStep``.

    The step is built from a single ``EMRStepConfig``, gets a second
    dependency through ``add_depends_on``, and is then checked both as a
    request dictionary and through its ``properties`` accessors.
    """
    jar_location = "s3:/script-runner/script-runner.jar"
    jar_args = ["--arg_0", "arg_0_value"]
    hadoop_properties = [
        {"Key": "Foo", "Value": "Foo_value"},
        {"Key": "Bar", "Value": "Bar_value"},
    ]

    emr_step = EMRStep(
        name="MyEMRStep",
        display_name="MyEMRStep",
        description="MyEMRStepDescription",
        cluster_id="MyClusterID",
        step_config=EMRStepConfig(
            jar=jar_location,
            args=jar_args,
            main_class="com.my.main",
            properties=hadoop_properties,
        ),
        depends_on=["TestStep"],
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )
    emr_step.add_depends_on(["SecondTestStep"])

    expected_hadoop_jar_step = {
        "Args": ["--arg_0", "arg_0_value"],
        "Jar": "s3:/script-runner/script-runner.jar",
        "MainClass": "com.my.main",
        "Properties": [
            {"Key": "Foo", "Value": "Foo_value"},
            {"Key": "Bar", "Value": "Bar_value"},
        ],
    }
    assert emr_step.to_request() == {
        "Name": "MyEMRStep",
        "Type": "EMR",
        "Arguments": {
            "ClusterId": "MyClusterID",
            "StepConfig": {"HadoopJarStep": expected_hadoop_jar_step},
        },
        "DependsOn": ["TestStep", "SecondTestStep"],
        "DisplayName": "MyEMRStep",
        "Description": "MyEMRStepDescription",
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }

    props = emr_step.properties
    assert props.ClusterId == "MyClusterID"

    # Each property object must resolve to a "Get" expression on its path.
    expected_expressions = [
        (props.ActionOnFailure, "Steps.MyEMRStep.ActionOnFailure"),
        (props.Config.Args, "Steps.MyEMRStep.Config.Args"),
        (props.Config.Jar, "Steps.MyEMRStep.Config.Jar"),
        (props.Config.MainClass, "Steps.MyEMRStep.Config.MainClass"),
        (props.Id, "Steps.MyEMRStep.Id"),
        (props.Name, "Steps.MyEMRStep.Name"),
        (props.Status.State, "Steps.MyEMRStep.Status.State"),
        (
            props.Status.FailureDetails.Reason,
            "Steps.MyEMRStep.Status.FailureDetails.Reason",
        ),
    ]
    for step_property, expected_path in expected_expressions:
        assert step_property.expr == {"Get": expected_path}
Пример #8
0
def test_transform_step(sagemaker_session):
    """Verify the request and job-name property of a ``TransformStep``.

    The step is created with one dependency, receives a second one through
    ``add_depends_on``, and its ``to_request()`` output is compared with the
    expected dictionary. The expected model name and S3 URI are derived from
    ``MODEL_NAME`` and ``BUCKET`` (the same constants used to build the
    inputs) rather than repeated as literals.
    """
    transform_data_uri = f"s3://{BUCKET}/transform_manifest"
    transformer = Transformer(
        model_name=MODEL_NAME,
        instance_count=1,
        instance_type="c4.4xlarge",
        sagemaker_session=sagemaker_session,
    )
    inputs = TransformInput(data=transform_data_uri)
    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    step = TransformStep(
        name="MyTransformStep",
        depends_on=["TestStep"],
        transformer=transformer,
        display_name="TransformStep",
        description="TestDescription",
        inputs=inputs,
        cache_config=cache_config,
    )
    step.add_depends_on(["SecondTestStep"])
    assert step.to_request() == {
        "Name": "MyTransformStep",
        "Type": "Transform",
        "Description": "TestDescription",
        "DisplayName": "TransformStep",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": {
            # Mirrors the model_name passed to the Transformer above.
            "ModelName": MODEL_NAME,
            "TransformInput": {
                "DataSource": {
                    "S3DataSource": {
                        "S3DataType": "S3Prefix",
                        # Mirrors the data URI passed to TransformInput above.
                        "S3Uri": transform_data_uri,
                    }
                }
            },
            "TransformOutput": {
                "S3OutputPath": None
            },
            "TransformResources": {
                "InstanceCount": 1,
                "InstanceType": "c4.4xlarge",
            },
        },
        "CacheConfig": {
            "Enabled": True,
            "ExpireAfter": "PT1H"
        },
    }
    assert step.properties.TransformJobName.expr == {
        "Get": "Steps.MyTransformStep.TransformJobName"
    }
Пример #9
0
def test_training_step_no_profiler_warning(sagemaker_session):
    """Ensure constructing a ``TrainingStep`` records no warnings in two cases.

    Case 1: profiler disabled on the estimator while a cache config is given.
    Case 2: profiler re-enabled on the same estimator with no cache config.
    """
    dataparallel_distribution = {
        "smdistributed": {"dataparallel": {"enabled": True}}
    }
    tf_estimator = TensorFlow(
        entry_point=DUMMY_SCRIPT_PATH,
        role=ROLE,
        model_dir=False,
        image_uri=IMAGE_URI,
        source_dir="s3://mybucket/source",
        framework_version="2.4.1",
        py_version="py37",
        disable_profiler=True,
        instance_count=1,
        instance_type="ml.p3.16xlarge",
        sagemaker_session=sagemaker_session,
        hyperparameters={"batch-size": 500, "epochs": 5},
        debugger_hook_config=False,
        distribution=dataparallel_distribution,
    )

    train_input = TrainingInput(s3_data=f"s3://{BUCKET}/train_manifest")
    caching = CacheConfig(enable_caching=True, expire_after="PT1H")

    # Case 1: profiler disabled, cache config supplied.
    with warnings.catch_warnings(record=True) as recorded:
        TrainingStep(
            name="MyTrainingStep",
            estimator=tf_estimator,
            inputs=train_input,
            cache_config=caching,
        )
        assert not recorded

    # Case 2: profiler enabled, cache config omitted.
    with warnings.catch_warnings(record=True) as recorded:
        tf_estimator.disable_profiler = False
        TrainingStep(
            name="MyTrainingStep",
            estimator=tf_estimator,
            inputs=train_input,
            cache_config=None,
        )
        assert not recorded
def test_data_quality_check_step(
    sagemaker_session,
    check_job_config,
    model_package_group_name,
    supplied_baseline_statistics_uri,
    supplied_baseline_constraints_uri,
):
    """Compare a data-quality ``QualityCheckStep`` definition with the expected DSL.

    The step is placed in a pipeline and its definition, as returned by
    ``_get_step_definition_for_test``, must equal
    ``_expected_data_quality_dsl``.
    """
    check_config = DataQualityCheckConfig(
        baseline_dataset=ParameterString(name="BaselineDataset"),
        dataset_format=DatasetFormat.csv(header=True),
        output_s3_uri="s3://...",
        record_preprocessor_script="s3://my_bucket/data_quality/preprocessor.py",
        post_analytics_processor_script="s3://my_bucket/data_quality/postprocessor.py",
    )
    quality_step = QualityCheckStep(
        name="DataQualityCheckStep",
        skip_check=False,
        register_new_baseline=False,
        quality_check_config=check_config,
        check_job_config=check_job_config,
        model_package_group_name=model_package_group_name,
        supplied_baseline_statistics=supplied_baseline_statistics_uri,
        supplied_baseline_constraints=supplied_baseline_constraints_uri,
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )

    pipeline_parameters = [
        supplied_baseline_statistics_uri,
        supplied_baseline_constraints_uri,
        model_package_group_name,
    ]
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=pipeline_parameters,
        steps=[quality_step],
        sagemaker_session=sagemaker_session,
    )

    names_for_helper = ["baseline_dataset_input", "quality_check_output"]
    actual_definition = _get_step_definition_for_test(pipeline, names_for_helper)

    assert actual_definition == _expected_data_quality_dsl
def test_data_bias_check_step(sagemaker_session, check_job_config,
                              model_package_group_name, bias_config):
    """Compare a data-bias ``ClarifyCheckStep`` definition with the expected DSL.

    Also checks that the config's ``monitoring_analysis_config_uri`` matches
    the expected pattern under ``_S3_ANALYSIS_CONFIG_OUTPUT_PATH``.
    """
    bias_check_config = DataBiasCheckConfig(
        data_config=DataConfig(
            s3_data_input_path=_S3_INPUT_PATH,
            s3_output_path=_S3_OUTPUT_PATH,
            s3_analysis_config_output_path=_S3_ANALYSIS_CONFIG_OUTPUT_PATH,
            label="fraud",
            dataset_type="text/csv",
        ),
        data_bias_config=bias_config,
        methods="all",
        kms_key="kms_key",
    )
    bias_step = ClarifyCheckStep(
        name="DataBiasCheckStep",
        clarify_check_config=bias_check_config,
        check_job_config=check_job_config,
        skip_check=False,
        register_new_baseline=False,
        model_package_group_name=model_package_group_name,
        supplied_baseline_constraints="supplied_baseline_constraints",
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )
    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[model_package_group_name],
        steps=[bias_step],
        sagemaker_session=sagemaker_session,
    )

    first_step_definition = json.loads(pipeline.definition())["Steps"][0]
    assert first_step_definition == _expected_data_bias_dsl

    # Same pattern as before, written as a single f-string.
    analysis_config_pattern = (
        f"{_S3_ANALYSIS_CONFIG_OUTPUT_PATH}/{_BIAS_MONITORING_CFG_BASE_NAME}-configuration"
        f"/{_BIAS_MONITORING_CFG_BASE_NAME}-config.*/.*/analysis_config.json"
    )
    assert re.match(
        analysis_config_pattern,
        bias_check_config.monitoring_analysis_config_uri,
    )
def test_lambda_step(sagemaker_session):
    """Verify the pipeline definition emitted for a ``LambdaStep``.

    The step takes a literal string, a literal int and a pipeline parameter
    as inputs, declares two outputs, and gets a second dependency through
    ``add_depends_on`` before the pipeline definition is inspected.
    """
    function_arn = "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda"
    int_param = ParameterInteger(name="MyInt")
    declared_outputs = [
        LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String),
        LambdaOutput(output_name="output2", output_type=LambdaOutputTypeEnum.Boolean),
    ]
    caching = CacheConfig(enable_caching=True, expire_after="PT1H")

    lambda_step = LambdaStep(
        name="MyLambdaStep",
        depends_on=["TestStep"],
        lambda_func=Lambda(function_arn=function_arn, session=sagemaker_session),
        display_name="MyLambdaStep",
        description="MyLambdaStepDescription",
        inputs={"arg1": "foo", "arg2": 5, "arg3": int_param},
        outputs=declared_outputs,
        cache_config=caching,
    )
    lambda_step.add_depends_on(["SecondTestStep"])

    pipeline = Pipeline(
        name="MyPipeline",
        parameters=[int_param],
        steps=[lambda_step],
        sagemaker_session=sagemaker_session,
    )
    first_step_definition = json.loads(pipeline.definition())["Steps"][0]

    expected_definition = {
        "Name": "MyLambdaStep",
        "Type": "Lambda",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "DisplayName": "MyLambdaStep",
        "Description": "MyLambdaStepDescription",
        "FunctionArn": "arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda",
        "OutputParameters": [
            {"OutputName": "output1", "OutputType": "String"},
            {"OutputName": "output2", "OutputType": "Boolean"},
        ],
        "Arguments": {"arg1": "foo", "arg2": 5, "arg3": {"Get": "Parameters.MyInt"}},
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }
    assert first_step_definition == expected_definition
def test_one_step_sparkjar_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
    configuration,
    build_jar,
):
    """Create, update and run a one-step pipeline around a ``SparkJarProcessor``.

    The test uploads ``data.jsonl`` to the session's default bucket, derives
    the processing-step arguments from ``get_run_args``, and then creates,
    updates and starts the pipeline. It checks the returned ARNs, the
    ``CacheConfig`` stored in the described pipeline definition, and that the
    execution lists exactly one step named ``sparkjar-process``. The pipeline
    is deleted in a ``finally`` block on a best-effort basis.

    ``build_jar`` is requested as a fixture but never referenced in the body;
    presumably it builds the jar submitted below -- TODO confirm.
    """
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")
    spark_path = os.path.join(DATA_DIR, "spark")

    # The processor uses a literal instance count of 2; the ``InstanceCount``
    # parameter above is only registered on the pipeline itself.
    spark_jar_processor = SparkJarProcessor(
        role=role,
        instance_count=2,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version="2.4",
    )
    bucket = spark_jar_processor.sagemaker_session.default_bucket()
    # Upload the local input file so the job can read it from S3.
    with open(os.path.join(spark_path, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session,
        )
    # The output prefix embeds the current timestamp.
    output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}"

    java_project_dir = os.path.join(spark_path, "code", "java",
                                    "hello-java-spark")
    spark_run_args = spark_jar_processor.get_run_args(
        submit_app=f"{java_project_dir}/hello-spark-java.jar",
        submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp",
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )

    step_pyspark = ProcessingStep(
        name="sparkjar-process",
        processor=spark_jar_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case where the role used for the pipeline
        # execution differs from the role required by the steps in the pipeline.
        # The role in the pipeline definition needs to create training and
        # processing jobs and other SageMaker entities. However, the jobs created
        # in the steps themselves execute under a potentially different role,
        # often requiring access to S3 and other artifacts that are not required
        # while the jobs are being created by the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        # Replace the parameter list (default 1 instead of 2) and update.
        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # The CacheConfig in the described definition must match cache_config.
        response = json.loads(
            pipeline.describe()
            ["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        # Wait for at most three polls; a WaiterError is deliberately ignored
        # and the step listing below is asserted either way.
        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sparkjar-process"
    finally:
        # Best-effort cleanup: a failure to delete must not mask the result.
        try:
            pipeline.delete()
        except Exception:
            pass
def test_one_step_pyspark_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
):
    """Create, update and run a one-step pipeline around a ``PySparkProcessor``.

    The processing-step arguments are derived from ``get_run_args`` for
    ``dummy_script.py``. The test then creates, updates and starts the
    pipeline, checking the returned ARNs, the ``CacheConfig`` stored in the
    described pipeline definition, and that the execution lists exactly one
    step named ``pyspark-process``. The pipeline is deleted in a ``finally``
    block on a best-effort basis.
    """
    # Here the pipeline parameter itself is the processor's instance count.
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")

    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    pyspark_processor = PySparkProcessor(
        base_job_name="sm-spark",
        framework_version="2.4",
        role=role,
        instance_count=instance_count,
        instance_type=cpu_instance_type,
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )

    # Input and output both point at the session's default bucket.
    spark_run_args = pyspark_processor.get_run_args(
        submit_app=script_path,
        arguments=[
            "--s3_input_bucket",
            sagemaker_session.default_bucket(),
            "--s3_input_key_prefix",
            "spark-input",
            "--s3_output_bucket",
            sagemaker_session.default_bucket(),
            "--s3_output_key_prefix",
            "spark-output",
        ],
    )

    step_pyspark = ProcessingStep(
        name="pyspark-process",
        processor=pyspark_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case where the role used for the pipeline
        # execution differs from the role required by the steps in the pipeline.
        # The role in the pipeline definition needs to create training and
        # processing jobs and other SageMaker entities. However, the jobs created
        # in the steps themselves execute under a potentially different role,
        # often requiring access to S3 and other artifacts that are not required
        # while the jobs are being created by the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        # Replace the parameter list (default 1 instead of 2) and update.
        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # The CacheConfig in the described definition must match cache_config.
        response = json.loads(
            pipeline.describe()
            ["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        # Wait for at most three polls; a WaiterError is deliberately ignored
        # and the step listing below is asserted either way.
        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "pyspark-process"
    finally:
        # Best-effort cleanup: a failure to delete must not mask the result.
        try:
            pipeline.delete()
        except Exception:
            pass
def test_one_step_framework_processing_pipeline(
    sagemaker_session,
    role,
    sklearn_latest_version,
    cpu_instance_type,
    pipeline_name,
    region_name,
    athena_dataset_definition,
):
    """Use `SKLearnProcessor` to test `FrameworkProcessor`.

    The processing-step arguments are derived from ``get_run_args`` for
    ``dummy_script.py`` with a local-file input and an Athena dataset
    definition input. The test then creates, updates and starts the pipeline,
    checking the returned ARNs, the ``CacheConfig`` stored in the described
    pipeline definition, and that the execution lists exactly one step named
    ``sklearn-process``. The pipeline is deleted in a ``finally`` block on a
    best-effort basis.
    """
    # Here the pipeline parameter itself is the processor's instance count.
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    # One input from a local file and one from the dataset-definition fixture.
    inputs = [
        ProcessingInput(source=input_file_path,
                        destination="/opt/ml/processing/inputs/"),
        ProcessingInput(dataset_definition=athena_dataset_definition),
    ]

    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role=role,
        instance_type=cpu_instance_type,
        instance_count=instance_count,
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )

    run_args = sklearn_processor.get_run_args(code=script_path, inputs=inputs)

    step_sklearn = ProcessingStep(
        name="sklearn-process",
        processor=sklearn_processor,
        inputs=run_args.inputs,
        outputs=run_args.outputs,
        job_arguments=run_args.arguments,
        code=run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_sklearn],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case where the role used for the pipeline
        # execution differs from the role required by the steps in the pipeline.
        # The role in the pipeline definition needs to create training and
        # processing jobs and other SageMaker entities. However, the jobs created
        # in the steps themselves execute under a potentially different role,
        # often requiring access to S3 and other artifacts that are not required
        # while the jobs are being created by the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        # Replace the parameter list (default 1 instead of 2) and update.
        pipeline.parameters = [
            ParameterInteger(name="InstanceCount", default_value=1)
        ]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # The CacheConfig in the described definition must match cache_config.
        response = json.loads(
            pipeline.describe()
            ["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        # Wait for at most three polls; a WaiterError is deliberately ignored
        # and the step listing below is asserted either way.
        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sklearn-process"
    finally:
        # Best-effort cleanup: a failure to delete must not mask the result.
        try:
            pipeline.delete()
        except Exception:
            pass
Пример #16
0
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="AbalonePackageGroup",
    pipeline_name="AbalonePipeline",
    base_job_prefix="Abalone",
):
    """Build the SageMaker pipeline for the credit model workflow.

    The pipeline preprocesses the input data, trains an XGBoost model,
    evaluates it, creates a baseline with the model-monitor image and
    registers the model when the reported accuracy is at least 0.7.

    NOTE(review): the body reads ``processing_instance_type``,
    ``processing_instance_count``, ``training_instance_type``,
    ``model_approval_status`` and ``input_data`` without defining them.
    Presumably they are module-level pipeline parameters -- confirm.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline. When ``None``,
            the execution role of the session is used.
        default_bucket: the bucket to use for storing the artifacts. When
            ``None``, the default bucket of the session is used.
        model_package_group_name: model package group the trained model is
            registered in.
        pipeline_name: name given to the pipeline.
        base_job_prefix: S3 key prefix under which the baseline output is
            stored.

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)
    if default_bucket is None:
        # Without this fallback the S3 URIs built below would contain the
        # literal text "None" as the bucket name.
        default_bucket = sagemaker_session.default_bucket()

    # Create cache configuration, shared by every cacheable step below
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    # Create SKLearn processor object
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=role,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name="credit-processing-job",
    )

    # Use the sklearn_processor in a SageMaker Pipelines ProcessingStep
    step_preprocess_data = ProcessingStep(
        name="PreprocessCreditData",
        processor=sklearn_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/output/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/output/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/output/test"),
            ProcessingOutput(output_name="baseline_with_headers", source="/opt/ml/processing/output/baseline"),
        ],
        code=os.path.join(BASE_DIR, "preprocessing.py"),
    )

    # Where to store the trained model
    model_path = f"s3://{default_bucket}/CreditTrain"

    # Fetch container to use for training
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.2-2",
        py_version="py3",
        instance_type=training_instance_type,
    )

    # Create XGBoost estimator object
    xgb_estimator = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        disable_profiler=True,
    )

    # Specify hyperparameters
    xgb_estimator.set_hyperparameters(
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.8,
        objective='binary:logistic',
        num_round=25,
    )

    # Use the xgb_estimator in a SageMaker Pipelines TrainingStep.
    # NOTE how the input to the training job directly references the output of the previous step.
    step_train_model = TrainingStep(
        name="TrainCreditModel",
        estimator=xgb_estimator,
        cache_config=cache_config,
        inputs={
            "train": TrainingInput(
                s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # Create ScriptProcessor object.
    evaluate_model_processor = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name="script-credit-eval",
        role=role,
    )

    # Create a PropertyFile
    # We use a PropertyFile to be able to reference outputs from a processing step,
    # for instance to use in a condition step, which we'll see later on.
    # For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html
    evaluation_report = PropertyFile(
        name="EvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )

    # Use the evaluate_model_processor in a SageMaker Pipelines ProcessingStep.
    step_evaluate_model = ProcessingStep(
        name="EvaluateCreditModel",
        processor=evaluate_model_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(
                source=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluation.py"),
        property_files=[evaluation_report],
    )

    # Point the model metrics at the evaluation.json written by the evaluation step
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_evaluate_model.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )

    # Create a RegisterModel step, which registers your model with SageMaker Model Registry.
    step_register_model = RegisterModel(
        name="RegisterCreditModel",
        estimator=xgb_estimator,
        model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # Create Processor object using the model monitor image.
    # The image is looked up for the region the pipeline runs in, in the same
    # way as the training image above, instead of a fixed region.
    baseline_processor = sagemaker.processing.Processor(
        base_job_name="credit-risk-baseline-processor",
        image_uri=sagemaker.image_uris.retrieve(framework='model-monitor', region=region),
        role=role,
        instance_count=1,
        instance_type=processing_instance_type,
        env={
            "dataset_format": "{\"csv\": {\"header\": true} }",
            "dataset_source": "/opt/ml/processing/sm_input",
            "output_path": "/opt/ml/processing/sm_output",
            "publish_cloudwatch_metrics": "Disabled",
        },
    )

    # Create a SageMaker Pipelines step, using the baseline_processor.
    step_create_data_baseline = ProcessingStep(
        name="CreateModelQualityBaseline",
        processor=baseline_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(
                source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "baseline_with_headers"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/sm_input",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/sm_output",
                destination="s3://{}/{}/baseline".format(default_bucket, base_job_prefix),
                output_name="baseline_result",
            )
        ],
    )

    # Create Condition: accuracy read from the evaluation report must be >= 0.7
    cond_gte = ConditionGreaterThanOrEqualTo(
        left=JsonGet(
            step=step_evaluate_model,
            property_file=evaluation_report,
            json_path="binary_classification_metrics.accuracy.value",
        ),
        right=0.7,
    )

    # Create a SageMaker Pipelines ConditionStep, using the condition we just created.
    # The model is registered only when the condition holds; otherwise nothing runs.
    step_cond = ConditionStep(
        name="AccuracyCondition",
        conditions=[cond_gte],
        if_steps=[step_register_model],
        else_steps=[],
    )

    from sagemaker.workflow.pipeline import Pipeline

    # Create a SageMaker Pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_preprocess_data, step_train_model, step_evaluate_model, step_create_data_baseline, step_cond],
    )

    return pipeline
Пример #17
0
def test_training_step_tensorflow(sagemaker_session):
    """Check the request a TrainingStep builds for a TensorFlow estimator.

    Instance type, instance count, data source and hyperparameters are passed
    as pipeline parameters, and the estimator enables the smdistributed
    dataparallel distribution. The serialized step request and the
    ``TrainingJobName`` property expression are compared with expected values.
    """
    type_param = ParameterString(
        name="InstanceType", default_value="ml.p3.16xlarge"
    )
    count_param = ParameterInteger(name="InstanceCount", default_value=1)
    s3_uri_param = ParameterString(
        name="DataSourceS3Uri",
        default_value=f"s3://{BUCKET}/train_manifest",
    )
    epochs_param = ParameterInteger(name="TrainingEpochs", default_value=5)
    batch_size_param = ParameterInteger(
        name="TrainingBatchSize", default_value=500
    )

    # Training using SMDataParallel Distributed Training Framework
    smdistributed_settings = {
        "smdistributed": {"dataparallel": {"enabled": True}}
    }
    tf_estimator = TensorFlow(
        entry_point=os.path.join(DATA_DIR, SCRIPT_FILE),
        role=ROLE,
        model_dir=False,
        image_uri=IMAGE_URI,
        source_dir="s3://mybucket/source",
        framework_version="2.4.1",
        py_version="py37",
        instance_count=count_param,
        instance_type=type_param,
        sagemaker_session=sagemaker_session,
        hyperparameters={
            "batch-size": batch_size_param,
            "epochs": epochs_param,
        },
        debugger_hook_config=False,
        distribution=smdistributed_settings,
    )

    training_step = TrainingStep(
        name="MyTrainingStep",
        estimator=tf_estimator,
        inputs=TrainingInput(s3_data=s3_uri_param),
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )

    actual_request = training_step.to_request()
    actual_arguments = actual_request["Arguments"]
    # Drop the entries that are not part of the comparison below.
    for ignored_key in ("sagemaker_job_name", "sagemaker_program"):
        actual_arguments["HyperParameters"].pop(ignored_key, None)
    actual_arguments.pop("ProfilerRuleConfigurations", None)

    expected_hyperparameters = {
        "batch-size": batch_size_param,
        "epochs": epochs_param,
        "sagemaker_submit_directory": '"s3://mybucket/source"',
        "sagemaker_container_log_level": "20",
        "sagemaker_region": '"us-west-2"',
        "sagemaker_distributed_dataparallel_enabled": "true",
        "sagemaker_instance_type": type_param,
        "sagemaker_distributed_dataparallel_custom_mpi_options": '""',
    }
    expected_channel = {
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": s3_uri_param,
                "S3DataDistributionType": "FullyReplicated",
            }
        },
        "ChannelName": "training",
    }
    expected_arguments = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "fakeimage",
            "EnableSageMakerMetricsTimeSeries": True,
        },
        "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
        "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
        "ResourceConfig": {
            "InstanceCount": count_param,
            "InstanceType": type_param,
            "VolumeSizeInGB": 30,
        },
        "RoleArn": "DummyRole",
        "InputDataConfig": [expected_channel],
        "HyperParameters": expected_hyperparameters,
        "ProfilerConfig": {"S3OutputPath": "s3://my-bucket/"},
    }
    expected_request = {
        "Name": "MyTrainingStep",
        "Type": "Training",
        "Arguments": expected_arguments,
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }

    assert actual_request == expected_request
    assert training_step.properties.TrainingJobName.expr == {
        "Get": "Steps.MyTrainingStep.TrainingJobName"
    }
Пример #18
0
def test_training_step_base_estimator(sagemaker_session):
    """Check the request a TrainingStep builds for a generic Estimator.

    Instance settings, data source and hyperparameters are passed as pipeline
    parameters. The step gets one dependency at construction and a second one
    through ``add_depends_on``; both must appear, in that order, in the
    serialized request.
    """
    type_param = ParameterString(
        name="InstanceType", default_value="c4.4xlarge"
    )
    count_param = ParameterInteger(name="InstanceCount", default_value=1)
    s3_uri_param = ParameterString(
        name="DataSourceS3Uri",
        default_value=f"s3://{BUCKET}/train_manifest",
    )
    epochs_param = ParameterInteger(name="TrainingEpochs", default_value=5)
    batch_size_param = ParameterInteger(
        name="TrainingBatchSize", default_value=500
    )

    base_estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=count_param,
        instance_type=type_param,
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        hyperparameters={
            "batch-size": batch_size_param,
            "epochs": epochs_param,
        },
        rules=[],
        sagemaker_session=sagemaker_session,
    )

    training_step = TrainingStep(
        name="MyTrainingStep",
        depends_on=["TestStep"],
        estimator=base_estimator,
        inputs=TrainingInput(s3_data=s3_uri_param),
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )
    training_step.add_depends_on(["AnotherTestStep"])

    expected_channel = {
        "ChannelName": "training",
        "DataSource": {
            "S3DataSource": {
                "S3DataDistributionType": "FullyReplicated",
                "S3DataType": "S3Prefix",
                "S3Uri": s3_uri_param,
            }
        },
    }
    expected_arguments = {
        "AlgorithmSpecification": {
            "TrainingImage": IMAGE_URI,
            "TrainingInputMode": "File",
        },
        "HyperParameters": {
            "batch-size": batch_size_param,
            "epochs": epochs_param,
        },
        "InputDataConfig": [expected_channel],
        "OutputDataConfig": {"S3OutputPath": f"s3://{BUCKET}/"},
        "ResourceConfig": {
            "InstanceCount": count_param,
            "InstanceType": type_param,
            "VolumeSizeInGB": 30,
        },
        "RoleArn": ROLE,
        "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
        "ProfilerConfig": {
            "ProfilingIntervalInMilliseconds": 500,
            "S3OutputPath": f"s3://{BUCKET}/",
        },
    }
    expected_request = {
        "Name": "MyTrainingStep",
        "Type": "Training",
        "DependsOn": ["TestStep", "AnotherTestStep"],
        "Arguments": expected_arguments,
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }

    assert training_step.to_request() == expected_request
    assert training_step.properties.TrainingJobName.expr == {
        "Get": "Steps.MyTrainingStep.TrainingJobName"
    }
Пример #19
0
def read_conf(cfg_file):
    """Read the workflow configuration from an .ini file.

    Besides parsing the file, this fetches the execution role and the
    account id from AWS Secrets Manager, using the secret ids named in the
    ``secretsmanager`` section and the region of the default boto3 session.

    Args:
        cfg_file: path of the configuration file. It is parsed with
            ``ConfigParser`` using ``ExtendedInterpolation`` and must provide
            the ``metadata``, ``secretsmanager``, ``processing``,
            ``training``, ``summarizing`` and ``evaluation`` sections.

    Returns:
        An ``SGConf`` instance whose attributes hold the plain settings
        (bucket, pipeline name, role, source dir, model package group name),
        the shared ``CacheConfig`` and the pipeline parameters.

    Raises:
        OSError: if ``cfg_file`` cannot be opened.
        KeyError: if a required section is missing from the file.
    """

    # FIXME: refactor! the function is ugly, instead we can set the
    #        three names from one present in the .ini file. That would
    #        likely need to have some snake case to PascalCase and back
    #        conversion hacks
    sg_cfg = SGConf()
    config = ConfigParser(interpolation=ExtendedInterpolation())
    # Close the file as soon as it is parsed instead of leaking the handle.
    with open(cfg_file) as cfg_handle:
        config.read_file(cfg_handle)

    # one session serves both the region lookup and the secrets client
    session = boto3.session.Session()
    region = session.region_name
    sg_cfg.bucket = config['metadata'].get('bucket')
    sg_cfg.pipeline_name = config['metadata'].get('pipeline_name')

    # fetch the execution role and account id from Secrets Manager
    client_secrets_fetch = session.client(service_name='secretsmanager',
                                          region_name=region).get_secret_value

    secret_role = config['secretsmanager'].get('secret_role')
    secret_account_id = config['secretsmanager'].get('secret_account_id')
    sg_cfg.role = client_secrets_fetch(SecretId=secret_role)['SecretString']
    account_id = client_secrets_fetch(
        SecretId=secret_account_id)['SecretString']

    # will reuse the same cache config for all the steps
    sg_cfg.cache_config = CacheConfig(
        enable_caching=config['metadata'].getboolean('cache_steps'),
        expire_after=config['metadata'].get('cache_expire_after'))
    # FIXME: resolve with pathlib!
    sg_cfg.source_dir = config['metadata'].get('source_dir')

    # start defining workflow parameters for sagemaker pipeline steps
    # first off, processing steps
    sg_cfg.input_data = ParameterString(
        name='InputData', default_value=config['processing'].get('input_data'))

    sg_cfg.processing_instance_count = ParameterInteger(
        name='ProcessingInstanceCount',
        default_value=config['processing'].getint('instance_count'))

    sg_cfg.processing_instance_type = ParameterString(
        name='ProcessingInstanceType',
        default_value=config['processing'].get('instance_type'))

    sg_cfg.processing_train_test_split = ParameterFloat(
        name='TrainTestSplit',
        default_value=config['processing'].getfloat(
            'train_test_split_fraction'))

    # the image URI template is filled in with the account id
    sg_cfg.processing_turicreate_uri = ParameterString(
        name='TuriCreateProcessingURI',
        default_value=config['processing'].get('image_uri_fmt').format(
            account_id))

    # control settings for the training job
    sg_cfg.training_instance_count = ParameterInteger(
        name='TrainingInstanceCount',
        default_value=config['training'].getint('instance_count'))

    sg_cfg.training_instance_type = ParameterString(
        name='TrainingInstanceType',
        default_value=config['training'].get('instance_type'))

    sg_cfg.training_batch_size = ParameterInteger(
        name='TrainingBatchSize',
        default_value=config['training'].getint('batch_size'))

    sg_cfg.training_max_iterations = ParameterInteger(
        name='MaxIterations',
        default_value=config['training'].getint('max_iterations'))

    sg_cfg.training_turicreate_uri = ParameterString(
        name='TuriCreateTrainingURI',
        default_value=config['training'].get('image_uri_fmt').format(
            account_id))

    # settings for model card creation
    sg_cfg.summarizing_instance_count = ParameterInteger(
        name='SummarizingInstanceCount',
        default_value=config['summarizing'].getint('instance_count'))

    sg_cfg.summarizing_instance_type = ParameterString(
        name='SummarizingInstanceType',
        default_value=config['summarizing'].get('instance_type'))

    # NOTE(review): this parameter reuses the name 'TuriCreateProcessingURI'
    # of processing_turicreate_uri above, which looks like a copy-paste slip.
    # Left unchanged because renaming alters the pipeline definition --
    # confirm against the pipeline that consumes it.
    sg_cfg.summarizing_turicreate_uri = ParameterString(
        name='TuriCreateProcessingURI',
        default_value=config['summarizing'].get('image_uri_fmt').format(
            account_id))

    # workflow parameters for model approval / rejection
    sg_cfg.model_approval_status = ParameterString(
        name='ModelApprovalStatus',
        default_value=config['evaluation'].get('approval_status'))

    sg_cfg.model_approval_map_threshold = ParameterFloat(
        name='ModelApprovalmAPThreshold',
        default_value=config['evaluation'].getfloat('approval_map_threshold'))

    sg_cfg.model_package_group_name = config['metadata'].get(
        'model_package_group_name')
    return sg_cfg
Пример #20
0
def get_pipeline(
    region,
    sagemaker_project_arn=None,
    role=None,
    default_bucket=None,
    model_package_group_name="restatePackageGroup",  # Choose any name
    pipeline_name="restate-p-XXXXXXXXX",  # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
    base_job_prefix="restate",  # Choose any name
):
    """Gets a SageMaker ML Pipeline instance working with on RE data.
    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts
    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.2xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus",
        default_value="PendingManualApproval",  # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval.
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=f"",  # Change this to point to the s3 location of your raw input data.
    )

    data_sources = []
    # Sagemaker session
    sess = sagemaker_session

    # You can configure this with your own bucket name, e.g.
    # bucket = "my-bucket"
    bucket = sess.default_bucket()

    data_sources.append(
        ProcessingInput(
            input_name="restate-california",
            dataset_definition=DatasetDefinition(
                local_path="/opt/ml/processing/restate-california",
                data_distribution_type="FullyReplicated",
                # You can override below to point to other database or use different queries
                athena_dataset_definition=AthenaDatasetDefinition(
                    catalog="AwsDataCatalog",
                    database="restate",
                    query_string="SELECT * FROM restate.california_10",
                    output_s3_uri=f"s3://{bucket}/athena/",
                    output_format="PARQUET",
                ),
            ),
        )
    )

    print(f"Data Wrangler export storage bucket: {bucket}")

    # unique flow export ID
    flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}"
    flow_export_name = f"flow-{flow_export_id}"

    # Output name is auto-generated from the select node's ID + output name from the flow file.
    output_name = "99ae1ec3-dd5f-453c-bfae-721dac423cd7.default"

    s3_output_prefix = f"export-{flow_export_name}/output"
    s3_output_path = f"s3://{bucket}/{s3_output_prefix}"
    print(f"Flow S3 export result path: {s3_output_path}")

    processing_job_output = ProcessingOutput(
        output_name=output_name,
        source="/opt/ml/processing/output",
        destination=s3_output_path,
        s3_upload_mode="EndOfJob",
    )

    # name of the flow file which should exist in the current notebook working directory
    flow_file_name = "sagemaker-pipeline/restate-athena-california.flow"

    # Load .flow file from current notebook working directory
    #!echo "Loading flow file from current notebook working directory: $PWD"

    with open(flow_file_name) as f:
        flow = json.load(f)

    # Upload flow to S3
    s3_client = boto3.client("s3")
    s3_client.upload_file(
        flow_file_name,
        bucket,
        f"data_wrangler_flows/{flow_export_name}.flow",
        ExtraArgs={"ServerSideEncryption": "aws:kms"},
    )

    flow_s3_uri = f"s3://{bucket}/data_wrangler_flows/{flow_export_name}.flow"

    print(f"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}")

    ## Input - Flow: restate-athena-russia.flow
    flow_input = ProcessingInput(
        source=flow_s3_uri,
        destination="/opt/ml/processing/flow",
        input_name="flow",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )

    # IAM role for executing the processing job.
    iam_role = role

    # Unique processing job name. Give a unique name every time you re-execute processing jobs
    processing_job_name = f"data-wrangler-flow-processing-{flow_export_id}"

    # Data Wrangler Container URL.
    container_uri = sagemaker.image_uris.retrieve(
        framework="data-wrangler",  # we are using the Sagemaker built in xgboost algorithm
        region=region,
    )

    # Processing Job Instance count and instance type.
    instance_count = 2
    instance_type = "ml.m5.4xlarge"

    # Size in GB of the EBS volume to use for storing data during processing
    volume_size_in_gb = 30

    # Content type for each output. Data Wrangler supports CSV as default and Parquet.
    output_content_type = "CSV"

    # Network Isolation mode; default is off
    enable_network_isolation = False

    # List of tags to be passed to the processing job
    user_tags = []

    # Output configuration used as processing job container arguments
    output_config = {output_name: {"content_type": output_content_type}}

    # KMS key for per object encryption; default is None
    kms_key = None

    processor = Processor(
        role=iam_role,
        image_uri=container_uri,
        instance_count=instance_count,
        instance_type=instance_type,
        volume_size_in_gb=volume_size_in_gb,
        network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),
        sagemaker_session=sess,
        output_kms_key=kms_key,
        tags=user_tags,
    )

    data_wrangler_step = ProcessingStep(
        name="DataWranglerProcess",
        processor=processor,
        inputs=[flow_input] + data_sources,
        outputs=[processing_job_output],
        job_arguments=[f"--output-config '{json.dumps(output_config)}'"],
    )

    # Processing step for feature engineering
    # this processor does not have awswrangler installed
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-restate-preprocess",  # choose any name
        sagemaker_session=sagemaker_session,
        role=role,
    )

    step_process = ProcessingStep(
        name="Preprocess",  # choose any name
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(
                source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[
                    output_name
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/data/raw-data-dir",
            )
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=[
            "--input-data",
            data_wrangler_step.properties.ProcessingOutputConfig.Outputs[
                output_name
            ].S3Output.S3Uri,
        ],
    )

    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain"
    model_bucket_key = f"{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain"
    cache_config = CacheConfig(enable_caching=True, expire_after="30d")

    xgb_image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",  # we are using the Sagemaker built in xgboost algorithm
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=xgb_image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/restate-xgb-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        #    #objective="binary:logistic",
        #    objective="reg:linear",
        num_round=50,
        #    max_depth=5,
        #    eta=0.2,
        #    gamma=4,
        #    min_child_weight=6,
        #    subsample=0.7,
        #    silent=0,
    )

    xgb_train.set_hyperparameters(grow_policy="lossguide")

    xgb_objective_metric_name = "validation:mse"
    xgb_hyperparameter_ranges = {
        "max_depth": IntegerParameter(2, 10, scaling_type="Linear"),
    }

    xgb_tuner_log = HyperparameterTuner(
        xgb_train,
        xgb_objective_metric_name,
        xgb_hyperparameter_ranges,
        max_jobs=3,
        max_parallel_jobs=3,
        strategy="Random",
        objective_type="Minimize",
    )

    xgb_step_tuning = TuningStep(
        name="XGBHPTune",
        tuner=xgb_tuner_log,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
        cache_config=cache_config,
    )

    # dtree_image_uri = '625467769535.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-decision-tree:latest'
    dtree_image_uri = sagemaker_session.sagemaker_client.describe_image_version(
        ImageName="restate-dtree"
    )["ContainerImage"]

    dtree_train = Estimator(
        image_uri=dtree_image_uri,
        role=role,
        instance_count=1,
        instance_type=training_instance_type,
        base_job_name=f"{base_job_prefix}/restate-dtree-train",
        output_path=model_path,
        sagemaker_session=sagemaker_session,
    )

    dtree_objective_metric_name = "validation:mse"
    dtree_metric_definitions = [{"Name": "validation:mse", "Regex": "mse:(\S+)"}]

    dtree_hyperparameter_ranges = {
        "max_depth": IntegerParameter(10, 50, scaling_type="Linear"),
        "max_leaf_nodes": IntegerParameter(2, 12, scaling_type="Linear"),
    }

    dtree_tuner_log = HyperparameterTuner(
        dtree_train,
        dtree_objective_metric_name,
        dtree_hyperparameter_ranges,
        dtree_metric_definitions,
        max_jobs=3,
        max_parallel_jobs=3,
        strategy="Random",
        objective_type="Minimize",
    )

    # Pipeline step that runs the decision-tree tuner on the "train" and
    # "validation" outputs of the processing step, both read as CSV.
    # NOTE(review): the first channel is keyed "training" here, whereas the
    # preceding step's inputs key the same data as "train" — presumably this
    # is the channel name the restate-dtree container expects; confirm.
    dtree_step_tuning = TuningStep(
        name="DTreeHPTune",
        tuner=dtree_tuner_log,
        inputs={
            "training": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
        cache_config=cache_config,
    )

    # Processor that runs the decision-tree evaluation script with python3
    # inside the same container image that was used for training.
    dtree_script_eval = ScriptProcessor(
        image_uri=dtree_image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-dtree-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    # Property file describing dtree_evaluation.json within the
    # "dtree_evaluation" output, so that JsonGet expressions can read values
    # from the report.
    dtree_evaluation_report = PropertyFile(
        name="EvaluationReportDTree",
        output_name="dtree_evaluation",
        path="dtree_evaluation.json",
    )

    # Evaluation step: mounts a model produced by the tuning step and the
    # "test" split from the processing step, then runs dtree_evaluate.py.
    dtree_step_eval = ProcessingStep(
        name="DTreeEval",
        processor=dtree_script_eval,
        inputs=[
            ProcessingInput(
                # The model comes from the tuning step rather than from a
                # single training step.
                # NOTE(review): top_k=0 presumably selects the best-ranked
                # training job — confirm against the TuningStep docs.
                source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            # output_name must match the PropertyFile's output_name above.
            ProcessingOutput(
                output_name="dtree_evaluation", source="/opt/ml/processing/evaluation"
            ),
        ],
        code=os.path.join(BASE_DIR, "dtree_evaluate.py"),
        property_files=[dtree_evaluation_report],
    )

    # Processor that runs the XGBoost evaluation script with python3 inside
    # the XGBoost container image.
    xgb_script_eval = ScriptProcessor(
        image_uri=xgb_image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-xgb-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    # Property file describing xgb_evaluation.json within the
    # "xgb_evaluation" output, so that JsonGet expressions can read values
    # from the report.
    xgb_evaluation_report = PropertyFile(
        name="EvaluationReportXGBoost",
        output_name="xgb_evaluation",
        path="xgb_evaluation.json",
    )

    # Evaluation step: mounts a model produced by the XGBoost tuning step and
    # the "test" split from the processing step, then runs xgb_evaluate.py.
    xgb_step_eval = ProcessingStep(
        name="XGBEval",
        processor=xgb_script_eval,
        inputs=[
            ProcessingInput(
                # NOTE(review): top_k=0 presumably selects the best-ranked
                # training job — confirm against the TuningStep docs.
                source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            # output_name must match the PropertyFile's output_name above.
            ProcessingOutput(output_name="xgb_evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "xgb_evaluate.py"),
        property_files=[xgb_evaluation_report],
    )

    # Model-quality metrics for each candidate model. Each one points at the
    # JSON report under the S3 URI of the first output declared on that
    # model's evaluation step.
    xgb_eval_outputs = xgb_step_eval.arguments["ProcessingOutputConfig"]["Outputs"]
    xgb_report_uri = "{}/xgb_evaluation.json".format(xgb_eval_outputs[0]["S3Output"]["S3Uri"])
    xgb_statistics = MetricsSource(s3_uri=xgb_report_uri, content_type="application/json")
    xgb_model_metrics = ModelMetrics(model_statistics=xgb_statistics)

    dtree_eval_outputs = dtree_step_eval.arguments["ProcessingOutputConfig"]["Outputs"]
    dtree_report_uri = "{}/dtree_evaluation.json".format(
        dtree_eval_outputs[0]["S3Output"]["S3Uri"]
    )
    dtree_statistics = MetricsSource(s3_uri=dtree_report_uri, content_type="application/json")
    dtree_model_metrics = ModelMetrics(model_statistics=dtree_statistics)

    # JsonGet expression addressing `regression_metrics.r2s.value` in the
    # XGBoost evaluation report produced by the XGBEval step.
    xgb_eval_metrics = JsonGet(
        step=xgb_step_eval,
        property_file=xgb_evaluation_report,
        json_path="regression_metrics.r2s.value",  # Must follow the structure of the report_dict written by the evaluation script.
    )

    # JsonGet expression addressing the same path in the decision-tree
    # evaluation report produced by the DTreeEval step.
    dtree_eval_metrics = JsonGet(
        step=dtree_step_eval,
        property_file=dtree_evaluation_report,
        json_path="regression_metrics.r2s.value",  # Must follow the structure of the report_dict written by the evaluation script.
    )

    # Registration step for the decision-tree model. It is not listed among
    # the pipeline's top-level steps; it is only reached through the
    # condition step's `if_steps` branch. The registered artifact comes from
    # the decision-tree tuning step and carries the decision-tree
    # evaluation metrics.
    dtree_step_register = RegisterModel(
        name="DTreeReg",
        estimator=dtree_train,
        model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=dtree_model_metrics,
    )

    # Registration step for the XGBoost model, reached only through the
    # condition step's `else_steps` branch. It mirrors the decision-tree
    # registration, but uses the XGBoost estimator, tuning step and metrics.
    xgb_step_register = RegisterModel(
        name="XGBReg",
        estimator=xgb_train,
        model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=xgb_model_metrics,
    )

    # Condition step for evaluating model quality and branching execution.
    # The two models are compared against each other, not against a fixed
    # threshold: the decision tree is registered when its reported
    # `regression_metrics.r2s.value` is greater than or equal to XGBoost's,
    # otherwise the XGBoost model is registered.
    # The JsonGet expressions built earlier in this function are reused here
    # rather than constructing identical ones a second time.
    cond_gte = ConditionGreaterThanOrEqualTo(  # You can change the condition here
        left=dtree_eval_metrics,
        right=xgb_eval_metrics,
    )

    step_cond = ConditionStep(
        name="AccuracyCond",
        conditions=[cond_gte],
        if_steps=[dtree_step_register],
        else_steps=[xgb_step_register],
    )
    create_date = time.strftime("%Y-%m-%d-%H-%M-%S")

    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data
        ],
        pipeline_experiment_config=PipelineExperimentConfig(
            pipeline_name + "-" + create_date, "restate-{}".format(create_date)
        ),
        steps=[
            data_wrangler_step,
            step_process,
            dtree_step_tuning,
            xgb_step_tuning,
            dtree_step_eval,
            xgb_step_eval,
            step_cond,
        ],
        sagemaker_session=sagemaker_session,
    )
    return pipeline