Example #1: rejecting an exception type StepRetryPolicy does not support
def test_invalid_step_retry_policy():
    # StepRetryPolicy accepts only StepExceptionTypeEnum members, so passing a
    # SageMakerJobExceptionTypeEnum value is expected to raise.
    with pytest.raises(Exception):
        StepRetryPolicy(
            exception_types=[SageMakerJobExceptionTypeEnum.INTERNAL_ERROR],
            interval_seconds=5,
            max_attempts=3,
        )
Example #2: serializing StepRetryPolicy with to_request()
def test_valid_step_retry_policy():
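    # No backoff_rate is passed here, so the default of 2.0 still appears in the request.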
    retry_policy = StepRetryPolicy(
        exception_types=[
            StepExceptionTypeEnum.SERVICE_FAULT,
            StepExceptionTypeEnum.THROTTLING
        ],
        interval_seconds=5,
        max_attempts=3,
    )
    assert retry_policy.to_request() == {
        "ExceptionType": ["Step.SERVICE_FAULT", "Step.THROTTLING"],
        "IntervalSeconds": 5,
        "BackoffRate": 2.0,
        "MaxAttempts": 3,
    }

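    # With expire_after_mins instead of max_attempts, the request carries ExpireAfterMin.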
    retry_policy = StepRetryPolicy(
        exception_types=[
            StepExceptionTypeEnum.SERVICE_FAULT,
            StepExceptionTypeEnum.THROTTLING
        ],
        interval_seconds=5,
        backoff_rate=2.0,
        expire_after_mins=30,
    )
    assert retry_policy.to_request() == {
        "ExceptionType": ["Step.SERVICE_FAULT", "Step.THROTTLING"],
        "IntervalSeconds": 5,
        "BackoffRate": 2.0,
        "ExpireAfterMin": 30,
    }
Example #3: attaching retry policies to steps, step collections, and pipelines
def test_custom_step_with_retry_policy():
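    # A single step can carry both a step-level StepRetryPolicy and a job-level
    # SageMakerJobStepRetryPolicy.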
    step = CustomStep(
        name="MyStep",
        retry_policies=[
            StepRetryPolicy(
                exception_types=[
                    StepExceptionTypeEnum.SERVICE_FAULT,
                    StepExceptionTypeEnum.THROTTLING,
                ],
                expire_after_mins=1,
            ),
            SageMakerJobStepRetryPolicy(
                exception_types=[SageMakerJobExceptionTypeEnum.CAPACITY_ERROR],
                max_attempts=3,
            ),
        ],
    )
    assert step.to_request() == {
        "Name":
        "MyStep",
        "Type":
        "Training",
        "RetryPolicies": [
            {
                "ExceptionType": ["Step.SERVICE_FAULT", "Step.THROTTLING"],
                "IntervalSeconds": 1,
                "BackoffRate": 2.0,
                "ExpireAfterMin": 1,
            },
            {
                "ExceptionType": ["SageMaker.CAPACITY_ERROR"],
                "IntervalSeconds": 1,
                "BackoffRate": 2.0,
                "MaxAttempts": 3,
            },
        ],
        "Arguments":
        dict(),
    }

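    # add_retry_policy appends to the step's existing retry policies.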
    step.add_retry_policy(
        SageMakerJobStepRetryPolicy(
            exception_types=[SageMakerJobExceptionTypeEnum.INTERNAL_ERROR],
            interval_seconds=5,
            backoff_rate=2.0,
            expire_after_mins=5,
        ))
    assert step.to_request() == {
        "Name":
        "MyStep",
        "Type":
        "Training",
        "RetryPolicies": [
            {
                "ExceptionType": ["Step.SERVICE_FAULT", "Step.THROTTLING"],
                "IntervalSeconds": 1,
                "BackoffRate": 2.0,
                "ExpireAfterMin": 1,
            },
            {
                "ExceptionType": ["SageMaker.CAPACITY_ERROR"],
                "IntervalSeconds": 1,
                "BackoffRate": 2.0,
                "MaxAttempts": 3,
            },
            {
                "ExceptionType": ["SageMaker.JOB_INTERNAL_ERROR"],
                "IntervalSeconds": 5,
                "BackoffRate": 2.0,
                "ExpireAfterMin": 5,
            },
        ],
        "Arguments":
        dict(),
    }

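    # A step created without retry policies emits no "RetryPolicies" key at all.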
    step = CustomStep(name="MyStep")
    assert step.to_request() == {
        "Name": "MyStep",
        "Type": "Training",
        "Arguments": dict(),
    }


def test_pipeline_execution_processing_step_with_retry(
    sagemaker_session,
    smclient,
    role,
    sklearn_latest_version,
    cpu_instance_type,
    pipeline_name,
    athena_dataset_definition,
):
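    # Attach both a step-level and a SageMaker-job-level retry policy to a
    # ProcessingStep and exercise them in a real pipeline execution.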
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")
    inputs = [
        ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/"),
        ProcessingInput(dataset_definition=athena_dataset_definition),
    ]

    sklearn_processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role=role,
        instance_type=cpu_instance_type,
        instance_count=instance_count,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )

    step_sklearn = ProcessingStep(
        name="sklearn-process",
        processor=sklearn_processor,
        inputs=inputs,
        code=script_path,
        retry_policies=[
            StepRetryPolicy(
                exception_types=[
                    StepExceptionTypeEnum.SERVICE_FAULT,
                    StepExceptionTypeEnum.THROTTLING,
                ],
                backoff_rate=2.0,
                interval_seconds=30,
                expire_after_mins=5,
            ),
            SageMakerJobStepRetryPolicy(
                exception_types=[SageMakerJobExceptionTypeEnum.CAPACITY_ERROR], max_attempts=10
            ),
        ],
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_sklearn],
        sagemaker_session=sagemaker_session,
    )

    try:
        pipeline.create(role)
        execution = pipeline.start(parameters={})

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sklearn-process"
        # assert execution_steps[0]["AttemptCount"] >= 1
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass


def test_model_registration_with_model_repack(
    sagemaker_session,
    role,
    pipeline_name,
    region_name,
):
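    # RegisterModel with an entry_point expands into a repack (Training) step plus a
    # RegisterModel step; each sub-step accepts its own retry policy list.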
    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    entry_point = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1)

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
    )
    step_train = TrainingStep(
        name="pytorch-train",
        estimator=pytorch_estimator,
        inputs=inputs,
        retry_policies=[
            StepRetryPolicy(exception_types=[StepExceptionTypeEnum.THROTTLING], max_attempts=3)
        ],
    )

    step_register = RegisterModel(
        name="pytorch-register-model",
        estimator=pytorch_estimator,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        description="test-description",
        entry_point=entry_point,
        register_model_step_retry_policies=[
            StepRetryPolicy(exception_types=[StepExceptionTypeEnum.THROTTLING], max_attempts=3)
        ],
        repack_model_step_retry_policies=[
            StepRetryPolicy(exception_types=[StepExceptionTypeEnum.THROTTLING], max_attempts=3)
        ],
    )

    model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_model = CreateModelStep(
        name="pytorch-model",
        model=model,
        inputs=model_inputs,
    )

    step_cond = ConditionStep(
        name="cond-good-enough",
        conditions=[ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1)],
        if_steps=[step_train, step_register],
        else_steps=[step_model],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[good_enough_input, instance_count, instance_type],
        steps=[step_cond],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        execution = pipeline.start(parameters={"GoodEnoughInput": 0})
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass


def test_estimator_transformer(estimator):
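    # Without an entry_point, EstimatorTransformer expands into just a CreateModel step
    # and a Transform step, each carrying the retry policy.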
    model_data = f"s3://{BUCKET}/model.tar.gz"
    model_inputs = CreateModelInput(
        instance_type="c4.4xlarge",
        accelerator_type="ml.eia1.medium",
    )
    service_fault_retry_policy = StepRetryPolicy(
        exception_types=[StepExceptionTypeEnum.SERVICE_FAULT], max_attempts=10)
    transform_inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
    estimator_transformer = EstimatorTransformer(
        name="EstimatorTransformerStep",
        estimator=estimator,
        model_data=model_data,
        model_inputs=model_inputs,
        instance_count=1,
        instance_type="ml.c4.4xlarge",
        transform_inputs=transform_inputs,
        depends_on=["TestStep"],
        model_step_retry_policies=[service_fault_retry_policy],
        transform_step_retry_policies=[service_fault_retry_policy],
        repack_model_step_retry_policies=[service_fault_retry_policy],
    )
    request_dicts = estimator_transformer.request_dicts()
    assert len(request_dicts) == 2

    for request_dict in request_dicts:
        if request_dict["Type"] == "Model":
            assert request_dict == {
                "Name": "EstimatorTransformerStepCreateModelStep",
                "Type": "Model",
                "DependsOn": ["TestStep"],
                "RetryPolicies": [service_fault_retry_policy.to_request()],
                "Arguments": {
                    "ExecutionRoleArn": "DummyRole",
                    "PrimaryContainer": {
                        "Environment": {},
                        "Image": "fakeimage",
                        "ModelDataUrl": "s3://my-bucket/model.tar.gz",
                    },
                },
            }
        elif request_dict["Type"] == "Transform":
            assert request_dict["Name"] == "EstimatorTransformerStepTransformStep"
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            assert isinstance(arguments["ModelName"], Properties)
            arguments.pop("ModelName")
            assert "DependsOn" not in request_dict
            assert arguments == {
                "TransformInput": {
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": f"s3://{BUCKET}/transform_manifest",
                        }
                    }
                },
                "TransformOutput": {
                    "S3OutputPath": None
                },
                "TransformResources": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.c4.4xlarge"
                },
            }
        else:
            raise Exception("A step exists in the collection of an invalid type.")


def test_register_model_with_model_repack_with_pipeline_model(
        pipeline_model, model_metrics, drift_check_baselines):
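    # The pipeline model bundles an inference script, so RegisterModel expands into a
    # repack (Training) step followed by the RegisterModel step.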
    model_data = f"s3://{BUCKET}/model.tar.gz"
    service_fault_retry_policy = StepRetryPolicy(
        exception_types=[StepExceptionTypeEnum.SERVICE_FAULT], max_attempts=10)
    register_model = RegisterModel(
        name="RegisterModelStep",
        model=pipeline_model,
        model_data=model_data,
        content_types=["content_type"],
        response_types=["response_type"],
        inference_instances=["inference_instance"],
        transform_instances=["transform_instance"],
        model_package_group_name="mpg",
        model_metrics=model_metrics,
        drift_check_baselines=drift_check_baselines,
        approval_status="Approved",
        description="description",
        depends_on=["TestStep"],
        repack_model_step_retry_policies=[service_fault_retry_policy],
        register_model_step_retry_policies=[service_fault_retry_policy],
        tags=[{"Key": "myKey", "Value": "myValue"}],
    )

    request_dicts = register_model.request_dicts()
    assert len(request_dicts) == 2

    for request_dict in request_dicts:
        if request_dict["Type"] == "Training":
            assert request_dict["Name"] == "modelNameRepackModel"
            assert len(request_dict["DependsOn"]) == 1
            assert request_dict["DependsOn"][0] == "TestStep"
            arguments = request_dict["Arguments"]
            repacker_job_name = arguments["HyperParameters"]["sagemaker_job_name"]
            assert ordered(arguments) == ordered({
                "AlgorithmSpecification": {
                    "TrainingImage": MODEL_REPACKING_IMAGE_URI,
                    "TrainingInputMode": "File",
                },
                "DebugHookConfig": {
                    "CollectionConfigurations": [],
                    "S3OutputPath": f"s3://{BUCKET}/",
                },
                "HyperParameters": {
                    "dependencies":
                    "null",
                    "inference_script":
                    '"dummy_script.py"',
                    "model_archive":
                    '"model.tar.gz"',
                    "sagemaker_submit_directory":
                    '"s3://{}/{}/source/sourcedir.tar.gz"'.format(
                        BUCKET, repacker_job_name.replace('"', "")),
                    "sagemaker_program":
                    '"_repack_model.py"',
                    "sagemaker_container_log_level":
                    "20",
                    "sagemaker_job_name":
                    repacker_job_name,
                    "sagemaker_region":
                    f'"{REGION}"',
                    "source_dir":
                    "null",
                },
                "InputDataConfig": [{
                    "ChannelName": "training",
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataDistributionType": "FullyReplicated",
                            "S3DataType": "S3Prefix",
                            "S3Uri": f"s3://{BUCKET}",
                        }
                    },
                }],
                "OutputDataConfig": {
                    "S3OutputPath": f"s3://{BUCKET}/"
                },
                "ResourceConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.m5.large",
                    "VolumeSizeInGB": 30,
                },
                "RoleArn":
                ROLE,
                "StoppingCondition": {
                    "MaxRuntimeInSeconds": 86400
                },
                "Tags": [{
                    "Key": "myKey",
                    "Value": "myValue"
                }],
                "VpcConfig": [
                    ("SecurityGroupIds", ["123", "456"]),
                    ("Subnets", ["abc", "def"]),
                ],
            })
        elif request_dict["Type"] == "RegisterModel":
            assert request_dict["Name"] == "RegisterModelStep"
            assert "DependsOn" not in request_dict
            arguments = request_dict["Arguments"]
            assert len(arguments["InferenceSpecification"]["Containers"]) == 1
            assert (
                arguments["InferenceSpecification"]["Containers"][0]["Image"]
                == pipeline_model.models[0].image_uri
            )
            assert isinstance(
                arguments["InferenceSpecification"]["Containers"][0]["ModelDataUrl"],
                Properties,
            )
            del arguments["InferenceSpecification"]["Containers"]
            assert ordered(arguments) == ordered({
                "InferenceSpecification": {
                    "SupportedContentTypes": ["content_type"],
                    "SupportedRealtimeInferenceInstanceTypes":
                    ["inference_instance"],
                    "SupportedResponseMIMETypes": ["response_type"],
                    "SupportedTransformInstanceTypes": ["transform_instance"],
                },
                "ModelApprovalStatus":
                "Approved",
                "ModelMetrics": {
                    "Bias": {},
                    "Explainability": {},
                    "ModelQuality": {
                        "Statistics": {
                            "ContentType": "text/csv",
                            "S3Uri": f"s3://{BUCKET}/metrics.csv",
                        },
                    },
                },
                "DriftCheckBaselines": {
                    "ModelQuality": {
                        "Constraints": {
                            "ContentType": "text/csv",
                            "S3Uri": f"s3://{BUCKET}/constraints_metrics.csv",
                        }
                    }
                },
                "ModelPackageDescription":
                "description",
                "ModelPackageGroupName":
                "mpg",
                "Tags": [{
                    "Key": "myKey",
                    "Value": "myValue"
                }],
            })
        else:
            raise Exception("A step exists in the collection of an invalid type.")


def test_estimator_transformer_with_model_repack_with_estimator(estimator):
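    # With an entry_point, EstimatorTransformer adds a repack (Training) step ahead of
    # the CreateModel and Transform steps; all three carry the retry policy.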
    model_data = f"s3://{BUCKET}/model.tar.gz"
    model_inputs = CreateModelInput(
        instance_type="c4.4xlarge",
        accelerator_type="ml.eia1.medium",
    )
    service_fault_retry_policy = StepRetryPolicy(
        exception_types=[StepExceptionTypeEnum.SERVICE_FAULT], max_attempts=10
    )
    transform_inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
    estimator_transformer = EstimatorTransformer(
        name="EstimatorTransformerStep",
        estimator=estimator,
        model_data=model_data,
        model_inputs=model_inputs,
        instance_count=1,
        instance_type="ml.c4.4xlarge",
        transform_inputs=transform_inputs,
        depends_on=["TestStep"],
        model_step_retry_policies=[service_fault_retry_policy],
        transform_step_retry_policies=[service_fault_retry_policy],
        repack_model_step_retry_policies=[service_fault_retry_policy],
        entry_point=f"{DATA_DIR}/dummy_script.py",
    )
    request_dicts = estimator_transformer.request_dicts()
    assert len(request_dicts) == 3

    for request_dict in request_dicts:
        if request_dict["Type"] == "Training":
            assert request_dict["Name"] == "EstimatorTransformerStepRepackModel"
            assert request_dict["DependsOn"] == ["TestStep"]
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            # pop out the dynamic generated fields
            arguments["HyperParameters"].pop("sagemaker_submit_directory")
            assert arguments == {
                "AlgorithmSpecification": {
                    "TrainingInputMode": "File",
                    "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
                    + "sagemaker-scikit-learn:0.23-1-cpu-py3",
                },
                "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
                "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
                "ResourceConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.m5.large",
                    "VolumeSizeInGB": 30,
                },
                "RoleArn": "DummyRole",
                "InputDataConfig": [
                    {
                        "DataSource": {
                            "S3DataSource": {
                                "S3DataType": "S3Prefix",
                                "S3Uri": "s3://my-bucket/model.tar.gz",
                                "S3DataDistributionType": "FullyReplicated",
                            }
                        },
                        "ChannelName": "training",
                    }
                ],
                "HyperParameters": {
                    "inference_script": '"dummy_script.py"',
                    "model_archive": '"s3://my-bucket/model.tar.gz"',
                    "dependencies": "null",
                    "source_dir": "null",
                    "sagemaker_program": '"_repack_model.py"',
                    "sagemaker_container_log_level": "20",
                    "sagemaker_region": '"us-west-2"',
                },
                "VpcConfig": {"Subnets": ["abc", "def"], "SecurityGroupIds": ["123", "456"]},
                "DebugHookConfig": {
                    "S3OutputPath": "s3://my-bucket/",
                    "CollectionConfigurations": [],
                },
            }
        elif request_dict["Type"] == "Model":
            assert request_dict["Name"] == "EstimatorTransformerStepCreateModelStep"
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            assert isinstance(arguments["PrimaryContainer"]["ModelDataUrl"], Properties)
            arguments["PrimaryContainer"].pop("ModelDataUrl")
            assert "DependsOn" not in request_dict
            assert arguments == {
                "ExecutionRoleArn": "DummyRole",
                "PrimaryContainer": {
                    "Environment": {},
                    "Image": "fakeimage",
                },
            }
        elif request_dict["Type"] == "Transform":
            assert request_dict["Name"] == "EstimatorTransformerStepTransformStep"
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            assert isinstance(arguments["ModelName"], Properties)
            arguments.pop("ModelName")
            assert "DependsOn" not in request_dict
            assert arguments == {
                "TransformInput": {
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": f"s3://{BUCKET}/transform_manifest",
                        }
                    }
                },
                "TransformOutput": {"S3OutputPath": None},
                "TransformResources": {"InstanceCount": 1, "InstanceType": "ml.c4.4xlarge"},
            }
        else:
            raise Exception("A step exists in the collection of an invalid type.")