def test_invalid_step_retry_policy():
    """StepRetryPolicy must reject SageMaker-job exception types.

    Bug fix: the original asserted ``False`` *inside* the ``try`` block, so
    the resulting ``AssertionError`` was swallowed by the broad ``except``
    and the test passed even when the constructor raised nothing. The
    failure path now lives outside the ``try``.
    """
    try:
        StepRetryPolicy(
            exception_types=[SageMakerJobExceptionTypeEnum.INTERNAL_ERROR],
            interval_seconds=5,
            max_attempts=3,
        )
    except Exception:
        return  # expected: constructor rejects non-Step exception types
    raise AssertionError(
        "StepRetryPolicy accepted a SageMakerJobExceptionTypeEnum value"
    )
def test_valid_step_retry_policy():
    """StepRetryPolicy serializes its fields into the expected request dict."""
    # Variant 1: max_attempts set; BackoffRate falls back to its 2.0 default.
    policy = StepRetryPolicy(
        exception_types=[
            StepExceptionTypeEnum.SERVICE_FAULT,
            StepExceptionTypeEnum.THROTTLING,
        ],
        interval_seconds=5,
        max_attempts=3,
    )
    assert policy.to_request() == {
        "ExceptionType": ["Step.SERVICE_FAULT", "Step.THROTTLING"],
        "IntervalSeconds": 5,
        "BackoffRate": 2.0,
        "MaxAttempts": 3,
    }

    # Variant 2: expire_after_mins set with an explicit backoff rate.
    policy = StepRetryPolicy(
        exception_types=[
            StepExceptionTypeEnum.SERVICE_FAULT,
            StepExceptionTypeEnum.THROTTLING,
        ],
        interval_seconds=5,
        backoff_rate=2.0,
        expire_after_mins=30,
    )
    assert policy.to_request() == {
        "ExceptionType": ["Step.SERVICE_FAULT", "Step.THROTTLING"],
        "IntervalSeconds": 5,
        "BackoffRate": 2.0,
        "ExpireAfterMin": 30,
    }
def test_custom_step_with_retry_policy():
    """CustomStep serializes its retry policies; add_retry_policy appends."""
    # Expected serialized forms of the two policies the step starts with.
    step_policy_request = {
        "ExceptionType": ["Step.SERVICE_FAULT", "Step.THROTTLING"],
        "IntervalSeconds": 1,
        "BackoffRate": 2.0,
        "ExpireAfterMin": 1,
    }
    capacity_policy_request = {
        "ExceptionType": ["SageMaker.CAPACITY_ERROR"],
        "IntervalSeconds": 1,
        "BackoffRate": 2.0,
        "MaxAttempts": 3,
    }

    custom_step = CustomStep(
        name="MyStep",
        retry_policies=[
            StepRetryPolicy(
                exception_types=[
                    StepExceptionTypeEnum.SERVICE_FAULT,
                    StepExceptionTypeEnum.THROTTLING,
                ],
                expire_after_mins=1,
            ),
            SageMakerJobStepRetryPolicy(
                exception_types=[SageMakerJobExceptionTypeEnum.CAPACITY_ERROR],
                max_attempts=3,
            ),
        ],
    )
    assert custom_step.to_request() == {
        "Name": "MyStep",
        "Type": "Training",
        "RetryPolicies": [step_policy_request, capacity_policy_request],
        "Arguments": {},
    }

    # Appending a third policy extends RetryPolicies in order.
    custom_step.add_retry_policy(
        SageMakerJobStepRetryPolicy(
            exception_types=[SageMakerJobExceptionTypeEnum.INTERNAL_ERROR],
            interval_seconds=5,
            backoff_rate=2.0,
            expire_after_mins=5,
        )
    )
    internal_error_policy_request = {
        "ExceptionType": ["SageMaker.JOB_INTERNAL_ERROR"],
        "IntervalSeconds": 5,
        "BackoffRate": 2.0,
        "ExpireAfterMin": 5,
    }
    assert custom_step.to_request() == {
        "Name": "MyStep",
        "Type": "Training",
        "RetryPolicies": [
            step_policy_request,
            capacity_policy_request,
            internal_error_policy_request,
        ],
        "Arguments": {},
    }

    # A step built without retry policies omits the RetryPolicies key.
    custom_step = CustomStep(name="MyStep")
    assert custom_step.to_request() == {
        "Name": "MyStep",
        "Type": "Training",
        "Arguments": {},
    }
def test_pipeline_execution_processing_step_with_retry(
    sagemaker_session,
    smclient,
    role,
    sklearn_latest_version,
    cpu_instance_type,
    pipeline_name,
    athena_dataset_definition,
):
    """Run a one-step pipeline whose ProcessingStep carries retry policies."""
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    script_path = os.path.join(DATA_DIR, "dummy_script.py")
    input_file_path = os.path.join(DATA_DIR, "dummy_input.txt")

    processing_inputs = [
        ProcessingInput(
            source=input_file_path, destination="/opt/ml/processing/inputs/"
        ),
        ProcessingInput(dataset_definition=athena_dataset_definition),
    ]
    processor = SKLearnProcessor(
        framework_version=sklearn_latest_version,
        role=role,
        instance_type=cpu_instance_type,
        instance_count=instance_count,
        command=["python3"],
        sagemaker_session=sagemaker_session,
        base_job_name="test-sklearn",
    )
    processing_step = ProcessingStep(
        name="sklearn-process",
        processor=processor,
        inputs=processing_inputs,
        code=script_path,
        retry_policies=[
            StepRetryPolicy(
                exception_types=[
                    StepExceptionTypeEnum.SERVICE_FAULT,
                    StepExceptionTypeEnum.THROTTLING,
                ],
                backoff_rate=2.0,
                interval_seconds=30,
                expire_after_mins=5,
            ),
            SageMakerJobStepRetryPolicy(
                exception_types=[SageMakerJobExceptionTypeEnum.CAPACITY_ERROR],
                max_attempts=10,
            ),
        ],
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[processing_step],
        sagemaker_session=sagemaker_session,
    )
    try:
        pipeline.create(role)
        execution = pipeline.start(parameters={})
        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            # Execution may still be running; we only inspect the step list.
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sklearn-process"
        # assert execution_steps[0]["AttemptCount"] >= 1
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass  # best-effort cleanup of the test pipeline
def test_model_registration_with_model_repack(
    sagemaker_session,
    role,
    pipeline_name,
    region_name,
):
    """Register a PyTorch model (with repack) behind a condition step."""
    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    entry_point = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1)

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
    )
    train_step = TrainingStep(
        name="pytorch-train",
        estimator=pytorch_estimator,
        inputs=inputs,
        retry_policies=[
            StepRetryPolicy(
                exception_types=[StepExceptionTypeEnum.THROTTLING], max_attempts=3
            )
        ],
    )
    register_step = RegisterModel(
        name="pytorch-register-model",
        estimator=pytorch_estimator,
        model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        description="test-description",
        entry_point=entry_point,
        register_model_step_retry_policies=[
            StepRetryPolicy(
                exception_types=[StepExceptionTypeEnum.THROTTLING], max_attempts=3
            )
        ],
        repack_model_step_retry_policies=[
            StepRetryPolicy(
                exception_types=[StepExceptionTypeEnum.THROTTLING], max_attempts=3
            )
        ],
    )
    model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    model_step = CreateModelStep(
        name="pytorch-model",
        model=model,
        inputs=model_inputs,
    )
    cond_step = ConditionStep(
        name="cond-good-enough",
        conditions=[ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1)],
        if_steps=[train_step, register_step],
        else_steps=[model_step],
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[good_enough_input, instance_count, instance_type],
        steps=[cond_step],
        sagemaker_session=sagemaker_session,
    )

    pipeline_arn_pattern = (
        fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}"
    )
    try:
        create_arn = pipeline.create(role)["PipelineArn"]
        assert re.match(pipeline_arn_pattern, create_arn)

        # Default parameters take the if-branch (train + register).
        execution = pipeline.start(parameters={})
        assert re.match(pipeline_arn_pattern + "/execution/", execution.arn)

        # GoodEnoughInput=0 takes the else-branch (create model).
        execution = pipeline.start(parameters={"GoodEnoughInput": 0})
        assert re.match(pipeline_arn_pattern + "/execution/", execution.arn)
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass  # best-effort cleanup of the test pipeline
def test_estimator_transformer(estimator):
    """EstimatorTransformer expands into a model step and a transform step,
    both carrying the supplied retry policies."""
    model_data = f"s3://{BUCKET}/model.tar.gz"
    model_inputs = CreateModelInput(
        instance_type="c4.4xlarge",
        accelerator_type="ml.eia1.medium",
    )
    service_fault_retry_policy = StepRetryPolicy(
        exception_types=[StepExceptionTypeEnum.SERVICE_FAULT], max_attempts=10
    )
    transform_inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
    estimator_transformer = EstimatorTransformer(
        name="EstimatorTransformerStep",
        estimator=estimator,
        model_data=model_data,
        model_inputs=model_inputs,
        instance_count=1,
        instance_type="ml.c4.4xlarge",
        transform_inputs=transform_inputs,
        depends_on=["TestStep"],
        model_step_retry_policies=[service_fault_retry_policy],
        transform_step_retry_policies=[service_fault_retry_policy],
        repack_model_step_retry_policies=[service_fault_retry_policy],
    )

    request_dicts = estimator_transformer.request_dicts()
    assert len(request_dicts) == 2

    expected_policies = [service_fault_retry_policy.to_request()]
    for request_dict in request_dicts:
        step_type = request_dict["Type"]
        if step_type == "Model":
            assert request_dict == {
                "Name": "EstimatorTransformerStepCreateModelStep",
                "Type": "Model",
                "DependsOn": ["TestStep"],
                "RetryPolicies": expected_policies,
                "Arguments": {
                    "ExecutionRoleArn": "DummyRole",
                    "PrimaryContainer": {
                        "Environment": {},
                        "Image": "fakeimage",
                        "ModelDataUrl": "s3://my-bucket/model.tar.gz",
                    },
                },
            }
        elif step_type == "Transform":
            assert request_dict["Name"] == "EstimatorTransformerStepTransformStep"
            assert request_dict["RetryPolicies"] == expected_policies
            arguments = request_dict["Arguments"]
            # ModelName is a pipeline property reference, not a literal value.
            assert isinstance(arguments["ModelName"], Properties)
            arguments.pop("ModelName")
            assert "DependsOn" not in request_dict
            assert arguments == {
                "TransformInput": {
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": f"s3://{BUCKET}/transform_manifest",
                        }
                    }
                },
                "TransformOutput": {"S3OutputPath": None},
                "TransformResources": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.c4.4xlarge",
                },
            }
        else:
            raise Exception("A step exists in the collection of an invalid type.")
def test_register_model_with_model_repack_with_pipeline_model(
        pipeline_model, model_metrics, drift_check_baselines):
    # RegisterModel with a PipelineModel plus repack policies expands into a
    # repack (Training) step and the RegisterModel step; each carries the
    # retry policies passed for it.
    model_data = f"s3://{BUCKET}/model.tar.gz"
    service_fault_retry_policy = StepRetryPolicy(
        exception_types=[StepExceptionTypeEnum.SERVICE_FAULT], max_attempts=10)
    register_model = RegisterModel(
        name="RegisterModelStep",
        model=pipeline_model,
        model_data=model_data,
        content_types=["content_type"],
        response_types=["response_type"],
        inference_instances=["inference_instance"],
        transform_instances=["transform_instance"],
        model_package_group_name="mpg",
        model_metrics=model_metrics,
        drift_check_baselines=drift_check_baselines,
        approval_status="Approved",
        description="description",
        depends_on=["TestStep"],
        repack_model_step_retry_policies=[service_fault_retry_policy],
        register_model_step_retry_policies=[service_fault_retry_policy],
        tags=[{
            "Key": "myKey",
            "Value": "myValue"
        }],
    )
    request_dicts = register_model.request_dicts()
    assert len(request_dicts) == 2
    for request_dict in request_dicts:
        if request_dict["Type"] == "Training":
            # The repack step: DependsOn propagates only to this first step.
            assert request_dict["Name"] == "modelNameRepackModel"
            assert len(request_dict["DependsOn"]) == 1
            assert request_dict["DependsOn"][0] == "TestStep"
            arguments = request_dict["Arguments"]
            # The repack job name is generated at build time; read it back so
            # the expected submit-directory URI below can reference it.
            repacker_job_name = arguments["HyperParameters"][
                "sagemaker_job_name"]
            assert ordered(arguments) == ordered({
                "AlgorithmSpecification": {
                    "TrainingImage": MODEL_REPACKING_IMAGE_URI,
                    "TrainingInputMode": "File",
                },
                "DebugHookConfig": {
                    "CollectionConfigurations": [],
                    "S3OutputPath": f"s3://{BUCKET}/",
                },
                "HyperParameters": {
                    "dependencies": "null",
                    "inference_script": '"dummy_script.py"',
                    "model_archive": '"model.tar.gz"',
                    "sagemaker_submit_directory":
                        '"s3://{}/{}/source/sourcedir.tar.gz"'.format(
                            BUCKET, repacker_job_name.replace('"', "")),
                    "sagemaker_program": '"_repack_model.py"',
                    "sagemaker_container_log_level": "20",
                    "sagemaker_job_name": repacker_job_name,
                    "sagemaker_region": f'"{REGION}"',
                    "source_dir": "null",
                },
                "InputDataConfig": [{
                    "ChannelName": "training",
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataDistributionType": "FullyReplicated",
                            "S3DataType": "S3Prefix",
                            "S3Uri": f"s3://{BUCKET}",
                        }
                    },
                }],
                "OutputDataConfig": {
                    "S3OutputPath": f"s3://{BUCKET}/"
                },
                "ResourceConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.m5.large",
                    "VolumeSizeInGB": 30,
                },
                "RoleArn": ROLE,
                "StoppingCondition": {
                    "MaxRuntimeInSeconds": 86400
                },
                "Tags": [{
                    "Key": "myKey",
                    "Value": "myValue"
                }],
                "VpcConfig": [
                    ("SecurityGroupIds", ["123", "456"]),
                    ("Subnets", ["abc", "def"]),
                ],
            })
        elif request_dict["Type"] == "RegisterModel":
            assert request_dict["Name"] == "RegisterModelStep"
            assert "DependsOn" not in request_dict
            arguments = request_dict["Arguments"]
            assert len(arguments["InferenceSpecification"]["Containers"]) == 1
            assert (arguments["InferenceSpecification"]["Containers"][0]
                    ["Image"] == pipeline_model.models[0].image_uri)
            # ModelDataUrl is a pipeline property reference, so verify its
            # type, then drop the containers before comparing the rest.
            assert isinstance(
                arguments["InferenceSpecification"]["Containers"][0]
                ["ModelDataUrl"], Properties)
            del arguments["InferenceSpecification"]["Containers"]
            assert ordered(arguments) == ordered({
                "InferenceSpecification": {
                    "SupportedContentTypes": ["content_type"],
                    "SupportedRealtimeInferenceInstanceTypes":
                        ["inference_instance"],
                    "SupportedResponseMIMETypes": ["response_type"],
                    "SupportedTransformInstanceTypes": ["transform_instance"],
                },
                "ModelApprovalStatus": "Approved",
                "ModelMetrics": {
                    "Bias": {},
                    "Explainability": {},
                    "ModelQuality": {
                        "Statistics": {
                            "ContentType": "text/csv",
                            "S3Uri": f"s3://{BUCKET}/metrics.csv",
                        },
                    },
                },
                "DriftCheckBaselines": {
                    "ModelQuality": {
                        "Constraints": {
                            "ContentType": "text/csv",
                            "S3Uri": f"s3://{BUCKET}/constraints_metrics.csv",
                        }
                    }
                },
                "ModelPackageDescription": "description",
                "ModelPackageGroupName": "mpg",
                "Tags": [{
                    "Key": "myKey",
                    "Value": "myValue"
                }],
            })
        else:
            raise Exception(
                "A step exists in the collection of an invalid type.")
def test_estimator_transformer_with_model_repack_with_estimator(estimator):
    # Passing entry_point makes EstimatorTransformer insert a repack
    # (Training) step ahead of the model and transform steps, so all three
    # request dicts must carry the shared retry policy.
    model_data = f"s3://{BUCKET}/model.tar.gz"
    model_inputs = CreateModelInput(
        instance_type="c4.4xlarge",
        accelerator_type="ml.eia1.medium",
    )
    service_fault_retry_policy = StepRetryPolicy(
        exception_types=[StepExceptionTypeEnum.SERVICE_FAULT], max_attempts=10
    )
    transform_inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
    estimator_transformer = EstimatorTransformer(
        name="EstimatorTransformerStep",
        estimator=estimator,
        model_data=model_data,
        model_inputs=model_inputs,
        instance_count=1,
        instance_type="ml.c4.4xlarge",
        transform_inputs=transform_inputs,
        depends_on=["TestStep"],
        model_step_retry_policies=[service_fault_retry_policy],
        transform_step_retry_policies=[service_fault_retry_policy],
        repack_model_step_retry_policies=[service_fault_retry_policy],
        entry_point=f"{DATA_DIR}/dummy_script.py",
    )
    request_dicts = estimator_transformer.request_dicts()
    assert len(request_dicts) == 3
    for request_dict in request_dicts:
        if request_dict["Type"] == "Training":
            # Repack step: the only one that inherits DependsOn.
            assert request_dict["Name"] == "EstimatorTransformerStepRepackModel"
            assert request_dict["DependsOn"] == ["TestStep"]
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            # pop out the dynamic generated fields
            arguments["HyperParameters"].pop("sagemaker_submit_directory")
            assert arguments == {
                "AlgorithmSpecification": {
                    "TrainingInputMode": "File",
                    "TrainingImage": "246618743249.dkr.ecr.us-west-2.amazonaws.com/"
                    + "sagemaker-scikit-learn:0.23-1-cpu-py3",
                },
                "OutputDataConfig": {"S3OutputPath": "s3://my-bucket/"},
                "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
                "ResourceConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.m5.large",
                    "VolumeSizeInGB": 30,
                },
                "RoleArn": "DummyRole",
                "InputDataConfig": [
                    {
                        "DataSource": {
                            "S3DataSource": {
                                "S3DataType": "S3Prefix",
                                "S3Uri": "s3://my-bucket/model.tar.gz",
                                "S3DataDistributionType": "FullyReplicated",
                            }
                        },
                        "ChannelName": "training",
                    }
                ],
                "HyperParameters": {
                    "inference_script": '"dummy_script.py"',
                    "model_archive": '"s3://my-bucket/model.tar.gz"',
                    "dependencies": "null",
                    "source_dir": "null",
                    "sagemaker_program": '"_repack_model.py"',
                    "sagemaker_container_log_level": "20",
                    "sagemaker_region": '"us-west-2"',
                },
                "VpcConfig": {"Subnets": ["abc", "def"], "SecurityGroupIds": ["123", "456"]},
                "DebugHookConfig": {
                    "S3OutputPath": "s3://my-bucket/",
                    "CollectionConfigurations": [],
                },
            }
        elif request_dict["Type"] == "Model":
            assert request_dict["Name"] == "EstimatorTransformerStepCreateModelStep"
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            # ModelDataUrl is a pipeline property reference, not a literal.
            assert isinstance(arguments["PrimaryContainer"]["ModelDataUrl"], Properties)
            arguments["PrimaryContainer"].pop("ModelDataUrl")
            assert "DependsOn" not in request_dict
            assert arguments == {
                "ExecutionRoleArn": "DummyRole",
                "PrimaryContainer": {
                    "Environment": {},
                    "Image": "fakeimage",
                },
            }
        elif request_dict["Type"] == "Transform":
            assert request_dict["Name"] == "EstimatorTransformerStepTransformStep"
            assert request_dict["RetryPolicies"] == [service_fault_retry_policy.to_request()]
            arguments = request_dict["Arguments"]
            # ModelName is a pipeline property reference, not a literal.
            assert isinstance(arguments["ModelName"], Properties)
            arguments.pop("ModelName")
            assert "DependsOn" not in request_dict
            assert arguments == {
                "TransformInput": {
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": f"s3://{BUCKET}/transform_manifest",
                        }
                    }
                },
                "TransformOutput": {"S3OutputPath": None},
                "TransformResources": {"InstanceCount": 1, "InstanceType": "ml.c4.4xlarge"},
            }
        else:
            raise Exception("A step exists in the collection of an invalid type.")