# Imports shared by the tests below (assumed to live in the test module header;
# all names are used verbatim by the tests in this section).
from sagemaker_pyspark import (EndpointCreationPolicy, IAMRole, RandomNamePolicyFactory,
                               S3DataPath, SageMakerClients, SageMakerEstimator,
                               SageMakerModel, SageMakerResourceCleanup)
from sagemaker_pyspark.algorithms import KMeansSageMakerEstimator
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer
from sagemaker_pyspark.transformation.deserializers import KMeansProtobufResponseRowDeserializer


def test_sagemakermodel_passes_correct_params_to_scala():
    model_image = "model-abc-123"
    model_path = S3DataPath("my-bucket", "model-abc-123")
    role_arn = "role-789"
    endpoint_instance_type = "c4.8xlarge"

    model = SageMakerModel(
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        modelImage=model_image,
        modelPath=model_path,
        modelEnvironmentVariables=None,
        modelExecutionRoleARN=role_arn,
        endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        prependResultRows=False,
        namePolicy=None,
        uid="uid")

    assert model.modelImage == model_image
    assert model.modelPath.bucket == model_path.bucket
    assert model.modelExecutionRoleARN == role_arn
    assert model.endpointInstanceType == endpoint_instance_type
    assert model.existingEndpointName is None

def test_sagemakermodel_can_do_resource_cleanup():
    endpoint_name = "my-existing-endpoint-123"
    model = SageMakerModel(
        endpointInstanceType="x1.128xlarge",
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        existingEndpointName=endpoint_name,
        modelImage="some_image",
        modelPath=S3DataPath("a", "b"),
        modelEnvironmentVariables=None,
        modelExecutionRoleARN="role",
        endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        prependResultRows=False,
        namePolicy=None,
        uid="uid")

    sm = model.sagemakerClient
    assert sm is not None

    resource_cleanup = SageMakerResourceCleanup(sm)
    assert resource_cleanup is not None

    created_resources = model.getCreatedResources()
    assert created_resources is not None

    resource_cleanup.deleteResources(created_resources)

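# The cleanup pattern exercised by the test above is also useful outside of tests.
# A minimal sketch: `cleanup_resources` is a hypothetical helper name (not part of
# the library), and it assumes `model` is any SageMakerModel whose created
# SageMaker resources should be deleted once no longer needed.
def cleanup_resources(model):
    resource_cleanup = SageMakerResourceCleanup(model.sagemakerClient)
    # Deletes the model, endpoint config, and endpoint recorded for this model.
    resource_cleanup.deleteResources(model.getCreatedResources())
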
def test_sagemakerestimator_default_params():
    training_image = "train-abc-123"
    model_image = "model-abc-123"
    training_instance_count = 2
    training_instance_type = "train-abc-123"
    endpoint_instance_type = "endpoint-abc-123"
    endpoint_initial_instance_count = 2

    estimator = SageMakerEstimator(
        trainingImage=training_image,
        modelImage=model_image,
        trainingInstanceCount=training_instance_count,
        trainingInstanceType=training_instance_type,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=endpoint_initial_instance_count,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer())

    assert estimator.trainingInstanceVolumeSizeInGB == 1024
    assert estimator.trainingProjectedColumns is None
    assert estimator.trainingChannelName == "train"
    assert estimator.trainingContentType is None
    assert estimator.trainingS3DataDistribution == "ShardedByS3Key"
    assert estimator.trainingSparkDataFormat == "sagemaker"
    assert estimator.trainingInputMode == "File"
    assert estimator.trainingCompressionCodec is None
    assert estimator.trainingMaxRuntimeInSeconds == 24 * 60 * 60
    assert estimator.trainingKmsKeyId is None
    assert estimator.modelPrependInputRowsToTransformationRows is True
    assert estimator.deleteStagingDataAfterTraining is True
    assert estimator.latestTrainingJob is None

def test_sagemakerestimator_passes_correct_params_to_scala():
    training_image = "train-abc-123"
    model_image = "model-abc-123"
    training_instance_count = 2
    training_instance_type = "train-abc-123"
    endpoint_instance_type = "c4.8xlarge"
    endpoint_initial_instance_count = 2

    estimator = SageMakerEstimator(
        trainingImage=training_image,
        modelImage=model_image,
        trainingInstanceCount=training_instance_count,
        trainingInstanceType=training_instance_type,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=endpoint_initial_instance_count,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer())

    assert estimator.trainingImage == training_image
    assert estimator.modelImage == model_image
    assert estimator.trainingInstanceType == training_instance_type
    assert estimator.trainingInstanceCount == training_instance_count
    assert estimator.endpointInitialInstanceCount == endpoint_initial_instance_count
    assert estimator.endpointInstanceType == endpoint_instance_type

def __init__(self,
             trainingInstanceType,
             trainingInstanceCount,
             endpointInstanceType,
             endpointInitialInstanceCount,
             sagemakerRole=IAMRoleFromConfig(),
             requestRowSerializer=ProtobufRequestRowSerializer(),
             responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
             trainingInputS3DataPath=S3AutoCreatePath(),
             trainingOutputS3DataPath=S3AutoCreatePath(),
             trainingInstanceVolumeSizeInGB=1024,
             trainingProjectedColumns=None,
             trainingChannelName="train",
             trainingContentType=None,
             trainingS3DataDistribution="ShardedByS3Key",
             trainingSparkDataFormat="sagemaker",
             trainingSparkDataFormatOptions=None,
             trainingInputMode="File",
             trainingCompressionCodec=None,
             trainingMaxRuntimeInSeconds=24 * 60 * 60,
             trainingKmsKeyId=None,
             modelEnvironmentVariables=None,
             endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_CONSTRUCT,
             sagemakerClient=SageMakerClients.create_sagemaker_client(),
             region=None,
             s3Client=SageMakerClients.create_s3_default_client(),
             stsClient=SageMakerClients.create_sts_default_client(),
             modelPrependInputRowsToTransformationRows=True,
             deleteStagingDataAfterTraining=True,
             namePolicyFactory=RandomNamePolicyFactory(),
             uid=None):
    # Replace the None placeholders with fresh per-instance values so the
    # mutable defaults are not shared between instances.
    if trainingSparkDataFormatOptions is None:
        trainingSparkDataFormatOptions = {}
    if modelEnvironmentVariables is None:
        modelEnvironmentVariables = {}
    if uid is None:
        uid = Identifiable._randomUID()

    # Forward every constructor argument to the base SageMakerEstimator.
    kwargs = locals().copy()
    del kwargs['self']
    super(KMeansSageMakerEstimator, self).__init__(**kwargs)

    default_params = {'k': 2}
    self._setDefault(**default_params)

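# A minimal usage sketch for the constructor above, assuming a running Spark
# session with the SageMaker Spark JARs on the classpath. "my-role-arn" and the
# instance types/counts are illustrative placeholders, not values from this repo;
# every other parameter falls back to the defaults shown in the signature.
kmeans_estimator = KMeansSageMakerEstimator(
    sagemakerRole=IAMRole("my-role-arn"),
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.t2.medium",
    endpointInitialInstanceCount=1)
# The K-Means hyperparameter `k` defaults to 2 (set via _setDefault above).
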
def test_sagemakermodel_can_be_created_from_java_obj():
    endpoint_name = "my-existing-endpoint-123"
    model = SageMakerModel(
        endpointInstanceType="x1.128xlarge",
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        existingEndpointName=endpoint_name,
        modelImage="some_image",
        modelPath=S3DataPath("a", "b"),
        modelEnvironmentVariables=None,
        modelExecutionRoleARN="role",
        endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        prependResultRows=False,
        namePolicy=None,
        uid="uid")

    new_model = SageMakerModel._from_java(model._to_java())
    assert new_model.uid == model.uid

def test_kmeansSageMakerEstimator_passes_correct_params_to_scala():
    training_instance_type = "c4.8xlarge"
    training_instance_count = 3
    endpoint_instance_type = "c4.8xlarge"
    endpoint_initial_instance_count = 3
    training_bucket = "random-bucket"
    input_prefix = "kmeans-training"
    output_prefix = "kmeans-out"
    integTestingRole = "arn:aws:iam::123456789:role/SageMakerRole"

    estimator = KMeansSageMakerEstimator(
        trainingInstanceType=training_instance_type,
        trainingInstanceCount=training_instance_count,
        endpointInstanceType=endpoint_instance_type,
        endpointInitialInstanceCount=endpoint_initial_instance_count,
        sagemakerRole=IAMRole(integTestingRole),
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        trainingInstanceVolumeSizeInGB=2048,
        trainingInputS3DataPath=S3DataPath(training_bucket, input_prefix),
        trainingOutputS3DataPath=S3DataPath(training_bucket, output_prefix),
        trainingMaxRuntimeInSeconds=1,
        endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM,
        s3Client=SageMakerClients.create_s3_default_client(),
        stsClient=SageMakerClients.create_sts_default_client(),
        modelPrependInputRowsToTransformationRows=True,
        namePolicyFactory=RandomNamePolicyFactory(),
        uid="sagemaker")

    assert estimator.trainingInputS3DataPath.bucket == training_bucket
    assert estimator.trainingInputS3DataPath.objectPath == input_prefix
    assert estimator.trainingInstanceCount == training_instance_count
    assert estimator.trainingInstanceType == training_instance_type
    assert estimator.endpointInstanceType == endpoint_instance_type
    assert estimator.endpointInitialInstanceCount == endpoint_initial_instance_count
    assert estimator.trainingInstanceVolumeSizeInGB == 2048
    assert estimator.trainingMaxRuntimeInSeconds == 1
    assert estimator.trainingKmsKeyId is None

# Create a custom SageMakerEstimator
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker_pyspark import SageMakerEstimator
from sagemaker_pyspark.transformation.deserializers import KMeansProtobufResponseRowDeserializer
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer
from sagemaker_pyspark import IAMRole
from sagemaker_pyspark import RandomNamePolicyFactory
from sagemaker_pyspark import EndpointCreationPolicy

# Create an Estimator from scratch
estimator = SageMakerEstimator(
    trainingImage=get_image_uri(region, 'kmeans'),  # Training image
    modelImage=get_image_uri(region, 'kmeans'),     # Model image
    requestRowSerializer=ProtobufRequestRowSerializer(),
    responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
    hyperParameters={"k": "10", "feature_dim": "784"},  # Set parameters for K-Means
    sagemakerRole=IAMRole(role),
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.t2.medium",
    endpointInitialInstanceCount=1,
    trainingSparkDataFormat="sagemaker",
    namePolicyFactory=RandomNamePolicyFactory("sparksm-4-"),
    endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM)

customModel = estimator.fit(trainingData)

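# A follow-up sketch, assuming `testData` is a DataFrame with a Vector "features"
# column like `trainingData` above (both names are carried over from the
# surrounding examples). Because of CREATE_ON_TRANSFORM, the endpoint for the
# custom model is created on this first transform() call.
customTransformedData = customModel.transform(testData)
customTransformedData.show()
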
print(ENDPOINT_NAME)

from sagemaker_pyspark import SageMakerModel
from sagemaker_pyspark import EndpointCreationPolicy
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer
from sagemaker_pyspark.transformation.deserializers import KMeansProtobufResponseRowDeserializer

attachedModel = SageMakerModel(
    existingEndpointName=ENDPOINT_NAME,
    endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
    endpointInstanceType=None,           # Required
    endpointInitialInstanceCount=None,   # Required
    requestRowSerializer=ProtobufRequestRowSerializer(
        featuresColumnName="features"),  # Optional: already the default value
    responseRowDeserializer=KMeansProtobufResponseRowDeserializer(
        # Optional: already the default values
        distance_to_cluster_column_name="distance_to_cluster",
        closest_cluster_column_name="closest_cluster"))

transformedData2 = attachedModel.transform(testData)
transformedData2.show()

# Create model and endpoint from model data
from sagemaker_pyspark import S3DataPath

MODEL_S3_PATH = S3DataPath(initialModel.modelPath.bucket,
                           initialModel.modelPath.objectPath)
MODEL_ROLE_ARN = initialModel.modelExecutionRoleARN
MODEL_IMAGE_PATH = initialModel.modelImage

print(MODEL_S3_PATH.bucket + "/" + MODEL_S3_PATH.objectPath)
print(MODEL_ROLE_ARN)
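
# A minimal sketch of the step the comment above sets up: building a new model
# from the printed model data, reusing constructor parameters shown in the tests
# earlier. The endpoint instance type and count are illustrative assumptions, and
# CREATE_ON_TRANSFORM defers endpoint creation until the first transform() call.
retrievedModel = SageMakerModel(
    modelPath=MODEL_S3_PATH,
    modelImage=MODEL_IMAGE_PATH,
    modelExecutionRoleARN=MODEL_ROLE_ARN,
    endpointInstanceType="ml.t2.medium",
    endpointInitialInstanceCount=1,
    requestRowSerializer=ProtobufRequestRowSerializer(),
    responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
    endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM)

transformedData3 = retrievedModel.transform(testData)
transformedData3.show()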