def test_sagemakermodel_passes_correct_params_to_scala():
    """The constructor arguments should be readable back off the model unchanged."""
    image = "model-abc-123"
    path = S3DataPath("my-bucket", "model-abc-123")
    role = "role-789"
    instance_type = "c4.8xlarge"

    model = SageMakerModel(
        endpointInstanceType=instance_type,
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        modelImage=image,
        modelPath=path,
        modelEnvironmentVariables=None,
        modelExecutionRoleARN=role,
        endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        prependResultRows=False,
        namePolicy=None,
        uid="uid")

    # Each property must round-trip through the wrapped Scala object.
    assert model.modelImage == image
    assert model.modelPath.bucket == path.bucket
    assert model.modelExecutionRoleARN == role
    assert model.endpointInstanceType == instance_type
    # No existing endpoint was supplied, so none should be reported.
    assert model.existingEndpointName is None
def test_sagemakermodel_can_do_resource_cleanup():
    """Resources tracked by a SageMakerModel can be handed to SageMakerResourceCleanup."""
    model = SageMakerModel(
        endpointInstanceType="x1.128xlarge",
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        existingEndpointName="my-existing-endpoint-123",
        modelImage="some_image",
        modelPath=S3DataPath("a", "b"),
        modelEnvironmentVariables=None,
        modelExecutionRoleARN="role",
        endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        prependResultRows=False,
        namePolicy=None,
        uid="uid")

    client = model.sagemakerClient
    assert client is not None

    cleanup = SageMakerResourceCleanup(client)
    assert cleanup is not None

    # DO_NOT_CREATE means nothing was provisioned, but the call chain
    # must still succeed end-to-end.
    resources = model.getCreatedResources()
    assert resources is not None
    cleanup.deleteResources(resources)
def test_sagemakerestimator_default_params():
    """An estimator built with only required args should expose documented defaults."""
    estimator = SageMakerEstimator(
        trainingImage="train-abc-123",
        modelImage="model-abc-123",
        trainingInstanceCount=2,
        trainingInstanceType="train-abc-123",
        endpointInstanceType="endpoint-abc-123",
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer()
    )

    # Training configuration defaults.
    assert estimator.trainingInstanceVolumeSizeInGB == 1024
    assert estimator.trainingProjectedColumns is None
    assert estimator.trainingChannelName == "train"
    assert estimator.trainingContentType is None
    assert estimator.trainingS3DataDistribution == "ShardedByS3Key"
    assert estimator.trainingSparkDataFormat == "sagemaker"
    assert estimator.trainingInputMode == "File"
    assert estimator.trainingCompressionCodec is None
    assert estimator.trainingMaxRuntimeInSeconds == 24 * 60 * 60
    assert estimator.trainingKmsKeyId is None

    # Model / lifecycle defaults.
    assert estimator.modelPrependInputRowsToTransformationRows is True
    assert estimator.deleteStagingDataAfterTraining is True
    assert estimator.latestTrainingJob is None
def test_sagemakerestimator_passes_correct_params_to_scala():
    """Explicit constructor arguments must be readable back off the estimator."""
    train_image = "train-abc-123"
    serve_image = "model-abc-123"
    train_count = 2
    train_type = "train-abc-123"
    endpoint_type = "c4.8xlarge"
    endpoint_count = 2

    estimator = SageMakerEstimator(
        trainingImage=train_image,
        modelImage=serve_image,
        trainingInstanceCount=train_count,
        trainingInstanceType=train_type,
        endpointInstanceType=endpoint_type,
        endpointInitialInstanceCount=endpoint_count,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer()
    )

    assert estimator.trainingImage == train_image
    assert estimator.modelImage == serve_image
    assert estimator.trainingInstanceType == train_type
    assert estimator.trainingInstanceCount == train_count
    assert estimator.endpointInitialInstanceCount == endpoint_count
    assert estimator.endpointInstanceType == endpoint_type
def __init__(self,
             trainingInstanceType,
             trainingInstanceCount,
             endpointInstanceType,
             endpointInitialInstanceCount,
             sagemakerRole=IAMRoleFromConfig(),
             requestRowSerializer=ProtobufRequestRowSerializer(),
             responseRowDeserializer=LinearLearnerBinaryClassifierProtobufResponseRowDeserializer(),
             trainingInputS3DataPath=S3AutoCreatePath(),
             trainingOutputS3DataPath=S3AutoCreatePath(),
             trainingInstanceVolumeSizeInGB=1024,
             trainingProjectedColumns=None,
             trainingChannelName="train",
             trainingContentType=None,
             trainingS3DataDistribution="ShardedByS3Key",
             trainingSparkDataFormat="sagemaker",
             trainingSparkDataFormatOptions=None,
             trainingInputMode="File",
             trainingCompressionCodec=None,
             trainingMaxRuntimeInSeconds=24*60*60,
             trainingKmsKeyId=None,
             modelEnvironmentVariables=None,
             endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_CONSTRUCT,
             sagemakerClient=SageMakerClients.create_sagemaker_client(),
             region=None,
             s3Client=SageMakerClients.create_s3_default_client(),
             stsClient=SageMakerClients.create_sts_default_client(),
             modelPrependInputRowsToTransformationRows=True,
             deleteStagingDataAfterTraining=True,
             namePolicyFactory=RandomNamePolicyFactory(),
             uid=None,
             javaObject=None):
    """Construct a LinearLearnerBinaryClassifier estimator.

    Forwards every constructor argument to the parent estimator's
    ``__init__`` and then pins ``predictor_type`` to ``binary_classifier``.
    NOTE(review): defaults such as ``SageMakerClients.create_sagemaker_client()``
    are evaluated once at function-definition time and shared across all
    instances that rely on the default — presumably intentional here, but
    worth confirming against the library's conventions.
    """
    # Dict-valued parameters default to None and are replaced with fresh
    # dicts per call, avoiding the shared-mutable-default pitfall.
    if trainingSparkDataFormatOptions is None:
        trainingSparkDataFormatOptions = {}
    if modelEnvironmentVariables is None:
        modelEnvironmentVariables = {}
    if uid is None:
        uid = Identifiable._randomUID()

    # Capture the full (normalized) argument set by name. This snapshot
    # must be taken before any other locals are introduced, or they would
    # leak into the kwargs forwarded to the parent class.
    kwargs = locals().copy()
    del kwargs['self']
    super(LinearLearnerBinaryClassifier, self).__init__(**kwargs)

    # Fix the algorithm mode: this subclass is always a binary classifier.
    default_params = {
        'predictor_type': 'binary_classifier'
    }
    self._setDefault(**default_params)
def test_linearLearnerBinaryClassifier_passes_correct_params_to_scala():
    """Constructor arguments should survive the round trip into the Scala layer."""
    instance_type = "c4.8xlarge"
    instance_count = 3
    serving_type = "c4.8xlarge"
    serving_count = 3
    bucket = "random-bucket"
    in_prefix = "linear-learner-binary-classifier-training"
    out_prefix = "linear-learner-binary-classifier-out"
    role_arn = "arn:aws:iam::123456789:role/SageMakerRole"

    estimator = LinearLearnerBinaryClassifier(
        trainingInstanceType=instance_type,
        trainingInstanceCount=instance_count,
        endpointInstanceType=serving_type,
        endpointInitialInstanceCount=serving_count,
        sagemakerRole=IAMRole(role_arn),
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=LinearLearnerBinaryClassifierProtobufResponseRowDeserializer(),
        trainingInstanceVolumeSizeInGB=2048,
        trainingInputS3DataPath=S3DataPath(bucket, in_prefix),
        trainingOutputS3DataPath=S3DataPath(bucket, out_prefix),
        trainingMaxRuntimeInSeconds=1,
        endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        s3Client=SageMakerClients.create_s3_default_client(),
        stsClient=SageMakerClients.create_sts_default_client(),
        modelPrependInputRowsToTransformationRows=True,
        namePolicyFactory=RandomNamePolicyFactory(),
        uid="sagemaker")

    assert estimator.trainingInputS3DataPath.bucket == bucket
    assert estimator.trainingInputS3DataPath.objectPath == in_prefix
    assert estimator.trainingInstanceCount == instance_count
    assert estimator.trainingInstanceType == instance_type
    assert estimator.endpointInstanceType == serving_type
    assert estimator.endpointInitialInstanceCount == serving_count
    assert estimator.trainingInstanceVolumeSizeInGB == 2048
    assert estimator.trainingMaxRuntimeInSeconds == 1
    # Not supplied above, so the default (no KMS key) must apply.
    assert estimator.trainingKmsKeyId is None
def test_sagemakermodel_can_be_created_from_java_obj():
    """A model rebuilt via _from_java(_to_java(...)) keeps the same uid."""
    original = SageMakerModel(
        endpointInstanceType="x1.128xlarge",
        endpointInitialInstanceCount=2,
        requestRowSerializer=ProtobufRequestRowSerializer(),
        responseRowDeserializer=KMeansProtobufResponseRowDeserializer(),
        existingEndpointName="my-existing-endpoint-123",
        modelImage="some_image",
        modelPath=S3DataPath("a", "b"),
        modelEnvironmentVariables=None,
        modelExecutionRoleARN="role",
        endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE,
        sagemakerClient=SageMakerClients.create_sagemaker_client(),
        prependResultRows=False,
        namePolicy=None,
        uid="uid")

    # Round-trip through the JVM representation.
    rebuilt = SageMakerModel._from_java(original._to_java())
    assert rebuilt.uid == original.uid
trainingData.show() #Create a custom SageMakerEstimator from sagemaker.amazon.amazon_estimator import get_image_uri from sagemaker_pyspark import SageMakerEstimator from sagemaker_pyspark.transformation.deserializers import KMeansProtobufResponseRowDeserializer from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer from sagemaker_pyspark import IAMRole from sagemaker_pyspark import RandomNamePolicyFactory from sagemaker_pyspark import EndpointCreationPolicy # Create an Estimator from scratch estimator = SageMakerEstimator( trainingImage=get_image_uri(region, 'kmeans'), # Training image modelImage=get_image_uri(region, 'kmeans'), # Model image requestRowSerializer=ProtobufRequestRowSerializer(), responseRowDeserializer=KMeansProtobufResponseRowDeserializer(), hyperParameters={ "k": "10", "feature_dim": "784" }, # Set parameters for K-Means sagemakerRole=IAMRole(role), trainingInstanceType="ml.m4.xlarge", trainingInstanceCount=1, endpointInstanceType="ml.t2.medium", endpointInitialInstanceCount=1, trainingSparkDataFormat="sagemaker", namePolicyFactory=RandomNamePolicyFactory("sparksm-4-"), endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM) customModel = estimator.fit(trainingData)
displayClusters(transformedData) #Re-using existing endpoints or models to create SageMakerModel ENDPOINT_NAME = initialModelEndpointName print(ENDPOINT_NAME) from sagemaker_pyspark import SageMakerModel from sagemaker_pyspark import EndpointCreationPolicy from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer from sagemaker_pyspark.transformation.deserializers import KMeansProtobufResponseRowDeserializer attachedModel = SageMakerModel( existingEndpointName=ENDPOINT_NAME, endpointCreationPolicy=EndpointCreationPolicy.DO_NOT_CREATE, endpointInstanceType=None, # Required endpointInitialInstanceCount=None, # Required requestRowSerializer=ProtobufRequestRowSerializer( featuresColumnName="features"), # Optional: already default value responseRowDeserializer= KMeansProtobufResponseRowDeserializer( # Optional: already default values distance_to_cluster_column_name="distance_to_cluster", closest_cluster_column_name="closest_cluster")) transformedData2 = attachedModel.transform(testData) transformedData2.show() #Create model and endpoint from model data from sagemaker_pyspark import S3DataPath MODEL_S3_PATH = S3DataPath(initialModel.modelPath.bucket, initialModel.modelPath.objectPath) MODEL_ROLE_ARN = initialModel.modelExecutionRoleARN MODEL_IMAGE_PATH = initialModel.modelImage
    endpointInstanceType="ml.t2.large",
    endpointInitialInstanceCount=1,
    # NOTE(review): this closes a constructor call (presumably the PCA
    # SageMaker estimator) that begins above this chunk — confirm upstream.
    namePolicyFactory=RandomNamePolicyFactory("sparksm-3p-"))

# Set parameters for PCA (number of features in input and the number of principal components to find)
pcaSageMakerEstimator.setFeatureDim(784)
pcaSageMakerEstimator.setNumComponents(50)

# 2nd stage: K-Means on SageMaker
kMeansSageMakerEstimator = KMeansSageMakerEstimator(
    sagemakerRole=IAMRole(role),
    trainingSparkDataFormatOptions={
        "featuresColumnName": "projection"},  # Default output column generated by PCASageMakerEstimator
    requestRowSerializer=ProtobufRequestRowSerializer(
        featuresColumnName="projection"),     # Default output column generated by PCASageMakerEstimator
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.t2.large",
    endpointInitialInstanceCount=1,
    namePolicyFactory=RandomNamePolicyFactory("sparksm-3k-"),
    # Endpoint is created lazily, on the first transform() call.
    endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM)

# Set parameters for K-Means: feature_dim matches the 50 PCA components.
kMeansSageMakerEstimator.setFeatureDim(50)
kMeansSageMakerEstimator.setK(10)

# Define the stages of the Pipeline in order
pipelineSM = Pipeline(stages=[pcaSageMakerEstimator, kMeansSageMakerEstimator])
from sagemaker_pyspark import IAMRole, EndpointCreationPolicy, RandomNamePolicyFactory
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer

# ML pipeline with 2 stages: PCA and K-Means
# 1st stage: PCA (runs in Spark, not on SageMaker)
pcaSparkEstimator = PCA(
    inputCol="features",
    outputCol="projectedFeatures",
    k=50)

# 2nd stage: K-Means on SageMaker
kMeansSageMakerEstimator = KMeansSageMakerEstimator(
    sagemakerRole=IAMRole(role),
    # use the output column of PCA
    trainingSparkDataFormatOptions={"featuresColumnName": "projectedFeatures"},
    # use the output column of PCA
    requestRowSerializer=ProtobufRequestRowSerializer(featuresColumnName="projectedFeatures"),
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.t2.medium",
    endpointInitialInstanceCount=1,
    namePolicyFactory=RandomNamePolicyFactory("sparksm-2-"),
    # Endpoint is created lazily, on the first transform() call.
    endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_TRANSFORM)

# Set parameters for K-Means: feature_dim matches the 50 PCA components.
kMeansSageMakerEstimator.setFeatureDim(50)
kMeansSageMakerEstimator.setK(10)

# Define the stages of the Pipeline in order
pipelineSparkSM = Pipeline(stages=[pcaSparkEstimator, kMeansSageMakerEstimator])