import os

import boto3
import sagemaker
from sagemaker.pipeline import PipelineModel
from sagemaker.sparkml.model import SparkMLModel

s3 = boto3.resource('s3')


def lambda_handler(event, context):
    schema_json = get_schema_json()  # helper defined elsewhere; see sketch below
    bucket = os.environ['bucket']

    # Get the execution ID written to S3 by an earlier step
    sess = sagemaker.Session()
    obj = s3.Object(bucket, 'execution.txt')
    exec_id = obj.get()['Body'].read().decode("utf-8")
    role = os.environ['role']
    print(exec_id)

    # Build names derived from the execution ID
    training_job = f'{exec_id}-job'
    mleap_model_prefix = f'sagemaker/spark-preprocess-demo/{exec_id}/mleap-model'

    # Create the models for the pipeline: attach to the finished training job,
    # then wrap the serialized SparkML (MLeap) preprocessor
    xgb_model = sagemaker.estimator.Estimator.attach(
        training_job, sagemaker_session=sess).create_model()
    sparkml_data = 's3://{}/{}/{}'.format(bucket, mleap_model_prefix, 'model.tar.gz')
    sparkml_model = SparkMLModel(model_data=sparkml_data,
                                 env={'SAGEMAKER_SPARKML_SCHEMA': schema_json})

    # Create the pipeline model; building a transformer registers the model
    # with SageMaker so a later step can launch the transform job
    model_name = 'inference-pipeline-' + exec_id
    sm_model = PipelineModel(name=model_name, role=role,
                             models=[sparkml_model, xgb_model])
    sm_model.transformer(1, 'ml.m4.xlarge')

    event['model_name'] = model_name
    event['timestamp_prefix'] = exec_id
    return event
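
# The handler above calls get_schema_json() without defining it. Below is a
# minimal sketch of such a helper, assuming an abalone-style feature set as in
# the SageMaker SparkML serving examples; the column names and types are
# illustrative, not taken from the original code.
import json


def get_schema_json():
    # SAGEMAKER_SPARKML_SCHEMA expects a JSON document describing the input
    # columns fed to the SparkML (MLeap) pipeline and the output it emits.
    schema = {
        "input": [
            {"name": "sex", "type": "string"},
            {"name": "length", "type": "double"},
            {"name": "diameter", "type": "double"},
        ],
        "output": {"name": "features", "type": "double", "struct": "vector"},
    }
    return json.dumps(schema)
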
# Integration test (SageMaker Python SDK v1 style): upload pretrained SparkML
# and XGBoost artifacts, chain them into a PipelineModel, and run a batch
# transform job against CSV input.
def test_inference_pipeline_batch_transform(sagemaker_session):
    sparkml_model_data = sagemaker_session.upload_data(
        path=os.path.join(SPARKML_DATA_PATH, 'mleap_model.tar.gz'),
        key_prefix='integ-test-data/sparkml/model')
    xgb_model_data = sagemaker_session.upload_data(
        path=os.path.join(XGBOOST_DATA_PATH, 'xgb_model.tar.gz'),
        key_prefix='integ-test-data/xgboost/model')
    batch_job_name = 'test-inference-pipeline-batch-{}'.format(sagemaker_timestamp())

    sparkml_model = SparkMLModel(model_data=sparkml_model_data,
                                 env={'SAGEMAKER_SPARKML_SCHEMA': SCHEMA},
                                 sagemaker_session=sagemaker_session)
    xgb_image = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')
    xgb_model = Model(model_data=xgb_model_data, image=xgb_image,
                      sagemaker_session=sagemaker_session)
    model = PipelineModel(models=[sparkml_model, xgb_model], role='SageMakerRole',
                          sagemaker_session=sagemaker_session, name=batch_job_name)

    transformer = model.transformer(1, 'ml.m4.xlarge')
    transform_input_key_prefix = 'integ-test-data/sparkml_xgboost/transform'
    transform_input = transformer.sagemaker_session.upload_data(
        path=VALID_DATA_PATH, key_prefix=transform_input_key_prefix)

    # The model and transform job are cleaned up even if the transform times out
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.transform(transform_input, content_type=CONTENT_TYPE_CSV,
                              job_name=batch_job_name)
        transformer.wait()
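
# The test above targets batch transform with the legacy v1 API (get_image_uri,
# Model(image=...)). The same PipelineModel can also back a real-time endpoint;
# a minimal sketch under the same assumptions (the `model` built above, an
# endpoint name chosen here purely for illustration):
def _deploy_pipeline_realtime(model):
    # PipelineModel.deploy creates the model, endpoint config, and endpoint,
    # and returns a predictor bound to the new endpoint.
    predictor = model.deploy(
        initial_instance_count=1,
        instance_type='ml.m4.xlarge',
        endpoint_name='test-inference-pipeline-endpoint',  # hypothetical name
    )
    return predictor
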
# Unit test: every keyword passed to PipelineModel.transformer should land on
# the returned Transformer unchanged. `tfo` and `time` are mocks injected by
# @patch decorators in the original test module.
def test_transformer(tfo, time, sagemaker_session):
    framework_model = DummyFrameworkModel(sagemaker_session)
    sparkml_model = SparkMLModel(model_data=MODEL_DATA_2, role=ROLE,
                                 sagemaker_session=sagemaker_session)
    model_name = "ModelName"
    model = PipelineModel(
        models=[framework_model, sparkml_model],
        role=ROLE,
        sagemaker_session=sagemaker_session,
        name=model_name,
    )

    instance_count = 55
    strategy = "MultiRecord"
    assemble_with = "Line"
    output_path = "s3://output/path"
    output_kms_key = "output:kms:key"
    accept = "application/jsonlines"
    env = {"my_key": "my_value"}
    max_concurrent_transforms = 20
    max_payload = 5
    tags = [{"my_tag": "my_value"}]
    volume_kms_key = "volume:kms:key"

    transformer = model.transformer(
        instance_type=INSTANCE_TYPE,
        instance_count=instance_count,
        strategy=strategy,
        assemble_with=assemble_with,
        output_path=output_path,
        output_kms_key=output_kms_key,
        accept=accept,
        env=env,
        max_concurrent_transforms=max_concurrent_transforms,
        max_payload=max_payload,
        tags=tags,
        volume_kms_key=volume_kms_key,
    )

    assert transformer.instance_type == INSTANCE_TYPE
    assert transformer.instance_count == instance_count
    assert transformer.strategy == strategy
    assert transformer.assemble_with == assemble_with
    assert transformer.output_path == output_path
    assert transformer.output_kms_key == output_kms_key
    assert transformer.accept == accept
    assert transformer.env == env
    assert transformer.max_concurrent_transforms == max_concurrent_transforms
    assert transformer.max_payload == max_payload
    assert transformer.tags == tags
    assert transformer.volume_kms_key == volume_kms_key
    assert transformer.model_name == model_name
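
# test_transformer above also relies on a `sagemaker_session` fixture that
# must not touch AWS. A sketch of what such a fixture can look like, assuming
# unittest.mock; the bucket name is a placeholder, and the real fixture in the
# SDK's test suite is considerably richer.
import pytest
from unittest.mock import Mock


@pytest.fixture
def sagemaker_session():
    session = Mock(name="sagemaker_session")
    session.default_bucket.return_value = "placeholder-bucket"
    # Every boto interaction hits the Mock, so no real AWS calls are made.
    return session
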
# The same integration test updated for SDK v2: image_uris.retrieve replaces
# get_image_uri, Model takes image_uri instead of image, and the instance type
# comes in as a fixture.
def test_inference_pipeline_batch_transform(sagemaker_session, cpu_instance_type):
    sparkml_model_data = sagemaker_session.upload_data(
        path=os.path.join(SPARKML_DATA_PATH, "mleap_model.tar.gz"),
        key_prefix="integ-test-data/sparkml/model",
    )
    xgb_model_data = sagemaker_session.upload_data(
        path=os.path.join(XGBOOST_DATA_PATH, "xgb_model.tar.gz"),
        key_prefix="integ-test-data/xgboost/model",
    )
    batch_job_name = "test-inference-pipeline-batch-{}".format(sagemaker_timestamp())

    sparkml_model = SparkMLModel(
        model_data=sparkml_model_data,
        env={"SAGEMAKER_SPARKML_SCHEMA": SCHEMA},
        sagemaker_session=sagemaker_session,
    )
    xgb_image = image_uris.retrieve(
        "xgboost", sagemaker_session.boto_region_name,
        version="1", image_scope="inference",
    )
    xgb_model = Model(model_data=xgb_model_data, image_uri=xgb_image,
                      sagemaker_session=sagemaker_session)
    model = PipelineModel(
        models=[sparkml_model, xgb_model],
        role="SageMakerRole",
        sagemaker_session=sagemaker_session,
        name=batch_job_name,
    )

    transformer = model.transformer(1, cpu_instance_type)
    transform_input_key_prefix = "integ-test-data/sparkml_xgboost/transform"
    transform_input = transformer.sagemaker_session.upload_data(
        path=VALID_DATA_PATH, key_prefix=transform_input_key_prefix)

    # The model and transform job are cleaned up even if the transform times out
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.transform(transform_input, content_type="text/csv",
                              job_name=batch_job_name)
        transformer.wait()
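
# For reference, image_uris.retrieve can be exercised on its own; the region
# below is an example, not a value taken from the test file.
from sagemaker import image_uris

xgb_uri = image_uris.retrieve(
    "xgboost", "us-east-1", version="1", image_scope="inference"
)
print(xgb_uri)  # prints the ECR image URI for the XGBoost inference container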