def sagemaker_model(image: str, hyperparams: dict, role: str, output_dir: str,
                    region_name: str = 'us-east-1', instance_type: str = 'ml.m4.xlarge'):
    """
    :param image: built-in algorithm name (e.g. 'xgboost') used to look up the training image
    :param hyperparams: hyperparameters to set on the estimator
    :param role: IAM role ARN that SageMaker assumes for training
    :param output_dir: S3 path where the model artifacts are written
    :param region_name: AWS region used to resolve the image URI
    :param instance_type: training instance type
    :return: configured sagemaker.estimator.Estimator
    """
    if image == 'xgboost':
        input_mode = 'File'
        container = get_image_uri(region_name, image, '0.90-2')
    else:
        input_mode = 'Pipe'
        container = get_image_uri(region_name, image)
    model = sagemaker.estimator.Estimator(container,
                                          role=role,
                                          input_mode=input_mode,
                                          train_instance_count=1,
                                          output_path=output_dir,
                                          train_instance_type=instance_type,
                                          train_use_spot_instances=True,
                                          train_max_run=300,
                                          train_max_wait=600)
    model.set_hyperparameters(**hyperparams)
    return model
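# A minimal usage sketch for sagemaker_model(), assuming the imports the function relies on
# (get_image_uri, sagemaker, s3_input) are available; the role ARN, bucket and channel paths
# below are illustrative placeholders, not values from the original code.
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input

estimator = sagemaker_model(
    image='xgboost',
    hyperparams={'objective': 'reg:linear', 'num_round': 100},
    role='arn:aws:iam::123456789012:role/SageMakerRole',      # placeholder ARN
    output_dir='s3://example-bucket/xgboost/output')          # placeholder bucket
estimator.fit({'train': s3_input('s3://example-bucket/xgboost/train', content_type='text/csv')})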
def test_get_xgboost_image_uri_throws_error_for_unsupported_version():
    with pytest.raises(ValueError) as error:
        get_image_uri(REGION, "xgboost", "99.99-9")
    assert "SageMaker XGBoost version 99.99-9 is not supported" in str(error)

    with pytest.raises(ValueError) as error:
        get_image_uri(REGION, "xgboost", "0.90-1-gpu-py3")
    assert "SageMaker XGBoost version 0.90-1-gpu-py3 is not supported" in str(error)
def test_get_xgboost_image_uri():
    legacy_xgb_image_uri = get_image_uri(REGION, "xgboost")
    assert legacy_xgb_image_uri == "433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1"

    updated_xgb_image_uri = get_image_uri(REGION, "xgboost", "0.90-1")
    assert (
        updated_xgb_image_uri
        == "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3"
    )
def test_gov_ecr_uri():
    assert (
        get_image_uri("us-gov-west-1", "kmeans", "latest")
        == "226302683700.dkr.ecr.us-gov-west-1.amazonaws.com/kmeans:latest"
    )
    assert (
        get_image_uri("us-iso-east-1", "kmeans", "latest")
        == "490574956308.dkr.ecr.us-iso-east-1.c2s.ic.gov/kmeans:latest"
    )
def test_inference_pipeline_model_deploy_with_update_endpoint(
    sagemaker_session, cpu_instance_type, alternative_cpu_instance_type
):
    sparkml_data_path = os.path.join(DATA_DIR, "sparkml_model")
    xgboost_data_path = os.path.join(DATA_DIR, "xgboost_model")
    endpoint_name = "test-inference-pipeline-deploy-{}".format(sagemaker_timestamp())
    sparkml_model_data = sagemaker_session.upload_data(
        path=os.path.join(sparkml_data_path, "mleap_model.tar.gz"),
        key_prefix="integ-test-data/sparkml/model",
    )
    xgb_model_data = sagemaker_session.upload_data(
        path=os.path.join(xgboost_data_path, "xgb_model.tar.gz"),
        key_prefix="integ-test-data/xgboost/model",
    )

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        sparkml_model = SparkMLModel(
            model_data=sparkml_model_data,
            env={"SAGEMAKER_SPARKML_SCHEMA": SCHEMA},
            sagemaker_session=sagemaker_session,
        )
        xgb_image = get_image_uri(sagemaker_session.boto_region_name, "xgboost")
        xgb_model = Model(
            model_data=xgb_model_data, image=xgb_image, sagemaker_session=sagemaker_session
        )
        model = PipelineModel(
            models=[sparkml_model, xgb_model],
            role="SageMakerRole",
            sagemaker_session=sagemaker_session,
        )
        model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
        old_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
            EndpointName=endpoint_name
        )
        old_config_name = old_endpoint["EndpointConfigName"]

        model.deploy(1, cpu_instance_type, update_endpoint=True, endpoint_name=endpoint_name)

        # Wait for endpoint to finish updating
        # Endpoint update takes ~7min. 40 retries * 30s sleeps = 20min timeout
        for _ in retries(40, "Waiting for 'InService' endpoint status", seconds_to_sleep=30):
            new_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
                EndpointName=endpoint_name
            )
            if new_endpoint["EndpointStatus"] == "InService":
                break

        new_config_name = new_endpoint["EndpointConfigName"]
        new_config = sagemaker_session.sagemaker_client.describe_endpoint_config(
            EndpointConfigName=new_config_name
        )
        assert old_config_name != new_config_name
        assert new_config["ProductionVariants"][0]["InstanceType"] == cpu_instance_type
        assert new_config["ProductionVariants"][0]["InitialInstanceCount"] == 1

    model.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
    assert "Could not find model" in str(exception.value)
def test_byo_estimator(sagemaker_session, region, cpu_instance_type):
    """Use the Factorization Machines algorithm as an example here.

    First we prepare data for training: we take a standard data set, convert it to the
    format the algorithm can process, and upload it to S3. Then we create the Estimator
    and set the hyperparameters required by the algorithm. Next, we call fit() with the
    S3 path to the training data. Finally, the trained model is deployed and predictions
    are made against the endpoint. The default predictor is updated with a JSON
    serializer and deserializer.
    """
    image_name = get_image_uri(region, "factorization-machines")
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
    job_name = unique_name_from_base("byo")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = "test_byo_estimator"
        key = "recordio-pb-data"
        s3_train_data = sagemaker_session.upload_data(
            path=training_data_path, key_prefix=os.path.join(prefix, "train", key)
        )

        estimator = Estimator(
            image_name=image_name,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(
            num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type="binary_classifier"
        )

        # training labels must be 'float32'
        estimator.fit({"train": s3_train_data}, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        predictor.serializer = fm_serializer
        predictor.content_type = "application/json"
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None
def trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path, s3_test_data=None):
    """
    Create an Estimator from the given hyperparams, fit it to the training data,
    and return the fitted estimator.
    """
    # set up the estimator
    knn = sagemaker.estimator.Estimator(
        get_image_uri(boto3.Session().region_name, "knn"),
        role,  # COMMENTED OUT get_execution_role() and replaced with the created role
        train_instance_count=1,
        train_instance_type='ml.m5.2xlarge',
        output_path=output_path,
        sagemaker_session=sagemaker.Session())
    knn.set_hyperparameters(**hyperparams)

    # train a model. fit_input contains the locations of the train and test data
    fit_input = {'train': s3_train_data}
    if s3_test_data is not None:
        fit_input['test'] = s3_test_data
    knn.fit(fit_input)
    return knn
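# A minimal sketch of how the estimator returned by trained_estimator_from_hyperparams()
# might be deployed for inference, assuming the module-level `role` it references is set;
# the hyperparameters, S3 paths and instance type below are illustrative placeholders.
hyperparams = {'feature_dim': 784, 'k': 10, 'sample_size': 500, 'predictor_type': 'classifier'}
knn_estimator = trained_estimator_from_hyperparams(
    's3://example-bucket/knn/train',              # placeholder training data
    hyperparams,
    's3://example-bucket/knn/output',             # placeholder output path
    s3_test_data='s3://example-bucket/knn/test')  # placeholder test data
knn_predictor = knn_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')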
def sagemakerTrain():
    try:
        # get the ARN of the executing role (to pass to Sagemaker for training)
        role = 'arn:aws:iam::056149205531:role/service-role/AmazonSageMaker-ExecutionRole-20180112T102983'
        s3_train_data = 's3://{}/train/{}'.format(bucket, dataset)
        container = get_image_uri(boto3.Session().region_name, 'linear-learner')
        session = sagemaker.Session()

        # set up the training params
        linear = sagemaker.estimator.Estimator(
            container,
            role,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            output_path=output_location,
            sagemaker_session=session)

        # set up the hyperparameters
        linear.set_hyperparameters(feature_dim=13,
                                   predictor_type='regressor',
                                   epochs=10,
                                   loss='absolute_loss',
                                   optimizer='adam',
                                   mini_batch_size=200)

        linear.fit({'train': s3_train_data}, wait=False)
    except Exception as err:
        logger.error("Error while launching SageMaker training: {}".format(err))
def trainModel():
    sess = sagemaker.Session()
    container = get_image_uri(region, 'xgboost')
    YColumns = ['result']
    numericalCols = ['guarantee_percentage', 'container_id_label']
    categoricalCols = ['component_name', 'slot_names', 'container_type', 'component_namespace',
                       'component_display_name', 'customer_targeting', 'site']
    columns_to_keep = YColumns + numericalCols + categoricalCols
    output_path_str = 's3://{}/{}/sagemaker-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

    xgb = sagemaker.estimator.Estimator(container,
                                        role,
                                        train_instance_count=1,
                                        train_instance_type='ml.m4.xlarge',
                                        output_path=output_path_str.format(input_bucket, 'results'),
                                        sagemaker_session=sess)
    # set_hyperparameters takes keyword arguments, not a dict literal
    xgb.set_hyperparameters(objective='multi:softmax',
                            colsample_bytree=0.3,
                            learning_rate=0.3,
                            max_depth=16,
                            alpha=5,
                            num_class=6,
                            n_estimators=200,
                            num_round=200)

    input_prefix = 'inputs'
    s3_input_train = sagemaker.s3_input(
        s3_data='s3://{}/{}/{}'.format(input_bucket, input_prefix, s3_training_file),
        content_type='csv')
    s3_input_validation = sagemaker.s3_input(
        s3_data='s3://{}/{}/{}'.format(input_bucket, input_prefix, s3_training_file),
        content_type='csv')
    xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})
    saveModel(xgb, columns_to_keep)
    return
def test_tf_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        tf = TensorFlow(
            image_name=get_image_uri(
                sagemaker_session.boto_session.region_name, "factorization-machines"
            ),
            entry_point=SCRIPT,
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            script_mode=True,
            framework_version=TensorFlow.LATEST_VERSION,
            py_version=PYTHON_VERSION,
            metric_definitions=[
                {"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}
            ],
        )
        inputs = tf.sagemaker_session.upload_data(
            path=os.path.join(TF_MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
        )

        training_config = _build_airflow_workflow_tf(
            estimator=tf, instance_type=cpu_instance_type, inputs=inputs
        )

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["HyperParameters"]["sagemaker_submit_directory"].strip('"'),
        )
def test_byo_airflow_config_uploads_data_source_to_s3_when_inputs_provided(
    sagemaker_session, cpu_instance_type
):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        training_data_path = os.path.join(DATA_DIR, "dummy_tensor")

        data_source_location = "test-airflow-config-{}".format(sagemaker_timestamp())
        inputs = sagemaker_session.upload_data(
            path=training_data_path, key_prefix=os.path.join(data_source_location, "train")
        )

        estimator = Estimator(
            image_name=get_image_uri(
                sagemaker_session.boto_session.region_name, "factorization-machines"
            ),
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        training_config = _build_airflow_workflow(
            estimator=estimator, instance_type=cpu_instance_type, inputs=inputs
        )

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"],
        )
def make_estimator(job_name, s3_output, input_mode='Pipe', train_instance_count=1,
                   train_instance_type='ml.p2.xlarge', train_volume_size=30,
                   train_max_run=360000):
    role = get_execution_role()
    sess = sagemaker.Session()
    training_image = get_image_uri(sess.boto_region_name, 'image-classification',
                                   repo_version="latest")
    estimator = sagemaker.estimator.Estimator(
        training_image,
        role,
        train_instance_count=train_instance_count,
        train_instance_type=train_instance_type,
        train_volume_size=train_volume_size,
        train_max_run=train_max_run,
        input_mode=input_mode,
        output_path=s3_output,
        sagemaker_session=sess,
        base_job_name=job_name)
    return estimator
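# A minimal sketch of using make_estimator() with the built-in image-classification
# algorithm; the hyperparameter values, bucket name and channel URIs are illustrative
# placeholders, not taken from the original code.
estimator = make_estimator('ic-demo', 's3://example-bucket/ic/output')
estimator.set_hyperparameters(num_layers=18,
                              image_shape='3,224,224',
                              num_classes=2,
                              num_training_samples=1000,
                              epochs=5)
estimator.fit({'train': 's3://example-bucket/ic/train',
               'validation': 's3://example-bucket/ic/validation'})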
def test_inference_pipeline_batch_transform(sagemaker_session):
    sparkml_model_data = sagemaker_session.upload_data(
        path=os.path.join(SPARKML_DATA_PATH, 'mleap_model.tar.gz'),
        key_prefix='integ-test-data/sparkml/model')
    xgb_model_data = sagemaker_session.upload_data(
        path=os.path.join(XGBOOST_DATA_PATH, 'xgb_model.tar.gz'),
        key_prefix='integ-test-data/xgboost/model')
    batch_job_name = 'test-inference-pipeline-batch-{}'.format(sagemaker_timestamp())
    sparkml_model = SparkMLModel(model_data=sparkml_model_data,
                                 env={'SAGEMAKER_SPARKML_SCHEMA': SCHEMA},
                                 sagemaker_session=sagemaker_session)
    xgb_image = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')
    xgb_model = Model(model_data=xgb_model_data,
                      image=xgb_image,
                      sagemaker_session=sagemaker_session)
    model = PipelineModel(models=[sparkml_model, xgb_model],
                          role='SageMakerRole',
                          sagemaker_session=sagemaker_session,
                          name=batch_job_name)
    transformer = model.transformer(1, 'ml.m4.xlarge')
    transform_input_key_prefix = 'integ-test-data/sparkml_xgboost/transform'
    transform_input = transformer.sagemaker_session.upload_data(
        path=VALID_DATA_PATH, key_prefix=transform_input_key_prefix)

    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.transform(transform_input, content_type=CONTENT_TYPE_CSV,
                              job_name=batch_job_name)
        transformer.wait()
def create_blaxing_text_model(
        region_name: str,
        sm_session: Session,
        sm_role: str,
        s3_input_url: str,
        s3_output_url: str):
    """
    Create a BlazingText model.

    Args:
    - region_name: AWS Region Name to use SageMaker in.
    - sm_session: SageMaker Session Object.
    - sm_role: SageMaker role ARN that allows SageMaker to connect to S3.
    - s3_input_url: training data input path on S3.
    - s3_output_url: model artifacts output path.

    Return:
    - bt_model: instance of Estimator, can be used to deploy an inference endpoint.
    """
    # define container
    container = get_image_uri(region_name, "blazingtext", "latest")

    # create estimator
    bt_model = Estimator(container,
                         sm_role,
                         train_instance_count=1,
                         train_instance_type='ml.c4.2xlarge',
                         train_volume_size=30,
                         train_max_run=360000,
                         input_mode='File',
                         output_path=s3_output_url,
                         sagemaker_session=sm_session)

    # set hyperparameters
    bt_model.set_hyperparameters(mode="skipgram",
                                 epochs=5,
                                 min_count=5,
                                 sampling_threshold=0.0001,
                                 learning_rate=0.05,
                                 window_size=5,
                                 vector_dim=100,
                                 negative_samples=5,
                                 subwords=True,
                                 min_char=3,
                                 max_char=6,
                                 batch_size=11,
                                 evaluation=True)

    # define data channels
    train_data = s3_input(s3_input_url,
                          distribution='FullyReplicated',
                          content_type='text/plain',
                          s3_data_type='S3Prefix')
    data_channels = {'train': train_data}

    # fit model
    bt_model.fit(inputs=data_channels, logs=True)
    return bt_model
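# A minimal sketch of calling create_blaxing_text_model(), assuming the Session class from
# its type hint is imported; the region, role ARN and bucket paths below are placeholders.
# The returned estimator could then be deployed with .deploy() if an endpoint is needed.
bt_estimator = create_blaxing_text_model(
    region_name='us-east-1',                                  # placeholder region
    sm_session=Session(),
    sm_role='arn:aws:iam::123456789012:role/SageMakerRole',   # placeholder ARN
    s3_input_url='s3://example-bucket/blazingtext/train',     # placeholder input
    s3_output_url='s3://example-bucket/blazingtext/output')   # placeholder output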
def test_async_byo_estimator(sagemaker_session, region, cpu_instance_type):
    image_name = get_image_uri(region, "factorization-machines")
    endpoint_name = unique_name_from_base("byo")
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
    job_name = unique_name_from_base("byo")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = "test_byo_estimator"
        key = "recordio-pb-data"
        s3_train_data = sagemaker_session.upload_data(
            path=training_data_path, key_prefix=os.path.join(prefix, "train", key)
        )

        estimator = Estimator(
            image_name=image_name,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(
            num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type="binary_classifier"
        )

        # training labels must be 'float32'
        estimator.fit({"train": s3_train_data}, wait=False, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(
            training_job_name=job_name, sagemaker_session=sagemaker_session
        )
        model = estimator.create_model()
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = "application/json"
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None

        assert estimator.train_image() == image_name
def _do_training(self):
    self._logger.info('Training data is located in: {}'.format(self._data_s3_url))
    self._logger.info('Artifacts will be located in: {}'.format(self._output_location))

    self._job_name = 'kmeans-batch-training-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    image = get_image_uri(boto3.Session().region_name, 'kmeans')

    create_training_params = \
        {
            "AlgorithmSpecification": {
                "TrainingImage": image,
                "TrainingInputMode": "File"
            },
            "RoleArn": self._ml_engine.iam_role,
            "OutputDataConfig": {
                "S3OutputPath": self._output_location
            },
            "ResourceConfig": {
                "InstanceCount": self._instance_count,
                "InstanceType": self._instance_type,
                "VolumeSizeInGB": self._volume_size_in_gb
            },
            "TrainingJobName": self._job_name,
            "HyperParameters": {
                "k": str(self._hyper_parameter_k),
                "epochs": str(self._epochs),
                "feature_dim": str(self._num_features),
                "mini_batch_size": str(self._mini_batch_size),
                "force_dense": "True"
            },
            "StoppingCondition": {
                "MaxRuntimeInSeconds": self._max_runtime_in_seconds
            },
            "InputDataConfig": [
                {
                    "ChannelName": "train",
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": self._data_s3_url,
                            "S3DataDistributionType": "FullyReplicated"
                        }
                    },
                    "CompressionType": "None",
                    "RecordWrapperType": "None"
                }
            ]
        }

    self._logger.info("Creating training job ... {}".format(self._job_name))
    self._sagemaker_client.create_training_job(**create_training_params)
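# A minimal sketch (not part of the original class) of how a caller might block until the
# low-level training job created in _do_training() finishes, using the standard boto3
# SageMaker waiter; the method name _wait_for_training is a hypothetical addition.
def _wait_for_training(self):
    waiter = self._sagemaker_client.get_waiter('training_job_completed_or_stopped')
    waiter.wait(TrainingJobName=self._job_name)
    status = self._sagemaker_client.describe_training_job(
        TrainingJobName=self._job_name)['TrainingJobStatus']
    self._logger.info("Training job {} finished with status {}".format(self._job_name, status))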
def getcontainer(self, region):
    """
    xgboost specific code goes here to set up the training container
    :param region: AWS region name used to resolve the image URI
    :return: ECR image URI for the built-in XGBoost container
    """
    from sagemaker.amazon.amazon_estimator import get_image_uri
    container = get_image_uri(region, 'xgboost')
    return container
def create_model(image: str, hyperparameters: dict, instance_type: str, output_path: str,
                 region_name: str, role: str, s3_train: str, s3_validation: str, job_name: str):
    if image == 'xgboost':
        container = get_image_uri(region_name, image, '0.90-2')
    else:
        container = get_image_uri(region_name, image)

    save_interval = '1'
    model = sagemaker.estimator.Estimator(
        container,
        role=role,
        train_instance_count=1,
        train_instance_type=instance_type,
        train_use_spot_instances=True,
        train_max_run=300,
        train_max_wait=600,
        output_path=output_path,
        debugger_hook_config=DebuggerHookConfig(
            s3_output_path=f's3://{bucket}/{prefix}/debug',
            collection_configs=[
                CollectionConfig(name='metrics', parameters={'save_interval': save_interval}),
                CollectionConfig(name='feature_importance', parameters={'save_interval': save_interval}),
                CollectionConfig(name='full_shap', parameters={'save_interval': save_interval}),
                CollectionConfig(name='average_shap', parameters={'save_interval': save_interval})
            ]),
        rules=[
            Rule.sagemaker(rule_configs.class_imbalance(),
                           rule_parameters={'collection_names': 'metrics'})
        ])
    model.set_hyperparameters(**hyperparameters)

    data_channel = {
        'train': s3_input(s3_train, content_type='text/csv'),
        'validation': s3_input(s3_validation, content_type='text/csv')
    }
    model.fit(data_channel, job_name=job_name)
    return model
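# A minimal sketch of inspecting the Debugger collections configured in create_model()
# after training completes, using the smdebug library; the S3 debug path below is a
# placeholder mirroring the f-string used above, not a value from the original code.
from smdebug.trials import create_trial

trial = create_trial('s3://example-bucket/example-prefix/debug')  # placeholder path
print(trial.tensor_names(collection='average_shap'))              # per-feature SHAP tensors
for name in trial.tensor_names(collection='metrics'):
    for step in trial.tensor(name).steps():
        print(name, step, trial.tensor(name).value(step))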
def test_inference_pipeline_model_deploy(sagemaker_session, cpu_instance_type):
    sparkml_data_path = os.path.join(DATA_DIR, "sparkml_model")
    xgboost_data_path = os.path.join(DATA_DIR, "xgboost_model")
    endpoint_name = "test-inference-pipeline-deploy-{}".format(sagemaker_timestamp())
    sparkml_model_data = sagemaker_session.upload_data(
        path=os.path.join(sparkml_data_path, "mleap_model.tar.gz"),
        key_prefix="integ-test-data/sparkml/model",
    )
    xgb_model_data = sagemaker_session.upload_data(
        path=os.path.join(xgboost_data_path, "xgb_model.tar.gz"),
        key_prefix="integ-test-data/xgboost/model",
    )

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        sparkml_model = SparkMLModel(
            model_data=sparkml_model_data,
            env={"SAGEMAKER_SPARKML_SCHEMA": SCHEMA},
            sagemaker_session=sagemaker_session,
        )
        xgb_image = get_image_uri(sagemaker_session.boto_region_name, "xgboost")
        xgb_model = Model(
            model_data=xgb_model_data, image=xgb_image, sagemaker_session=sagemaker_session
        )
        model = PipelineModel(
            models=[sparkml_model, xgb_model],
            role="SageMakerRole",
            sagemaker_session=sagemaker_session,
            name=endpoint_name,
        )
        model.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
        predictor = RealTimePredictor(
            endpoint=endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=json_serializer,
            content_type=CONTENT_TYPE_CSV,
            accept=CONTENT_TYPE_CSV,
        )

        with open(VALID_DATA_PATH, "r") as f:
            valid_data = f.read()
            assert predictor.predict(valid_data) == "0.714013934135"

        with open(INVALID_DATA_PATH, "r") as f:
            invalid_data = f.read()
            assert predictor.predict(invalid_data) is None

    model.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
    assert "Could not find model" in str(exception.value)
def inference_pipeline_ep(role, sess, spark_model_uri, region, bucket,
                          pipeline_model_name, endpoint_name, **context):
    timestamp_prefix = Variable.get("timestamp")
    # sm = boto3.client('sagemaker', region_name=region)
    s3client = boto3.client('s3', region_name=region)
    s3_sparkml_data_uri = spark_model_uri

    # Use S3 calls to list the model artifacts
    s3_xgb_objects = s3client.list_objects_v2(
        Bucket=bucket, StartAfter='sagemaker/spark-preprocess/model/xgboost/')
    obj_list = s3_xgb_objects['Contents']
    obj_list.sort(key=lambda x: x['LastModified'], reverse=False)
    xgboost_model_latest = obj_list[-1]['Key']
    s3_xgboost_model_uri = 's3://' + bucket + '/' + xgboost_model_latest

    # Airflow XCom alternative:
    # s3_xgboost_model_uri = context['task_instance'].xcom_pull(
    #     task_ids='xgboost_model_training')['Training']['ModelArtifacts']['S3ModelArtifacts']

    xgb_container = get_image_uri(sess.region_name, 'xgboost', repo_version='0.90-1')
    schema_json = schema_utils.abalone_schema()

    sparkml_model = SparkMLModel(
        model_data=s3_sparkml_data_uri,
        role=role,
        sagemaker_session=sagemaker.session.Session(sess),
        env={'SAGEMAKER_SPARKML_SCHEMA': schema_json})
    xgb_model = Model(
        model_data=s3_xgboost_model_uri,
        role=role,
        sagemaker_session=sagemaker.session.Session(sess),
        image=xgb_container)

    sm_model = PipelineModel(
        name=pipeline_model_name,
        role=role,
        sagemaker_session=sagemaker.session.Session(sess),
        models=[sparkml_model, xgb_model])

    sm_model.deploy(initial_instance_count=1,
                    instance_type='ml.c4.xlarge',
                    endpoint_name=endpoint_name)
def __init__(self, name, training_resource_config, region, repo_version):
    self.algo_name = name
    self.training_resource_config = training_resource_config
    self.region = region
    self.repo_version = repo_version
    if self.algo_name == "xgboost":
        self.algo_image_uri = default_framework_uri(
            framework=self.algo_name, region_name=region, image_tag=repo_version
        )
    else:
        self.algo_image_uri = get_image_uri(
            region_name=region, repo_name=self.algo_name, repo_version=repo_version
        )
def train_model(s3_model_output_location, s3_training_file_location):
    # create the roles needed to create endpoints and to use SageMaker
    role = sagemaker_role.create_role_sagemaker()
    # the role-creation call is asynchronous, so give it time to propagate
    sleep(20)

    # Build Model
    sess = sagemaker.Session()

    # Access the appropriate algorithm container image,
    # specify how many instances to use for distributed training and what type of machine to use,
    # and finally specify where the trained model artifacts need to be stored.
    # Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
    container_path = get_image_uri(boto3.Session().region_name, 'xgboost', repo_version='0.90-1')
    estimator = sagemaker.estimator.Estimator(
        container_path,
        role,
        train_instance_count=1,
        train_instance_type='ml.m5.large',
        output_path=s3_model_output_location,
        sagemaker_session=sess,
        base_job_name='xgboost-fall-v1')

    # Specify hyperparameters appropriate for the training algorithm.
    # XGBoost Training Parameter Reference:
    # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
    # max_depth=5, eta=0.1, subsample=0.7, num_round=150
    estimator.set_hyperparameters(max_depth=6,
                                  objective="reg:linear",
                                  eta=0.12,
                                  subsample=0.73,
                                  num_round=200)
    estimator.hyperparameters()

    # content type can be libsvm or csv for XGBoost
    training_input_config = sagemaker.session.s3_input(
        s3_data=s3_training_file_location, content_type="csv")

    estimator.fit({'train': training_input_config})
    return estimator
def test_inference_pipeline_model_deploy(sagemaker_session):
    sparkml_data_path = os.path.join(DATA_DIR, 'sparkml_model')
    xgboost_data_path = os.path.join(DATA_DIR, 'xgboost_model')
    endpoint_name = 'test-inference-pipeline-deploy-{}'.format(sagemaker_timestamp())
    sparkml_model_data = sagemaker_session.upload_data(
        path=os.path.join(sparkml_data_path, 'mleap_model.tar.gz'),
        key_prefix='integ-test-data/sparkml/model')
    xgb_model_data = sagemaker_session.upload_data(
        path=os.path.join(xgboost_data_path, 'xgb_model.tar.gz'),
        key_prefix='integ-test-data/xgboost/model')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        sparkml_model = SparkMLModel(model_data=sparkml_model_data,
                                     env={'SAGEMAKER_SPARKML_SCHEMA': SCHEMA},
                                     sagemaker_session=sagemaker_session)
        xgb_image = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')
        xgb_model = Model(model_data=xgb_model_data,
                          image=xgb_image,
                          sagemaker_session=sagemaker_session)
        model = PipelineModel(models=[sparkml_model, xgb_model],
                              role='SageMakerRole',
                              sagemaker_session=sagemaker_session,
                              name=endpoint_name)
        model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor = RealTimePredictor(endpoint=endpoint_name,
                                      sagemaker_session=sagemaker_session,
                                      serializer=json_serializer,
                                      content_type=CONTENT_TYPE_CSV,
                                      accept=CONTENT_TYPE_CSV)

        with open(VALID_DATA_PATH, 'r') as f:
            valid_data = f.read()
            assert predictor.predict(valid_data) == '0.714013934135'

        with open(INVALID_DATA_PATH, 'r') as f:
            invalid_data = f.read()
            assert predictor.predict(invalid_data) is None

    model.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
    assert 'Could not find model' in str(exception.value)
def estimator_knn(sagemaker_session, cpu_instance_type):
    knn_image = get_image_uri(
        sagemaker_session.boto_session.region_name, "knn", repo_version="1")

    estimator = Estimator(
        image_name=knn_image,
        role=EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
    )

    estimator.set_hyperparameters(
        k=10, sample_size=500, feature_dim=784, mini_batch_size=100, predictor_type="regressor"
    )
    return estimator
def create_knn():
    role = 'CS218WebApp'
    params = {
        'feature_dim': session['feature_size'],
        'predictor_type': 'classifier',
        'k': session['k'],
        'sample_size': session['sample_size']
    }
    estimator = sagemaker.estimator.Estimator(
        get_image_uri(boto3.Session().region_name, "knn"),
        role=role,
        train_instance_count=1,
        train_instance_type='ml.m5.2xlarge',
        sagemaker_session=sagemaker.Session(),
        hyperparameters=params)
    fit_input = {'train': session['train'], 'test': session['test']}
    estimator.fit(fit_input)
    return estimator
def estimator_fm(sagemaker_session, cpu_instance_type):
    fm_image = get_image_uri(
        sagemaker_session.boto_session.region_name, "factorization-machines", repo_version="1")

    estimator = Estimator(
        image_name=fm_image,
        role=EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
    )

    estimator.set_hyperparameters(
        num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type="regressor"
    )
    return estimator
def _to_estimator_conf(self, task):
    from sagemaker.amazon.amazon_estimator import get_image_uri

    return {
        "image_name": get_image_uri(task.region, task.estimator_config.algorithm),
        "role": task.sagemaker_role,
        "train_instance_count": task.estimator_config.train_instance_count,
        "train_instance_type": task.estimator_config.train_instance_type,
        "train_volume_size": task.estimator_config.train_volume_size,
        "output_path": str(task.output_path),
        "base_job_name": task.estimator_config.base_job_name,
        "hyperparameters": task.estimator_config.hyperparameters,
    }
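# A minimal sketch of how the dict built by _to_estimator_conf() might be consumed: each key
# maps onto a keyword argument of sagemaker.estimator.Estimator in the pre-2.0 SDK, so the
# dict can be unpacked directly. The method name _to_estimator is a hypothetical addition to
# the same class, not part of the original code.
from sagemaker.estimator import Estimator

def _to_estimator(self, task):
    return Estimator(**self._to_estimator_conf(task))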
def _create_model(self):
    self._model_name = "Kmeans-model-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime()))

    self._logger.info("Creating SageMaker KMeans model ... {}".format(self._model_name))

    primary_container = {
        'Image': get_image_uri(self._sagemaker_session.boto_region_name, 'kmeans'),
        'ModelDataUrl': self._model_s3_filepath
    }

    create_model_response = self._sagemaker_client.create_model(
        ModelName=self._model_name,
        ExecutionRoleArn=self._ml_engine.iam_role,
        PrimaryContainer=primary_container)

    model_arn = create_model_response['ModelArn']
    self._logger.info("Model created successfully! name: {}, arn: {}".format(
        self._model_name, model_arn))
def submit_training_job(path_to_train_data, bucket, formatted_data):
    output_prefix = 'train_output'
    role = 'arn:aws:iam::450246219423:role/service-role/AmazonSageMaker-ExecutionRole-20200426T181822'
    train_data_path = path_to_train_data
    # path_to_test_data = f's3://ml-web-app/test/test.protobuf'
    # job_name = 'iris-train'
    output_path = 's3://{}/{}/factorization_machine_output'.format(bucket, output_prefix)
    container = get_image_uri(boto3.Session(region_name='us-west-1').region_name,
                              'factorization-machines')

    estimator = sagemaker.estimator.Estimator(container,
                                              role,
                                              train_instance_count=1,
                                              train_instance_type='ml.c4.xlarge',
                                              output_path=output_path,
                                              sagemaker_session=sagemaker.Session())
    estimator.set_hyperparameters(feature_dim=formatted_data.shape[1],
                                  predictor_type='regressor',
                                  num_factors=64)

    # run training job
    estimator.fit({'train': train_data_path})
def train_model_deploy(args):
    backup_bucket = args.s3_backup_bucket
    sagemaker_bucket = args.s3_sagemaker_bucket
    role = args.role_arn
    sm_prefix = 'demo-breast-cancer-prediction'

    # Get Docker image for linear-learner
    container = get_image_uri(boto3.Session().region_name, 'linear-learner')

    # Find the latest item from the backup bucket
    objs = s3.list_objects(Bucket=backup_bucket)
    key_time = [(item['Key'], item['LastModified']) for item in objs['Contents']]
    key_time = sorted(key_time, key=lambda tup: tup[1], reverse=True)
    s3_file_key = key_time[0][0]

    print('Variables initialized as:')
    print(f'Backup Bucket {backup_bucket}')
    print(f'Backup File Key {s3_file_key}')
    print(f'Role ARN {role}')
    print(f'Sagemaker Bucket {sagemaker_bucket}')
    print(f'Sagemaker Prefix {sm_prefix}')
    print(f'Container {container}')

    try:
        data = load_backup_data(backup_bucket, s3_file_key)
        train_X, train_y, val_X, val_y, test_X, test_y = split_data(data)
        save_train_val_to_s3(sagemaker_bucket, sm_prefix, train_X, train_y, val_X, val_y)
        linear_job = create_training_job(container, sagemaker_bucket, sm_prefix, role)
        model_name = linear_job
        create_model(container, role, linear_job, model_name)
        linear_endpoint = create_or_update_endpoint(model_name)
        test_endpoint(linear_endpoint, test_X, test_y, train_X, train_y)
        print('Success')
    except Exception as e:
        print(e)
        sys.exit()