def from_string(
    cls,
    constraint_violations_file_string,
    kms_key=None,
    file_name=None,
    sagemaker_session=None,
):
    """Generates a ConstraintViolations object from a JSON string.

    Args:
        constraint_violations_file_string (str): The contents of the constraint violations
            JSON file, as a string.
        kms_key (str): The kms key to be used to encrypt the file in S3.
        file_name (str): The file name to use when uploading to S3.
        sagemaker_session (sagemaker.session.Session): A SageMaker Session
            object, used for SageMaker interactions (default: None). If not
            specified, one is created using the default AWS configuration
            chain.

    Returns:
        sagemaker.model_monitor.ConstraintViolations: The instance of ConstraintViolations
            generated from the string.

    """
    sagemaker_session = sagemaker_session or Session()
    file_name = file_name or "constraint_violations.json"
    desired_s3_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "monitoring", str(uuid.uuid4()), file_name
    )
    s3_uri = S3Uploader.upload_string_as_file_body(
        body=constraint_violations_file_string,
        desired_s3_uri=desired_s3_uri,
        kms_key=kms_key,
        session=sagemaker_session,
    )

    return ConstraintViolations.from_s3_uri(
        constraint_violations_file_s3_uri=s3_uri,
        kms_key=kms_key,
        sagemaker_session=sagemaker_session,
    )
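# Minimal usage sketch for the classmethod above (not part of the SDK source).
# It assumes valid AWS credentials and a default SageMaker bucket; the empty
# violations body is illustrative only.
import json

from sagemaker.model_monitor import ConstraintViolations

violations = ConstraintViolations.from_string(json.dumps({"violations": []}))
print(violations.file_s3_uri)  # s3://<default-bucket>/monitoring/<uuid>/constraint_violations.json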
def save(self, new_save_location_s3_uri=None):
    """Save the current instance's body to s3 using the instance's s3 path.

    The S3 path can be overridden by providing one. This also overrides the
    default save location for this object.

    Args:
        new_save_location_s3_uri (str): Optional. The S3 path to save the file to. If not
            provided, the file is saved in place in S3. If provided, the file's S3 path is
            permanently updated.

    Returns:
        str: The s3 location to which the file was saved.

    """
    if new_save_location_s3_uri is not None:
        self.file_s3_uri = new_save_location_s3_uri

    return S3Uploader.upload_string_as_file_body(
        body=json.dumps(self.body_dict),
        desired_s3_uri=self.file_s3_uri,
        kms_key=self.kms_key,
        sagemaker_session=self.session,
    )
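# Hedged companion example for save(): load an existing constraint-violations
# file from S3 and re-save it to a new location. Bucket and key names below
# are placeholders.
from sagemaker.model_monitor import ConstraintViolations

violations = ConstraintViolations.from_s3_uri(
    constraint_violations_file_s3_uri="s3://my-bucket/monitoring/constraint_violations.json"
)
new_uri = violations.save(
    new_save_location_s3_uri="s3://my-bucket/monitoring/archive/constraint_violations.json"
)
print(new_uri)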
def test_statistics_object_creation_from_s3_uri_without_customizations(sagemaker_session):
    with open(os.path.join(tests.integ.DATA_DIR, "monitor/statistics.json"), "r") as f:
        file_body = f.read()

    file_name = "statistics.json"
    desired_s3_uri = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        "integ-test-test-monitoring-files",
        str(uuid.uuid4()),
        file_name,
    )

    s3_uri = S3Uploader.upload_string_as_file_body(body=file_body, desired_s3_uri=desired_s3_uri)

    statistics = Statistics.from_s3_uri(statistics_file_s3_uri=s3_uri)

    assert statistics.file_s3_uri.startswith("s3://")
    assert statistics.file_s3_uri.endswith("statistics.json")

    assert statistics.body_dict["dataset"]["item_count"] == 418
def test_one_step_sparkjar_processing_pipeline(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
    configuration,
    build_jar,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")
    spark_path = os.path.join(DATA_DIR, "spark")

    spark_jar_processor = SparkJarProcessor(
        role=role,
        instance_count=2,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version="2.4",
    )
    bucket = spark_jar_processor.sagemaker_session.default_bucket()
    with open(os.path.join(spark_path, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session,
        )
    output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}"

    java_project_dir = os.path.join(spark_path, "code", "java", "hello-java-spark")
    spark_run_args = spark_jar_processor.get_run_args(
        submit_app=f"{java_project_dir}/hello-spark-java.jar",
        submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp",
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
    )

    step_pyspark = ProcessingStep(
        name="sparkjar-process",
        processor=spark_jar_processor,
        inputs=spark_run_args.inputs,
        outputs=spark_run_args.outputs,
        job_arguments=spark_run_args.arguments,
        code=spark_run_args.code,
        cache_config=cache_config,
    )
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count],
        steps=[step_pyspark],
        sagemaker_session=sagemaker_session,
    )

    try:
        # NOTE: We should exercise the case when the role used in the pipeline execution is
        # different than that required of the steps in the pipeline itself. The role in
        # the pipeline definition needs to create training and processing jobs and other
        # sagemaker entities. However, the jobs created in the steps themselves execute
        # under a potentially different role, often requiring access to S3 and other
        # artifacts not required during creation of the jobs in the pipeline steps.
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )

        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
        response = pipeline.update(role)
        update_arn = response["PipelineArn"]
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            update_arn,
        )

        execution = pipeline.start(parameters={})
        assert re.match(
            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        # Check CacheConfig
        response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"]
        assert response["Enabled"] == cache_config.enable_caching
        assert response["ExpireAfter"] == cache_config.expire_after

        try:
            execution.wait(delay=30, max_attempts=3)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()
        assert len(execution_steps) == 1
        assert execution_steps[0]["StepName"] == "sparkjar-process"
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def run(
    self,
    entry_point: str,
    source_dir: Optional[str],
    dependencies: Optional[List[str]] = None,
    git_config: Optional[Dict[str, str]] = None,
    inputs: Optional[List[ProcessingInput]] = None,
    outputs: Optional[List[ProcessingOutput]] = None,
    arguments: Optional[List[str]] = None,
    wait: bool = True,
    logs: bool = True,
    job_name: Optional[str] = None,
    experiment_config: Optional[Dict[str, str]] = None,
    kms_key: Optional[str] = None,
):
    """Runs a processing job.

    Args:
        entry_point (str): Path (absolute or relative) to the local Python source file
            which should be executed as the entry point to training. If ``source_dir``
            is specified, then ``entry_point`` must point to a file located at the root
            of ``source_dir``.
        source_dir (str): Path (absolute, relative or an S3 URI) to a directory
            with any other training source code dependencies aside from the entry
            point file (default: None). If ``source_dir`` is an S3 URI, it must
            point to a tar.gz file. Structure within this directory is preserved
            when training on Amazon SageMaker.
        dependencies (list[str]): A list of paths to directories (absolute
            or relative) with any additional libraries that will be exported
            to the container (default: []). The library folders will be
            copied to SageMaker in the same folder where the entrypoint is
            copied. If 'git_config' is provided, 'dependencies' should be a
            list of relative locations to directories with any additional
            libraries needed in the Git repo.
        git_config (dict[str, str]): Git configurations used for cloning
            files, including ``repo``, ``branch``, ``commit``,
            ``2FA_enabled``, ``username``, ``password`` and ``token``. The
            ``repo`` field is required. All other fields are optional.
            ``repo`` specifies the Git repository where your training script
            is stored. If you don't provide ``branch``, the default value
            'master' is used. If you don't provide ``commit``, the latest
            commit in the specified branch is used.

            .. admonition:: Example

                The following config:

                >>> git_config = {'repo': 'https://github.com/aws/sagemaker-python-sdk.git',
                >>>               'branch': 'test-branch-git-config',
                >>>               'commit': '329bfcf884482002c05ff7f44f62599ebc9f445a'}

                results in cloning the repo specified in 'repo', then checking out the
                'test-branch-git-config' branch, and checking out the specified commit.

            ``2FA_enabled``, ``username``, ``password`` and ``token`` are used for
            authentication. For GitHub (or other Git) accounts, set ``2FA_enabled`` to
            'True' if two-factor authentication is enabled for the account, otherwise
            set it to 'False'. If you do not provide a value for ``2FA_enabled``, a
            default value of 'False' is used. CodeCommit does not support two-factor
            authentication, so do not provide "2FA_enabled" with CodeCommit
            repositories.

            For GitHub and other Git repos, when SSH URLs are provided, it doesn't
            matter whether 2FA is enabled or disabled; you should either have no
            passphrase for the SSH key pairs, or have the ssh-agent configured so that
            you will not be prompted for the SSH passphrase when you run the
            'git clone' command with SSH URLs. When HTTPS URLs are provided: if 2FA is
            disabled, then either token or username+password will be used for
            authentication if provided (token prioritized); if 2FA is enabled, only
            token will be used for authentication if provided. If required
            authentication info is not provided, the python SDK will try to use local
            credentials storage to authenticate. If that also fails, an error message
            will be thrown.

            For CodeCommit repos, 2FA is not supported, so '2FA_enabled' should not be
            provided. There is no token in CodeCommit, so 'token' should not be
            provided either. When 'repo' is an SSH URL, the requirements are the same
            as GitHub-like repos. When 'repo' is an HTTPS URL, username+password will
            be used for authentication if they are provided; otherwise, the python SDK
            will try to use either the CodeCommit credential helper or local credential
            storage for authentication.
        inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
            the processing job. These must be provided as
            :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
        outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
            the processing job. These can be specified as either path strings or
            :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
        arguments (list[str]): A list of string arguments to be passed to a
            processing job (default: None).
        wait (bool): Whether the call should wait until the job completes (default: True).
        logs (bool): Whether to show the logs produced by the job.
            Only meaningful when wait is True (default: True).
        job_name (str): Processing job name. If not specified, the processor generates
            a default job name, based on the base job name and current timestamp.
        experiment_config (dict[str, str]): Experiment management configuration.
            Dictionary contains three optional keys:
            'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
        kms_key (str): The ARN of the KMS key that is used to encrypt the
            user code file (default: None).
    """
    if job_name is None:
        job_name = self._generate_current_job_name()

    estimator = self._upload_payload(entry_point, source_dir, dependencies, git_config, job_name)
    inputs = self._patch_inputs_with_payload(
        inputs, estimator._hyperparameters["sagemaker_submit_directory"]
    )

    # Upload the bootstrapping code as s3://.../jobname/source/runproc.sh.
    s3_runproc_sh = S3Uploader.upload_string_as_file_body(
        self.runproc_sh.format(entry_point=entry_point),
        desired_s3_uri=f"{self.s3_prefix}/{job_name}/source/runproc.sh",
        sagemaker_session=self.sagemaker_session,
    )
    self.logger.info("runproc.sh uploaded to %s", s3_runproc_sh)

    # Submit a processing job.
    super().run(
        code=s3_runproc_sh,
        inputs=inputs,
        outputs=outputs,
        arguments=arguments,
        wait=wait,
        logs=logs,
        job_name=job_name,
        experiment_config=experiment_config,
        kms_key=kms_key,
    )
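# Illustrative values for two of the dict-typed arguments documented above;
# the repo URL is real, but the token and experiment/trial names are
# placeholder assumptions, not values the SDK requires.
git_config = {
    "repo": "https://github.com/aws/sagemaker-python-sdk.git",
    "branch": "master",
    "token": "<personal-access-token>",  # HTTPS auth; prioritized over username+password
}
experiment_config = {
    "ExperimentName": "my-experiment",
    "TrialName": "my-trial",
    "TrialComponentDisplayName": "preprocessing",
}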
def test_sagemaker_pyspark_multinode(
    tag, role, image_uri, configuration, sagemaker_session, region, sagemaker_client
):
    """Test that basic multinode case works on 32KB of data"""
    spark = PySparkProcessor(
        base_job_name="sm-spark-py",
        framework_version=tag,
        image_uri=image_uri,
        role=role,
        instance_count=2,
        instance_type="ml.c5.xlarge",
        max_runtime_in_seconds=1200,
        sagemaker_session=sagemaker_session,
    )
    bucket = spark.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = "s3://{}/spark/output/sales/{}".format(bucket, timestamp)
    spark_event_logs_key_prefix = "spark/spark-events/{}".format(timestamp)
    spark_event_logs_s3_uri = "s3://{}/{}".format(bucket, spark_event_logs_key_prefix)

    with open("test/resources/data/files/data.jsonl") as data:
        body = data.read()
        input_data_uri = "s3://{}/spark/input/data.jsonl".format(bucket)
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session
        )

    spark.run(
        submit_app="test/resources/code/python/hello_py_spark/hello_py_spark_app.py",
        submit_py_files=["test/resources/code/python/hello_py_spark/hello_py_spark_udfs.py"],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark.latest_job

    s3_client = boto3.client("s3", region_name=region)

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900
    while not processing_job_not_fail_or_complete(sagemaker_client, processing_job.job_name):
        response = s3_client.list_objects(Bucket=bucket, Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # Somehow the first file size returned by list_objects is always 0;
            # this loop skips it.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    print("\n##### Latest file size is " + str(event_log_file["Size"]))
                    latest_file_size = event_log_file["Size"]

        # update the file size if it increased
        if latest_file_size and latest_file_size > file_size:
            print("\n##### S3 file updated.")
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # verify that spark event logs are periodically written to s3
    print("\n##### file_size {} updated_times_count {}".format(file_size, updated_times_count))
    assert file_size != 0

    # Commenting this assert because it's flaky.
    # assert updated_times_count > 1

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_directory(
    sagemaker_session, s3_files_kms_key
):
    my_uuid = str(uuid.uuid4())
    my_inner_directory_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        "integ-test-test-s3-list",
        my_uuid,
        my_inner_directory_uuid,
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        session=sagemaker_session,
    )

    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, session=sagemaker_session)

    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(s3_uri=s3_uris[0], session=sagemaker_session)
    assert file_2_body == S3Downloader.read_file(s3_uri=s3_uris[1], session=sagemaker_session)

    s3_directory_with_directory_underneath = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list", my_uuid
    )

    S3Downloader.download(
        s3_uri=s3_directory_with_directory_underneath,
        local_path=TMP_BASE_PATH,
        session=sagemaker_session,
    )

    with open(os.path.join(TMP_BASE_PATH, my_inner_directory_uuid, file_1_name), "r") as f:
        assert file_1_body == f.read()

    with open(os.path.join(TMP_BASE_PATH, my_inner_directory_uuid, file_2_name), "r") as f:
        assert file_2_body == f.read()
def test_sagemaker_pyspark_multinode(spark_py_processor, sagemaker_session, configuration):
    """Test that basic multinode case works on 32KB of data"""
    bucket = spark_py_processor.sagemaker_session.default_bucket()
    timestamp = datetime.now().isoformat()
    output_data_uri = f"s3://{bucket}/spark/output/sales/{timestamp}"
    spark_event_logs_key_prefix = f"spark/spark-events/{timestamp}"
    spark_event_logs_s3_uri = f"s3://{bucket}/{spark_event_logs_key_prefix}"

    with open(os.path.join(SPARK_PATH, "files", "data.jsonl")) as data:
        body = data.read()
        input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session
        )

    spark_py_processor.run(
        submit_app=os.path.join(
            SPARK_PATH, "code", "python", "hello_py_spark", "hello_py_spark_app.py"
        ),
        submit_py_files=[
            os.path.join(SPARK_PATH, "code", "python", "hello_py_spark", "hello_py_spark_udfs.py")
        ],
        arguments=["--input", input_data_uri, "--output", output_data_uri],
        configuration=configuration,
        spark_event_logs_s3_uri=spark_event_logs_s3_uri,
        wait=False,
    )
    processing_job = spark_py_processor.latest_job

    s3_client = boto3.client(
        "s3", region_name=spark_py_processor.sagemaker_session.boto_region_name
    )

    file_size = 0
    latest_file_size = None
    updated_times_count = 0
    time_out = time.time() + 900
    while not processing_job_not_fail_or_complete(
        sagemaker_session.sagemaker_client, processing_job.job_name
    ):
        response = s3_client.list_objects(Bucket=bucket, Prefix=spark_event_logs_key_prefix)
        if "Contents" in response:
            # Somehow the first file size returned by list_objects is always 0;
            # this loop skips it.
            for event_log_file in response["Contents"]:
                if event_log_file["Size"] != 0:
                    latest_file_size = event_log_file["Size"]

        # update the file size if it increased
        if latest_file_size and latest_file_size > file_size:
            updated_times_count += 1
            file_size = latest_file_size

        if time.time() > time_out:
            raise RuntimeError("Timeout")

        time.sleep(20)

    # verify that spark event logs are periodically written to s3
    assert file_size != 0

    output_contents = S3Downloader.list(output_data_uri, sagemaker_session=sagemaker_session)
    assert len(output_contents) != 0
def main(deploy_data: dict, train_data: dict, capture_prefix: str):
    inference_id_prefix = 'sts_'  # the same prefix used in testendpoint.py

    # Load config from environment and set required defaults
    # AWS specific
    AWS_DEFAULT_REGION = os.getenv('AWS_DEFAULT_REGION', 'eu-west-1')
    AWS_PROFILE = os.getenv('AWS_PROFILE', 'default')
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID', None)
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', None)
    b3_session, sm_client, sm_runtime, sm_session = get_sm_session(
        region=AWS_DEFAULT_REGION,
        profile_name=AWS_PROFILE,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # read test data
    test_data = load_dataset(
        train_data['train']['test'], 'test.csv', sagemaker_session=sm_session)
    print(f"Loading {train_data['train']['test']}")
    Y_val = test_data.iloc[:, 0].to_numpy()
    print(f"Test dataset shape: {Y_val.shape}")

    # list capture files; this is just an example. Not used right
    # now but could be.
    capture_files = sorted(
        S3Downloader.list(
            "{}/{}".format(
                deploy_data['monitor']['s3_capture_upload_path'],
                deploy_data['endpoint']['name']),
            sagemaker_session=sm_session))
    # just the files with the prefix
    filtered = list(
        filter(lambda file_name: capture_prefix in file_name, capture_files))
    print(f"Detected {len(filtered)} capture files")

    capture_records = []
    for c_file in filtered:
        print(f"Processing: {c_file}")
        # read the capture data directly from S3
        content = S3Downloader.read_file(c_file, sagemaker_session=sm_session)
        records = [json.loads(l) for l in content.split("\n")[:-1]]
        capture_records.extend(records)

    print(f"No. of records {len(capture_records)} captured")
    captured_predictions = {}

    for obj in capture_records:
        # Extract inference ID
        inference_id = obj["eventMetadata"]["inferenceId"]
        # the current version of the script starts at 1 when id=0;
        # remove the prefix and get the id
        req_id = int(inference_id[len(inference_id_prefix):])

        # Extract the result given by the model
        Y_pred_value = encoders.decode(
            obj["captureData"]["endpointOutput"]["data"],
            # I have fixed this value here because
            # obj["captureData"]["endpointOutput"]["observedContentType"]
            # sometimes includes the encoding, like: text/csv; utf-8,
            # and encoders.decode() will raise an error.
            content_types.CSV)
        captured_predictions[req_id] = Y_pred_value  # np.array

    # save and upload the ground truth labels
    print("Generating labels")
    fake_records = []
    for i, label in captured_predictions.items():
        val = ground_truth_with_id(i, label, Y_val, inference_id_prefix)
        fake_records.append(json.dumps(val))

    data_to_upload = "\n".join(fake_records)
    target_s3_uri = "{}/{}/{}.jsonl".format(
        deploy_data['monitor']['ground truth uri'],
        capture_prefix,
        uuid.uuid4().hex)
    print(f"Uploading ground truth to {target_s3_uri} ...", end="")
    S3Uploader.upload_string_as_file_body(
        data_to_upload, target_s3_uri, sagemaker_session=sm_session)
    print("Done!")
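# ground_truth_with_id() is defined elsewhere in this example repository and is
# not shown above. Records uploaded for Model Monitor ground-truth merging
# generally follow the shape below; the concrete values are illustrative
# assumptions.
example_ground_truth_record = {
    "groundTruthData": {"data": "1", "encoding": "CSV"},  # the true label
    "eventMetadata": {"eventId": "sts_42"},  # matches the request's inferenceId
    "eventVersion": "0",
}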
def test_one_step_ingestion_pipeline(
    sagemaker_session, feature_store_session, feature_definitions, role, pipeline_name
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")

    input_name = "features.csv"
    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
    input_data_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "py-sdk-ingestion-test-input/features.csv"
    )

    with open(input_file_path, "r") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session
        )

    inputs = [
        ProcessingInput(
            input_name=input_name,
            source=input_data_uri,
            destination="/opt/ml/processing/features.csv",
        )
    ]

    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=feature_store_session,
    )

    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
        input_name,
        input_data_uri,
        s3_content_type="csv",
        s3_has_header=True,
    )

    outputs = [
        ProcessingOutput(
            output_name=output_name,
            app_managed=True,
            feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),
        )
    ]

    temp_flow_path = "./ingestion.flow"
    with cleanup_feature_group(feature_group):
        json.dump(ingestion_only_flow, open(temp_flow_path, "w"))

        data_wrangler_processor = DataWranglerProcessor(
            role=role,
            data_wrangler_flow_source=temp_flow_path,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            max_runtime_in_seconds=86400,
        )

        data_wrangler_step = ProcessingStep(
            name="ingestion-step",
            processor=data_wrangler_processor,
            inputs=inputs,
            outputs=outputs,
        )

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_count, instance_type],
            steps=[data_wrangler_step],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            offline_store_s3_uri = os.path.join(
                "s3://", sagemaker_session.default_bucket(), feature_group_name
            )
            feature_group.create(
                s3_uri=offline_store_s3_uri,
                record_identifier_name="f11",
                event_time_feature_name="f10",
                role_arn=role,
                enable_online_store=False,
            )
            _wait_for_feature_group_create(feature_group)

            execution = pipeline.start()
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=60, max_attempts=10)
            except WaiterError:
                pass

            execution_steps = execution.list_steps()
            assert len(execution_steps) == 1
            assert execution_steps[0]["StepName"] == "ingestion-step"
            assert execution_steps[0]["StepStatus"] == "Succeeded"

            athena_query = feature_group.athena_query()
            with timeout(minutes=10):
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution"
                ).get("Status").get("State")

                df = athena_query.as_dataframe()
                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
        finally:
            try:
                pipeline.delete()
            except Exception as e:
                print(f"Delete pipeline failed with error: {e}")
            os.remove(temp_flow_path)
def test_model_registration_with_drift_check_baselines(
    sagemaker_session,
    role,
    pipeline_name,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    # upload model data to s3
    model_local_path = os.path.join(DATA_DIR, "mxnet_mnist/model.tar.gz")
    model_base_uri = "s3://{}/{}/input/model/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("model"),
    )
    model_uri = S3Uploader.upload(
        model_local_path, model_base_uri, sagemaker_session=sagemaker_session
    )
    model_uri_param = ParameterString(name="model_uri", default_value=model_uri)

    # upload metrics to s3
    metrics_data = (
        '{"regression_metrics": {"mse": {"value": 4.925353410353891, '
        '"standard_deviation": 2.219186917819692}}}'
    )
    metrics_base_uri = "s3://{}/{}/input/metrics/{}".format(
        sagemaker_session.default_bucket(),
        "register_model_test_with_drift_baseline",
        utils.unique_name_from_base("metrics"),
    )
    metrics_uri = S3Uploader.upload_string_as_file_body(
        body=metrics_data,
        desired_s3_uri=metrics_base_uri,
        sagemaker_session=sagemaker_session,
    )
    metrics_uri_param = ParameterString(name="metrics_uri", default_value=metrics_uri)

    model_metrics = ModelMetrics(
        bias=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    drift_check_baselines = DriftCheckBaselines(
        model_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_statistics=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        model_data_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_pre_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        bias_post_training_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_constraints=MetricsSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
        explainability_config_file=FileSource(
            s3_uri=metrics_uri_param,
            content_type="application/json",
        ),
    )
    customer_metadata_properties = {"key1": "value1"}
    estimator = XGBoost(
        entry_point="training.py",
        source_dir=os.path.join(DATA_DIR, "sip"),
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.90-2",
        sagemaker_session=sagemaker_session,
        py_version="py3",
        role=role,
    )
    step_register = RegisterModel(
        name="MyRegisterModelStep",
        estimator=estimator,
        model_data=model_uri_param,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name="testModelPackageGroup",
        model_metrics=model_metrics,
        drift_check_baselines=drift_check_baselines,
        customer_metadata_properties=customer_metadata_properties,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            model_uri_param,
            metrics_uri_param,
            instance_type,
            instance_count,
        ],
        steps=[step_register],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        for _ in retries(
            max_retry_count=5,
            exception_message_prefix="Waiting for a successful execution of pipeline",
            seconds_to_sleep=10,
        ):
            execution = pipeline.start(
                parameters={"model_uri": model_uri, "metrics_uri": metrics_uri}
            )
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            failure_reason = execution_steps[0].get("FailureReason", "")
            if failure_reason != "":
                logging.error(
                    f"Pipeline execution failed with error: {failure_reason}."
                    " Retrying.."
                )
                continue
            assert execution_steps[0]["StepStatus"] == "Succeeded"
            assert execution_steps[0]["StepName"] == "MyRegisterModelStep"

            response = sagemaker_session.sagemaker_client.describe_model_package(
                ModelPackageName=execution_steps[0]["Metadata"]["RegisterModel"]["Arn"]
            )

            assert (
                response["ModelMetrics"]["Explainability"]["Report"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["Bias"]["PreTrainingConstraints"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["Explainability"]["Constraints"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["ModelQuality"]["Statistics"]["ContentType"]
                == "application/json"
            )
            assert (
                response["DriftCheckBaselines"]["ModelDataQuality"]["Statistics"]["ContentType"]
                == "application/json"
            )
            assert response["CustomerMetadataProperties"] == customer_metadata_properties
            break
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
def test_end_to_end_pipeline_successful_execution(
    sagemaker_session, region_name, role, pipeline_name, wait=False
):
    model_package_group_name = f"{pipeline_name}ModelPackageGroup"
    data_path = os.path.join(DATA_DIR, "workflow")
    default_bucket = sagemaker_session.default_bucket()

    # download the input data
    local_input_path = os.path.join(data_path, "abalone-dataset.csv")
    s3 = sagemaker_session.boto_session.resource("s3")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset.csv", local_input_path
    )

    # upload the input data to our bucket
    base_uri = f"s3://{default_bucket}/{pipeline_name}"
    with open(local_input_path) as data:
        body = data.read()
        input_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset.csv",
            sagemaker_session=sagemaker_session,
        )

    # download batch transform data
    local_batch_path = os.path.join(data_path, "abalone-dataset-batch")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset-batch", local_batch_path
    )

    # upload the batch transform data
    with open(local_batch_path) as data:
        body = data.read()
        batch_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset-batch",
            sagemaker_session=sagemaker_session,
        )

    # define parameters
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
    input_data = ParameterString(
        name="InputData",
        default_value=input_data_uri,
    )
    batch_data = ParameterString(
        name="BatchData",
        default_value=batch_data_uri,
    )

    # define processing step
    framework_version = "0.23-1"
    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{pipeline_name}-process",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    step_process = ProcessingStep(
        name="AbaloneProcess",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(data_path, "abalone/preprocessing.py"),
    )

    # define training step
    model_path = f"s3://{default_bucket}/{pipeline_name}Train"
    image_uri = image_uris.retrieve(
        framework="xgboost",
        region=region_name,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        sagemaker_session=sagemaker_session,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="AbaloneTrain",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # define evaluation step
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{pipeline_name}-eval",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )
    step_eval = ProcessingStep(
        name="AbaloneEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(data_path, "abalone/evaluation.py"),
        property_files=[evaluation_report],
    )

    # define create model step
    model = Model(
        image_uri=image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_create_model = CreateModelStep(
        name="AbaloneCreateModel",
        model=model,
        inputs=inputs,
    )

    # define transform step
    transformer = Transformer(
        model_name=step_create_model.properties.ModelName,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{default_bucket}/{pipeline_name}Transform",
        sagemaker_session=sagemaker_session,
    )
    step_transform = TransformStep(
        name="AbaloneTransform",
        transformer=transformer,
        inputs=TransformInput(data=batch_data),
    )

    # define register model step
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # define condition step
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step_name=step_eval.name,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value",
        ),
        right=20.0,
    )
    step_cond = ConditionStep(
        name="AbaloneMSECond",
        conditions=[cond_lte],
        if_steps=[step_register, step_create_model, step_transform],
        else_steps=[],
    )

    # define pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
            batch_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )

    pipeline.create(role)
    execution = pipeline.start()
    execution_arn = execution.arn

    if wait:
        execution.wait()

    return execution_arn