def create_baseline():
    """Kick off a baselining job on the module-level monitor.

    Reads the module globals ``baseline_data_uri``, ``baseline_results_uri``
    and ``my_default_monitor``; blocks until the job completes (wait=True).
    """
    print('Baseline data uri:', baseline_data_uri)
    print('Baseline results uri:', baseline_results_uri)
    # Headerless CSV input; results land under baseline_results_uri.
    my_default_monitor.suggest_baseline(
        baseline_dataset=baseline_data_uri,
        output_s3_uri=baseline_results_uri,
        dataset_format=DatasetFormat.csv(header=False),
        wait=True,
    )
def test_default_model_monitor_suggest_baseline(sagemaker_session):
    """A DefaultModelMonitor keeps its constructor settings after suggest_baseline()."""
    monitor = DefaultModelMonitor(
        role=ROLE,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        volume_size_in_gb=VOLUME_SIZE_IN_GB,
        volume_kms_key=VOLUME_KMS_KEY,
        output_kms_key=OUTPUT_KMS_KEY,
        max_runtime_in_seconds=MAX_RUNTIME_IN_SECONDS,
        base_job_name=BASE_JOB_NAME,
        sagemaker_session=sagemaker_session,
        env=ENVIRONMENT,
        tags=TAGS,
        network_config=NETWORK_CONFIG,
    )

    monitor.suggest_baseline(
        baseline_dataset=BASELINE_DATASET_PATH,
        dataset_format=DatasetFormat.csv(header=False),
        record_preprocessor_script=PREPROCESSOR_PATH,
        post_analytics_processor_script=POSTPROCESSOR_PATH,
        output_s3_uri=OUTPUT_S3_URI,
        wait=False,
        logs=False,
    )

    # Constructor arguments must be retained unchanged on the instance.
    expected = [
        ("role", ROLE),
        ("instance_count", INSTANCE_COUNT),
        ("instance_type", INSTANCE_TYPE),
        ("volume_size_in_gb", VOLUME_SIZE_IN_GB),
        ("volume_kms_key", VOLUME_KMS_KEY),
        ("output_kms_key", OUTPUT_KMS_KEY),
        ("max_runtime_in_seconds", MAX_RUNTIME_IN_SECONDS),
        ("base_job_name", BASE_JOB_NAME),
        ("sagemaker_session", sagemaker_session),
        ("tags", TAGS),
        ("network_config", NETWORK_CONFIG),
        ("image_uri", DEFAULT_IMAGE_URI),
    ]
    for attr, value in expected:
        assert getattr(monitor, attr) == value

    # The baselining job name derives from (but is not equal to) the base name.
    assert BASE_JOB_NAME in monitor.latest_baselining_job_name
    assert monitor.latest_baselining_job_name != BASE_JOB_NAME
    assert monitor.env[ENV_KEY_1] == ENV_VALUE_1
def main(resources, train_data):
    """Create a model-quality baseline and an hourly monitoring schedule.

    Builds a SageMaker session from environment configuration, derives the
    S3 prefixes for baselining data/results, runs a ``ModelQualityMonitor``
    baselining job on the validation dataset, then creates (and waits for)
    an hourly monitoring schedule on the deployed endpoint.

    Args:
        resources (dict): deployment outputs; must contain
            ``endpoint.name`` and a ``monitor`` section with
            ``s3_capture_upload_path`` (data capture must be enabled).
        train_data (dict): training outputs; ``baseline.validate`` is the
            S3 prefix holding ``baseline.csv``.

    Raises:
        ValueError: if monitoring/data capture is not enabled in *resources*.

    Side effects:
        Mutates *resources* in place (aliased as ``outputs``) and writes the
        accumulated outputs to ``deploymodel_out.json``.
    """
    # configuration (fixed typo: was "configurarion")
    AWS_DEFAULT_REGION = os.getenv('AWS_DEFAULT_REGION', 'eu-west-1')
    AWS_PROFILE = os.getenv('AWS_PROFILE', 'default')
    AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID', None)
    AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY', None)
    b3_session, sm_client, sm_runtime, sm_session = get_sm_session(
        region=AWS_DEFAULT_REGION,
        profile_name=AWS_PROFILE,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
    BASE_JOB_PREFIX = os.getenv('BASE_JOB_PREFIX', 'sts')
    ROLE_ARN = os.getenv('AWS_ROLE', sagemaker.get_execution_role())

    # NOTE: outputs aliases resources — updates below mutate the caller's dict.
    outputs = resources
    bucket = sm_session.default_bucket()
    prefix = "{}/{}".format(BASE_JOB_PREFIX, resources['endpoint']['name'])

    # Monitoring requires that the endpoint was deployed with data capture.
    if 'monitor' not in resources:
        raise ValueError("Monitoring not enabled")
    if 's3_capture_upload_path' not in resources['monitor']:
        raise ValueError("Monitoring not enabled")

    # S3 layout for the baselining job.
    baseline_prefix = prefix + "/baselining"
    baseline_data_prefix = baseline_prefix + "/data"
    baseline_results_prefix = baseline_prefix + "/results"
    baseline_data_uri = "s3://{}/{}".format(bucket, baseline_data_prefix)
    baseline_results_uri = "s3://{}/{}".format(bucket, baseline_results_prefix)
    outputs['monitor'].update({
        'baseline': {
            'data_uri': baseline_data_uri,
            'results_uri': baseline_results_uri
        }
    })
    _l.info("Baseline data uri: {}".format(baseline_data_uri))
    _l.info("Baseline results uri: {}".format(baseline_results_uri))

    # Ground-truth labels will be uploaded here for the quality monitor to join.
    ground_truth_upload_path = f"s3://{bucket}/{prefix}/ground_truth_data"
    _l.info(f"Ground truth uri: {ground_truth_upload_path}")
    outputs['monitor'].update({'ground truth uri': ground_truth_upload_path})

    # Create a baselining job with training dataset
    _l.info("Executing a baselining job with training dataset")
    _l.info(f"baseline_data_uri: {train_data['baseline']['validate']}")
    my_monitor = ModelQualityMonitor(
        role=ROLE_ARN,
        sagemaker_session=sm_session,
        max_runtime_in_seconds=1800  # 30 minutes
    )
    my_monitor.suggest_baseline(
        baseline_dataset=train_data['baseline']['validate'] + "/baseline.csv",
        dataset_format=DatasetFormat.csv(header=True),
        problem_type="Regression",
        inference_attribute="prediction",
        ground_truth_attribute="label",
        output_s3_uri=baseline_results_uri,
        wait=True)

    baseline_job = my_monitor.latest_baselining_job
    # fixed log-message typo: was "suggested baseline contrains"
    _l.info("suggested baseline constraints")
    _l.info(
        pprint.pformat(baseline_job.suggested_constraints().
                       body_dict["regression_constraints"]))
    _l.info("suggested baseline statistics")
    _l.info(
        pprint.pformat(baseline_job.baseline_statistics().
                       body_dict["regression_metrics"]))

    # Timestamped schedule name keeps repeated runs unique.
    monitor_schedule_name = (
        f"{BASE_JOB_PREFIX}-mq-sch-{datetime.datetime.utcnow():%Y-%m-%d-%H%M}")
    _l.info(f"Monitoring schedule name: {monitor_schedule_name}")
    outputs['monitor'].update({'schedule_name': monitor_schedule_name})

    endpointInput = EndpointInput(
        resources['endpoint']['name'],
        "/opt/ml/processing/input_data",
        inference_attribute='0'  # REVIEW:
    )
    my_monitor.create_monitoring_schedule(
        monitor_schedule_name=monitor_schedule_name,
        endpoint_input=endpointInput,
        output_s3_uri=baseline_results_uri,
        problem_type="Regression",
        ground_truth_input=ground_truth_upload_path,
        constraints=baseline_job.suggested_constraints(),
        # run the scheduler hourly
        schedule_cron_expression=CronExpressionGenerator.hourly(),
        enable_cloudwatch_metrics=True,
    )

    # Poll until the schedule leaves the Pending state.
    mq_schedule_details = my_monitor.describe_schedule()
    while mq_schedule_details['MonitoringScheduleStatus'] == 'Pending':
        _l.info(f'Waiting for {monitor_schedule_name}')
        time.sleep(3)
        mq_schedule_details = my_monitor.describe_schedule()
    _l.debug(
        f"Model Quality Monitor - schedule details: {pprint.pformat(mq_schedule_details)}"
    )
    _l.info(
        f"Model Quality Monitor - schedule status: {mq_schedule_details['MonitoringScheduleStatus']}"
    )

    # save outputs to a file
    with open('deploymodel_out.json', 'w') as f:
        json.dump(outputs, f, default=json_default)
def create_baseline_step(input_data, execution_input, region, role):
    """Build the Step Functions ProcessingStep that runs a model-monitor baseline job.

    Wires the baseline dataset (from ``input_data``) into the default
    SageMaker model-monitor container and attaches a Fail-state catch for
    task failures.
    """
    # Define the environment for the model-monitor container
    container_env = {
        "dataset_format": json.dumps(DatasetFormat.csv()),
        "dataset_source": "/opt/ml/processing/input/baseline_dataset_input",
        "output_path": "/opt/ml/processing/output",
        "publish_cloudwatch_metrics": "Disabled",  # Have to be disabled from processing job?
    }

    # Define the inputs and outputs
    processing_inputs = [
        ProcessingInput(
            source=input_data["BaselineUri"],
            destination="/opt/ml/processing/input/baseline_dataset_input",
            input_name="baseline_dataset_input",
        ),
    ]
    processing_outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=execution_input["BaselineOutputUri"],
            output_name="monitoring_output",
        ),
    ]

    # Get the default model monitor container for this region
    monitor_image_uri = retrieve(
        region=region, framework="model-monitor", version="latest")

    # Create the processor that runs the baselining analysis
    baseline_processor = Processor(
        image_uri=monitor_image_uri,
        role=role,
        instance_count=1,
        instance_type="ml.m5.xlarge",
        max_runtime_in_seconds=1800,
        env=container_env,
    )

    # Create the processing step
    baseline_step = steps.sagemaker.ProcessingStep(
        "Baseline Job",
        processor=baseline_processor,
        job_name=execution_input["BaselineJobName"],
        inputs=processing_inputs,
        outputs=processing_outputs,
        experiment_config={
            "ExperimentName": execution_input["ExperimentName"],  # '$.ExperimentName',
            "TrialName": execution_input["TrialName"],
            "TrialComponentDisplayName": "Baseline",
        },
        tags={
            "GitBranch": execution_input["GitBranch"],
            "GitCommitHash": execution_input["GitCommitHash"],
            "DataVersionId": execution_input["DataVersionId"],
        },
    )

    # Route task failures to a terminal Fail state
    baseline_step.add_catch(
        steps.states.Catch(
            error_equals=["States.TaskFailed"],
            next_step=stepfunctions.steps.states.Fail(
                "Baseline failed", cause="SageMakerBaselineJobFailed"),
        ))
    return baseline_step
# S3 locations for the baselining input data and the job's results.
baseline_data_path = f's3://{bucket_name}/{prefix}/monitoring/baselining/data'
baseline_results_path = f's3://{bucket_name}/{prefix}/monitoring/baselining/results'
print(baseline_data_path)
print(baseline_results_path)

# Monitor instance used only to drive the baselining job.
my_default_monitor = DefaultModelMonitor(
    role=execution_role,
    instance_count=1,
    instance_type='ml.c5.4xlarge',
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

# Blocking call: returns once the baselining job finishes.
my_default_monitor.suggest_baseline(
    job_name=job_name,
    baseline_dataset=baseline_data_path,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_results_path,
    logs=False,  # Disable to avoid noisy logging, only meaningful when wait=True
    wait=True,
)

# save environment variables
with open('./cloud_formation/suggest_baseline.vars', 'w') as f:
    f.write(f"export PROCESSING_JOB_NAME={job_name}\n")

end = time.time()
print(f'Monitor baseline complete in: {end - start}')
print(f'Baseline data is at {baseline_data_uri}')

# Data-quality monitor sized for the baselining workload.
my_default_monitor = DefaultModelMonitor(
    role=get_execution_role(sagemaker_session=sagemaker_session),
    sagemaker_session=sagemaker_session,
    instance_count=2,
    instance_type='ml.m5.4xlarge',
    volume_size_in_gb=60,
    max_runtime_in_seconds=1800,
)

# Headerless CSV baseline; blocks until the job completes.
my_default_monitor.suggest_baseline(
    baseline_dataset=baseline_data_uri,
    dataset_format=DatasetFormat.csv(header=False),
    output_s3_uri=baseline_results_uri,
    wait=True,
)
print(f'Model data baseline suggested at {baseline_results_uri}')

import datetime as datetime
from time import gmtime, strftime

# Timestamp suffix keeps each schedule name unique.
mon_schedule_name = '{}-{}'.format(
    mon_schedule_name_base,
    datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S"))
s3_report_path = f's3://{bucket}/{prefix}/monitor/report'

# Setup daily Cron job schedule
print(f"Attempting to create monitoring schedule as {mon_schedule_name} \n")