import json
import os
import warnings

import pytest
import sagemaker
from sagemaker.network import NetworkConfig
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.steps import CacheConfig, ProcessingStep

# ROLE, CUSTOM_IMAGE_URI, IMAGE_URI, INSTANCE_TYPE, and DATA_DIR are
# module-level constants defined elsewhere in the test suite.


def test_processor_with_all_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        entrypoint=["python3", "/opt/ml/processing/input/code/processing_code.py"],
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="processor_base_name",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
        ),
    )

    processor.run(
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    # Drop the "code" input from expected values.
    expected_args["inputs"] = [expected_args["inputs"][0]]

    sagemaker_session.process.assert_called_with(**expected_args)
def test_byo_container_with_baked_in_script(sagemaker_session):
    custom_processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    custom_processor.run(
        inputs=[
            ProcessingInput(
                source="/local/path/to/my/sklearn_transformer", destination="/code/"
            )
        ],
        arguments=["CensusTract", "County"],
    )

    expected_args = {
        "inputs": [
            {
                "InputName": "input-1",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/code/",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            }
        ],
        "output_config": {"Outputs": []},
        "job_name": custom_processor._current_job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition": None,
        "app_specification": {
            "ImageUri": CUSTOM_IMAGE_URI,
            "ContainerArguments": ["CensusTract", "County"],
        },
        "environment": None,
        "network_config": None,
        "role_arn": ROLE,
        "tags": None,
        "experiment_config": None,
    }
    sagemaker_session.process.assert_called_with(**expected_args)
def test_processor_with_required_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
    )

    processor.run()

    expected_args = _get_expected_args(processor._current_job_name)
    del expected_args["app_specification"]["ContainerEntrypoint"]
    expected_args["inputs"] = []

    sagemaker_session.process.assert_called_with(**expected_args)
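# Several tests here rely on a module-level helper that builds the request
# dict the mocked Session.process() is expected to receive. A minimal sketch,
# inferred from the inline expected_args above and from how the tests mutate
# the helper's result; the real helper (and its fuller
# _get_expected_args_all_parameters analogue) may differ in detail:
def _get_expected_args(job_name):
    return {
        "inputs": [
            {
                "InputName": "code",
                "S3Input": {
                    "S3Uri": "mocked_s3_uri_from_upload_data",
                    "LocalPath": "/opt/ml/processing/input/code",
                    "S3DataType": "S3Prefix",
                    "S3InputMode": "File",
                    "S3DataDistributionType": "FullyReplicated",
                    "S3CompressionType": "None",
                },
            }
        ],
        "output_config": {"Outputs": []},
        "job_name": job_name,
        "resources": {
            "ClusterConfig": {
                "InstanceType": "ml.m4.xlarge",
                "InstanceCount": 1,
                "VolumeSizeInGB": 30,
            }
        },
        "stopping_condition": None,
        "app_specification": {
            "ImageUri": CUSTOM_IMAGE_URI,
            "ContainerEntrypoint": [
                "python3",
                "/opt/ml/processing/input/code/processing_code.py",
            ],
        },
        "environment": None,
        "network_config": None,
        "role_arn": ROLE,
        "tags": None,
        "experiment_config": None,
    }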
def test_processing_step_with_processor_and_step_args(pipeline_session, processing_input):
    processor = Processor(
        image_uri=IMAGE_URI,
        role=sagemaker.get_execution_role(),
        instance_count=1,
        instance_type=INSTANCE_TYPE,
        sagemaker_session=pipeline_session,
    )

    step_args = processor.run(inputs=processing_input)

    # Supplying both step_args and a processor is ambiguous and must fail.
    with pytest.raises(ValueError):
        ProcessingStep(
            name="MyProcessingStep",
            step_args=step_args,
            processor=processor,
        )

    # Supplying neither must fail as well.
    with pytest.raises(ValueError):
        ProcessingStep(
            name="MyProcessingStep",
        )
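# The two pipeline tests in this section assume pytest fixtures for the
# session and the input list. A minimal sketch with illustrative values; the
# real fixtures in the suite mock the underlying boto3 clients rather than
# creating a live session:
from sagemaker.workflow.pipeline_context import PipelineSession


@pytest.fixture
def pipeline_session():
    # PipelineSession defers Processor.run() into step arguments instead of
    # starting a processing job.
    return PipelineSession()


@pytest.fixture
def processing_input():
    return [
        ProcessingInput(
            source="s3://my-bucket/processing-input",  # hypothetical URI
            destination="/opt/ml/processing/input",
        )
    ]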
def test_processor_with_missing_network_config_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        network_config=NetworkConfig(enable_network_isolation=True),
    )

    processor.run()

    expected_args = _get_expected_args(processor._current_job_name)
    del expected_args["app_specification"]["ContainerEntrypoint"]
    expected_args["inputs"] = []
    expected_args["network_config"] = {"EnableNetworkIsolation": True}

    sagemaker_session.process.assert_called_with(**expected_args)
def test_processor_with_all_parameters(sagemaker_session):
    # Variant of the first test above: same Processor configuration, but the
    # data inputs/outputs come from shared helpers and inter-container traffic
    # encryption is enabled on the network config.
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        entrypoint=["python3", "/opt/ml/processing/input/code/processing_code.py"],
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="processor_base_name",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
    )

    processor.run(
        inputs=_get_data_inputs_all_parameters(),
        outputs=_get_data_outputs_all_parameters(),
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    # Drop the "code" input from expected values.
    expected_args["inputs"] = expected_args["inputs"][:-1]

    sagemaker_session.process.assert_called_with(**expected_args)
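# A sketch of the data-input/output helpers assumed above, inferred from the
# inline inputs and outputs in the first variant of this test; the real
# helpers may cover additional input types:
def _get_data_inputs_all_parameters():
    return [
        ProcessingInput(
            source="s3://path/to/my/dataset/census.csv",
            destination="/container/path/",
            input_name="my_dataset",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
            s3_data_distribution_type="FullyReplicated",
            s3_compression_type="None",
        )
    ]


def _get_data_outputs_all_parameters():
    return [
        ProcessingOutput(
            source="/container/path/",
            destination="s3://uri/",
            output_name="my_output",
            s3_upload_mode="EndOfJob",
        )
    ]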
def test_processing_step_with_processor(pipeline_session, processing_input):
    processor = Processor(
        image_uri=IMAGE_URI,
        role=sagemaker.get_execution_role(),
        instance_count=1,
        instance_type=INSTANCE_TYPE,
        sagemaker_session=pipeline_session,
    )

    with warnings.catch_warnings(record=True) as w:
        step_args = processor.run(inputs=processing_input)
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Running within a PipelineSession" in str(w[-1].message)

    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )

    with warnings.catch_warnings(record=True) as w:
        step = ProcessingStep(
            name="MyProcessingStep",
            step_args=step_args,
            description="ProcessingStep description",
            display_name="MyProcessingStep",
            depends_on=["TestStep", "SecondTestStep"],
            cache_config=cache_config,
            property_files=[evaluation_report],
        )
        assert len(w) == 0

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )
    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyProcessingStep",
        "Description": "ProcessingStep description",
        "DisplayName": "MyProcessingStep",
        "Type": "Processing",
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": step_args,
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
        "PropertyFiles": [
            {
                "FilePath": "evaluation.json",
                "OutputName": "evaluation",
                "PropertyFileName": "EvaluationReport",
            }
        ],
    }
    assert step.properties.ProcessingJobName.expr == {
        "Get": "Steps.MyProcessingStep.ProcessingJobName"
    }
def test_processor_with_custom_bucket(
    sagemaker_session_with_custom_bucket,
    custom_bucket_name,
    image_uri,
    cpu_instance_type,
    output_kms_key,
):
    script_path = os.path.join(DATA_DIR, "dummy_script.py")

    processor = Processor(
        role=ROLE,
        image_uri=image_uri,
        instance_count=1,
        instance_type=cpu_instance_type,
        entrypoint=["python3", "/opt/ml/processing/input/code/dummy_script.py"],
        volume_size_in_gb=100,
        volume_kms_key=None,
        output_kms_key=output_kms_key,
        max_runtime_in_seconds=3600,
        base_job_name="test-processor",
        env={"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"},
        tags=[{"Key": "dummy-tag", "Value": "dummy-tag-value"}],
        sagemaker_session=sagemaker_session_with_custom_bucket,
    )

    processor.run(
        inputs=[
            ProcessingInput(
                source=script_path,
                destination="/opt/ml/processing/input/code/",
                input_name="code",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/output/container/path/",
                output_name="dummy_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["-v"],
        wait=True,
        logs=True,
    )

    job_description = processor.latest_job.describe()

    assert job_description["ProcessingInputs"][0]["InputName"] == "code"
    assert custom_bucket_name in job_description["ProcessingInputs"][0]["S3Input"]["S3Uri"]

    assert job_description["ProcessingJobName"].startswith("test-processor")
    assert job_description["ProcessingJobStatus"] == "Completed"

    assert job_description["ProcessingOutputConfig"]["KmsKeyId"] == output_kms_key
    assert job_description["ProcessingOutputConfig"]["Outputs"][0]["OutputName"] == "dummy_output"

    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceCount"] == 1
    assert job_description["ProcessingResources"]["ClusterConfig"]["InstanceType"] == cpu_instance_type
    assert job_description["ProcessingResources"]["ClusterConfig"]["VolumeSizeInGB"] == 100

    assert job_description["AppSpecification"]["ContainerArguments"] == ["-v"]
    assert job_description["AppSpecification"]["ContainerEntrypoint"] == [
        "python3",
        "/opt/ml/processing/input/code/dummy_script.py",
    ]
    assert job_description["AppSpecification"]["ImageUri"] == image_uri

    assert job_description["Environment"] == {"DUMMY_ENVIRONMENT_VARIABLE": "dummy-value"}

    assert ROLE in job_description["RoleArn"]

    assert job_description["StoppingCondition"] == {"MaxRuntimeInSeconds": 3600}
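# This integration test assumes a fixture that pins the SageMaker session to a
# non-default S3 bucket. A minimal sketch; the fixture names and region are
# illustrative, not the suite's actual definitions:
import boto3


@pytest.fixture
def sagemaker_session_with_custom_bucket(custom_bucket_name):
    boto_session = boto3.Session(region_name="us-west-2")  # region is illustrative
    return sagemaker.Session(
        boto_session=boto_session,
        default_bucket=custom_bucket_name,
    )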
# (Fragment: the call that produces `metadata` is truncated in the source.)
        project_name=project_name,
        env=env,
        region_name=region,
        current_time=current_time,
    )
    proc_config = metadata.getter(processing_task)
    sm_config = proc_config.get('sm_config')

    # create sagemaker session
    sess = sm.Session(default_bucket=sm_config.getter('sm_bucket'))

    processor = Processor(
        role=sm_config.getter('sm_role'),
        image_uri=image_uri,
        instance_count=sm_config.getter('sm_instance_count'),
        instance_type=sm_config.getter('sm_instance_type'),
        entrypoint=proc_config.get('endpoint'),
        volume_size_in_gb=sm_config.getter('sm_volumesize'),
        sagemaker_session=sess,
        tags=sm_config.getter('project_tag'),
    )

    processor.run(
        inputs=proc_config.get('inputs'),
        outputs=proc_config.get('outputs'),
        arguments=proc_config.get('arguments'),
        wait=False,
        logs=False,
        job_name=sm_config.getter('processing_job_name'),
    )
def run_model_monitor_job_processor(
    region,
    instance_type,
    role,
    data_capture_path,
    statistics_path,
    constraints_path,
    reports_path,
    instance_count=1,
    preprocessor_path=None,
    postprocessor_path=None,
    publish_cloudwatch_metrics="Disabled",
):
    # Mirror the "datacapture/..." suffix of the capture path under the reports path.
    data_capture_sub_path = data_capture_path[data_capture_path.rfind("datacapture/"):]
    data_capture_sub_path = data_capture_sub_path[data_capture_sub_path.find("/") + 1:]
    processing_output_paths = reports_path + "/" + data_capture_sub_path

    input_1 = ProcessingInput(
        input_name="input_1",
        source=data_capture_path,
        destination="/opt/ml/processing/input/endpoint/" + data_capture_sub_path,
        s3_data_type="S3Prefix",
        s3_input_mode="File",
    )

    baseline = ProcessingInput(
        input_name="baseline",
        source=statistics_path,
        destination="/opt/ml/processing/baseline/stats",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
    )

    constraints = ProcessingInput(
        input_name="constraints",
        source=constraints_path,
        destination="/opt/ml/processing/baseline/constraints",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
    )

    outputs = ProcessingOutput(
        output_name="result",
        source="/opt/ml/processing/output",
        destination=processing_output_paths,
        s3_upload_mode="Continuous",
    )

    env = {
        "baseline_constraints": "/opt/ml/processing/baseline/constraints/"
        + get_file_name(constraints_path),
        "baseline_statistics": "/opt/ml/processing/baseline/stats/"
        + get_file_name(statistics_path),
        "dataset_format": '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}',
        "dataset_source": "/opt/ml/processing/input/endpoint",
        "output_path": "/opt/ml/processing/output",
        "publish_cloudwatch_metrics": publish_cloudwatch_metrics,
    }

    inputs = [input_1, baseline, constraints]

    if postprocessor_path:
        env["post_analytics_processor_script"] = (
            "/opt/ml/processing/code/postprocessing/" + get_file_name(postprocessor_path)
        )
        post_processor_script = ProcessingInput(
            input_name="post_processor_script",
            source=postprocessor_path,
            destination="/opt/ml/processing/code/postprocessing",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
        )
        inputs.append(post_processor_script)

    if preprocessor_path:
        env["record_preprocessor_script"] = (
            "/opt/ml/processing/code/preprocessing/" + get_file_name(preprocessor_path)
        )
        pre_processor_script = ProcessingInput(
            input_name="pre_processor_script",
            source=preprocessor_path,
            destination="/opt/ml/processing/code/preprocessing",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
        )
        inputs.append(pre_processor_script)

    processor = Processor(
        image_uri=get_model_monitor_container_uri(region),
        instance_count=instance_count,
        instance_type=instance_type,
        role=role,
        env=env,
    )

    return processor.run(inputs=inputs, outputs=[outputs])
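# The function above assumes get_file_name() and
# get_model_monitor_container_uri() are defined elsewhere. A plausible sketch
# of the former (the real helper may differ):
def get_file_name(path):
    # Return the trailing file name of an S3 URI or local path,
    # e.g. "s3://bucket/baseline/constraints.json" -> "constraints.json".
    return path.split("/")[-1]


# A hypothetical invocation; the role ARN, bucket, endpoint, and file names
# below are illustrative only:
run_model_monitor_job_processor(
    region="us-west-2",
    instance_type="ml.m5.xlarge",
    role="arn:aws:iam::012345678901:role/MySageMakerRole",
    data_capture_path="s3://my-bucket/datacapture/my-endpoint/AllTraffic",
    statistics_path="s3://my-bucket/baseline/statistics.json",
    constraints_path="s3://my-bucket/baseline/constraints.json",
    reports_path="s3://my-bucket/reports",
)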