def _get_processing_outputs_with_all_parameters():
    return [
        ProcessingOutput(
            feature_store_output=FeatureStoreOutput(feature_group_name="FeatureGroupName"),
            app_managed=True,
        )
    ]
def test_processor_with_all_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        entrypoint=["python3", "/opt/ml/processing/input/code/processing_code.py"],
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="processor_base_name",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
    )

    processor.run(
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                app_managed=True,
                feature_store_output=FeatureStoreOutput("Foo"),
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    # Drop the "code" input from the expected values, since no code was passed to run().
    expected_args["inputs"] = [expected_args["inputs"][0]]

    sagemaker_session.process.assert_called_with(**expected_args)
def _get_data_outputs_all_parameters():
    return [
        ProcessingOutput(
            source="/container/path/",
            destination="s3://uri/",
            output_name="my_output",
            s3_upload_mode="EndOfJob",
        ),
        ProcessingOutput(
            output_name="feature_store_output",
            app_managed=True,
            feature_store_output=FeatureStoreOutput(feature_group_name="FeatureGroupName"),
        ),
    ]
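# For reference, a ProcessingOutput built like the "feature_store_output" entry
# above should serialize into the CreateProcessingJob request roughly as follows.
# This is a sketch based on the request fragment quoted in
# create_featurestore_output below; field names follow the SageMaker
# CreateProcessingJob API:
#
# "ProcessingOutputConfig": {
#     "Outputs": [
#         {
#             "OutputName": "feature_store_output",
#             "AppManaged": True,
#             "FeatureStoreOutput": {"FeatureGroupName": "FeatureGroupName"},
#         }
#     ]
# }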
def create_featurestore_output(output_name, feature_group_name):
    """Create a processing output for a Data Wrangler job that writes to SageMaker Feature Store.

    (Modified from the Data Wrangler Feature Store notebook template, 2021-03-10,
    to use the SageMaker SDK.)
    """
    # The SDK call should be approximately equivalent to:
    # {
    #     'Outputs': [
    #         {
    #             'OutputName': '42eac0fe-e5da-467f-adfd-bb4c4fae57cb.default',
    #             'FeatureStoreOutput': {
    #                 'FeatureGroupName': feature_group_name
    #             },
    #             'AppManaged': True
    #         }
    #     ],
    # }
    return ProcessingOutput(
        output_name=output_name,
        app_managed=True,
        feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),
    )
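# A minimal usage sketch for create_featurestore_output. The processor and
# flow_input arguments are assumed to be a configured Data Wrangler Processor
# and its ".flow" ProcessingInput (as built in create_pipeline below); the
# output name and feature group name are placeholders, not values from the
# original source.
def example_run_to_feature_store(processor, flow_input):
    fs_output = create_featurestore_output(
        output_name="example-node-id.default",  # placeholder: "<flow node id>.default"
        feature_group_name="example-feature-group",  # placeholder feature group
    )
    # With AppManaged=True, the Data Wrangler container delivers the output
    # itself (by ingesting records into the feature group) rather than having
    # SageMaker copy files to S3.
    processor.run(inputs=[flow_input], outputs=[fs_output])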
def create_pipeline(
    pipeline_name="s3-fs-ingest-pipeline",
    pipeline_description="automated ingestion from s3 to feature store",
    project_id="",
    project_name="",
    data_wrangler_flow_s3_url="",
    flow_output_name="",
    input_data_s3_url="",
    feature_group_name="",
    execution_role="",
):
    logger.info(f"Creating sagemaker S3 to feature store load pipeline: {pipeline_name}")
    logger.info(f"execution role passed: {execution_role}")

    if not execution_role:
        execution_role = get_execution_role()
        logger.info(f"execution_role set to {execution_role}")

    output_content_type = "CSV"
    sagemaker_session = sagemaker.Session()

    # Set up pipeline parameters
    p_processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1
    )
    p_processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.4xlarge"
    )
    p_processing_volume_size = ParameterInteger(name="ProcessingVolumeSize", default_value=50)
    p_flow_output_name = ParameterString(name="FlowOutputName", default_value=flow_output_name)
    p_input_flow = ParameterString(name="InputFlowUrl", default_value=data_wrangler_flow_s3_url)
    p_input_data = ParameterString(name="InputDataUrl", default_value=input_data_s3_url)
    p_feature_group_name = ParameterString(
        name="FeatureGroupName", default_value=feature_group_name
    )

    # Data Wrangler flow processing job inputs and output
    flow_input = ProcessingInput(
        source=p_input_flow,
        destination="/opt/ml/processing/flow",
        input_name="flow",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )

    data_input = ProcessingInput(
        source=p_input_data,
        destination="/opt/ml/processing/data",
        input_name="data",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )

    processing_job_output = ProcessingOutput(
        output_name=p_flow_output_name,
        app_managed=True,
        feature_store_output=FeatureStoreOutput(feature_group_name=p_feature_group_name),
    )

    # Output configuration used as processing job container arguments
    output_config = {flow_output_name: {"content_type": output_content_type}}

    # Get the Data Wrangler container URI
    container_uri = image_uris.retrieve(
        framework="data-wrangler", region=sagemaker_session.boto_region_name
    )
    logger.info(f"creating DW processor with container uri: {container_uri}")

    # Create the Data Wrangler processor
    processor = Processor(
        role=execution_role,
        image_uri=container_uri,
        instance_count=p_processing_instance_count,
        instance_type=p_processing_instance_type,
        volume_size_in_gb=p_processing_volume_size,
        sagemaker_session=sagemaker_session,
    )

    step_process = ProcessingStep(
        name="datawrangler-processing-to-feature-store",
        processor=processor,
        inputs=[flow_input, data_input],
        outputs=[processing_job_output],
        job_arguments=[f"--output-config '{json.dumps(output_config)}'"],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            p_processing_instance_type,
            p_processing_instance_count,
            p_processing_volume_size,
            p_flow_output_name,
            p_input_flow,
            p_input_data,
            p_feature_group_name,
        ],
        steps=[step_process],
        sagemaker_session=sagemaker_session,
    )

    response = pipeline.upsert(
        role_arn=execution_role,
        description=pipeline_description,
        tags=[
            {"Key": "sagemaker:project-name", "Value": project_name},
            {"Key": "sagemaker:project-id", "Value": project_id},
        ],
    )
    logger.info(f"pipeline upsert response: {response}")

    return pipeline
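# A hedged usage sketch for create_pipeline: every argument value below is a
# placeholder, not a value from the original source. It upserts the pipeline
# definition and starts one execution.
def example_create_and_run_pipeline():
    pipeline = create_pipeline(
        pipeline_name="s3-fs-ingest-pipeline",
        project_id="p-example123",  # placeholder SageMaker project id
        project_name="example-project",  # placeholder project name
        data_wrangler_flow_s3_url="s3://example-bucket/flows/ingest.flow",  # placeholder
        flow_output_name="example-node-id.default",  # placeholder
        input_data_s3_url="s3://example-bucket/data/input.csv",  # placeholder
        feature_group_name="example-feature-group",  # placeholder
        execution_role="arn:aws:iam::012345678901:role/ExampleSageMakerRole",  # placeholder
    )
    execution = pipeline.start()
    execution.wait()  # blocks until the ingestion run finishes or fails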
def test_sklearn_with_all_parameters(
    exists_mock, isfile_mock, botocore_resolver, sklearn_version, sagemaker_session
):
    botocore_resolver.return_value.construct_endpoint.return_value = {"hostname": ECR_HOSTNAME}

    processor = SKLearnProcessor(
        role=ROLE,
        framework_version=sklearn_version,
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                app_managed=True,
                feature_store_output=FeatureStoreOutput("Foo"),
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    sklearn_image_uri = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3"
    ).format(sklearn_version)
    expected_args["app_specification"]["ImageUri"] = sklearn_image_uri

    sagemaker_session.process.assert_called_with(**expected_args)
def test_one_step_ingestion_pipeline(
    sagemaker_session, feature_store_session, feature_definitions, role, pipeline_name
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")

    input_name = "features.csv"
    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
    input_data_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "py-sdk-ingestion-test-input/features.csv"
    )

    with open(input_file_path, "r") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session
        )

    inputs = [
        ProcessingInput(
            input_name=input_name,
            source=input_data_uri,
            destination="/opt/ml/processing/features.csv",
        )
    ]

    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=feature_store_session,
    )

    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
        input_name,
        input_data_uri,
        s3_content_type="csv",
        s3_has_header=True,
    )

    outputs = [
        ProcessingOutput(
            output_name=output_name,
            app_managed=True,
            feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),
        )
    ]

    temp_flow_path = "./ingestion.flow"
    with cleanup_feature_group(feature_group):
        with open(temp_flow_path, "w") as f:
            json.dump(ingestion_only_flow, f)

        data_wrangler_processor = DataWranglerProcessor(
            role=role,
            data_wrangler_flow_source=temp_flow_path,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            max_runtime_in_seconds=86400,
        )

        data_wrangler_step = ProcessingStep(
            name="ingestion-step",
            processor=data_wrangler_processor,
            inputs=inputs,
            outputs=outputs,
        )

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_count, instance_type],
            steps=[data_wrangler_step],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            offline_store_s3_uri = os.path.join(
                "s3://", sagemaker_session.default_bucket(), feature_group_name
            )
            feature_group.create(
                s3_uri=offline_store_s3_uri,
                record_identifier_name="f11",
                event_time_feature_name="f10",
                role_arn=role,
                enable_online_store=False,
            )
            _wait_for_feature_group_create(feature_group)

            execution = pipeline.start()
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=60, max_attempts=10)
            except WaiterError:
                pass

            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            assert execution_steps[0]["StepName"] == "ingestion-step"
            assert execution_steps[0]["StepStatus"] == "Succeeded"

            athena_query = feature_group.athena_query()
            with timeout(minutes=10):
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert (
                    "SUCCEEDED"
                    == athena_query.get_query_execution()
                    .get("QueryExecution")
                    .get("Status")
                    .get("State")
                )

                df = athena_query.as_dataframe()
                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
        finally:
            try:
                pipeline.delete()
            except Exception as e:
                print(f"Delete pipeline failed with error: {e}")
            os.remove(temp_flow_path)