def _get_processing_outputs_with_all_parameters():
    return [
        ProcessingOutput(
            feature_store_output=FeatureStoreOutput(
                feature_group_name="FeatureGroupName"),
            app_managed=True,
        )
    ]
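These snippets omit their imports; assuming recent SageMaker Python SDK v2 module paths, the classes used throughout this page come from the following locations.

# Assumed imports for the examples on this page (SageMaker Python SDK v2 paths).
from sagemaker.network import NetworkConfig
from sagemaker.processing import (
    FeatureStoreOutput,
    ProcessingInput,
    ProcessingOutput,
    Processor,
)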
Example #2
def test_processor_with_all_parameters(sagemaker_session):
    processor = Processor(
        role=ROLE,
        image_uri=CUSTOM_IMAGE_URI,
        instance_count=1,
        instance_type="ml.m4.xlarge",
        sagemaker_session=sagemaker_session,
        entrypoint=["python3", "/opt/ml/processing/input/code/processing_code.py"],
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="processor_base_name",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
    )

    processor.run(
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                app_managed=True,
                feature_store_output=FeatureStoreOutput("Foo"),
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    # Drop the "code" input from expected values.
    expected_args["inputs"] = [expected_args["inputs"][0]]

    sagemaker_session.process.assert_called_with(**expected_args)
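The _get_expected_args_all_parameters helper is defined elsewhere in the test module, but for the feature-store output above the entry it expects inside the CreateProcessingJob request looks roughly like the sketch below (field names follow the CreateProcessingJob API; the exact nesting is an assumption here).

# Sketch of the single output entry the mocked sagemaker_session.process call
# should receive under ProcessingOutputConfig["Outputs"].
expected_feature_store_entry = {
    "OutputName": "my_output",
    "AppManaged": True,
    "S3Output": {
        "S3Uri": "s3://uri/",
        "LocalPath": "/container/path/",
        "S3UploadMode": "EndOfJob",
    },
    "FeatureStoreOutput": {"FeatureGroupName": "Foo"},
}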
def _get_data_outputs_all_parameters():
    return [
        ProcessingOutput(
            source="/container/path/",
            destination="s3://uri/",
            output_name="my_output",
            s3_upload_mode="EndOfJob",
        ),
        ProcessingOutput(
            output_name="feature_store_output",
            app_managed=True,
            feature_store_output=FeatureStoreOutput(feature_group_name="FeatureGroupName"),
        ),
    ]
Example #4
def create_featurestore_output(output_name, feature_group_name):
    """Create processing output for a Data Wrangler job to output to SageMaker Feature Store

    (Modified from Data Wrangler FS notebook template 2021-03-10 to use SageMaker SDK)
    """
    # SDK should be approx equivalent to:
    # {
    #   'Outputs': [
    #     {
    #       'OutputName': '42eac0fe-e5da-467f-adfd-bb4c4fae57cb.default',
    #       'FeatureStoreOutput': {
    #         'FeatureGroupName': feature_group_name
    #       },
    #       'AppManaged': True
    #     }
    #   ],
    # }
    return ProcessingOutput(
        output_name=output_name,
        app_managed=True,
        feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),
    )
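A hypothetical call to this helper (both argument values are placeholders; the output name must match the Data Wrangler flow node, e.g. "<node-id>.default", and the feature group must already exist):

# Hypothetical usage of create_featurestore_output; names are placeholders.
feature_store_output = create_featurestore_output(
    output_name="42eac0fe-e5da-467f-adfd-bb4c4fae57cb.default",
    feature_group_name="my-feature-group",
)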
Example #5
def create_pipeline(
        pipeline_name="s3-fs-ingest-pipeline",
        pipeline_description="automated ingestion from s3 to feature store",
        project_id="",
        project_name="",
        data_wrangler_flow_s3_url="",
        flow_output_name="",
        input_data_s3_url="",
        feature_group_name="",
        execution_role=""):
    logger.info(
        f"Creating sagemaker S3 to feature store load pipeline: {pipeline_name}"
    )
    logger.info(f"execution role passed: {execution_role}")

    if execution_role is None or execution_role == "":
        execution_role = get_execution_role()
        logger.info(f"execution_role set to {execution_role}")

    output_content_type = "CSV"
    sagemaker_session = sagemaker.Session()

    # setup pipeline parameters
    p_processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    p_processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                                 default_value="ml.m5.4xlarge")
    p_processing_volume_size = ParameterInteger(name="ProcessingVolumeSize",
                                                default_value=50)
    p_flow_output_name = ParameterString(name='FlowOutputName',
                                         default_value=flow_output_name)
    p_input_flow = ParameterString(name='InputFlowUrl',
                                   default_value=data_wrangler_flow_s3_url)
    p_input_data = ParameterString(name="InputDataUrl",
                                   default_value=input_data_s3_url)
    p_feature_group_name = ParameterString(name="FeatureGroupName",
                                           default_value=feature_group_name)

    # DW flow processing job inputs and output
    flow_input = ProcessingInput(
        source=p_input_flow,
        destination="/opt/ml/processing/flow",
        input_name="flow",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )

    data_input = ProcessingInput(source=p_input_data,
                                 destination="/opt/ml/processing/data",
                                 input_name="data",
                                 s3_data_type="S3Prefix",
                                 s3_input_mode="File",
                                 s3_data_distribution_type="FullyReplicated")

    processing_job_output = ProcessingOutput(
        output_name=p_flow_output_name,
        app_managed=True,
        feature_store_output=FeatureStoreOutput(
            feature_group_name=p_feature_group_name),
    )

    # Output configuration used as processing job container arguments
    output_config = {flow_output_name: {"content_type": output_content_type}}

    # get data wrangler container uri
    container_uri = image_uris.retrieve(
        framework='data-wrangler', region=sagemaker_session.boto_region_name)

    logger.info(f"creating DW processor with container uri: {container_uri}")

    # create DW processor
    processor = Processor(
        role=execution_role,
        image_uri=container_uri,
        instance_count=p_processing_instance_count,
        instance_type=p_processing_instance_type,
        volume_size_in_gb=p_processing_volume_size,
        sagemaker_session=sagemaker_session,
    )

    step_process = ProcessingStep(
        name="datawrangler-processing-to-feature-store",
        processor=processor,
        inputs=[flow_input] + [data_input],
        outputs=[processing_job_output],
        job_arguments=[f"--output-config '{json.dumps(output_config)}'"],
    )

    pipeline = Pipeline(name=pipeline_name,
                        parameters=[
                            p_processing_instance_type,
                            p_processing_instance_count,
                            p_processing_volume_size, p_flow_output_name,
                            p_input_flow, p_input_data, p_feature_group_name
                        ],
                        steps=[step_process],
                        sagemaker_session=sagemaker_session)

    response = pipeline.upsert(
        role_arn=execution_role,
        description=pipeline_description,
        tags=[{
            'Key': 'sagemaker:project-name',
            'Value': project_name
        }, {
            'Key': 'sagemaker:project-id',
            'Value': project_id
        }],
    )

    logger.info(f"pipeline upsert response: {response}")

    return pipeline
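A hypothetical invocation of create_pipeline (every argument value below is a placeholder to be replaced with real project, S3, and IAM details):

# Hypothetical usage; substitute real S3 URLs, names, and an execution role ARN.
pipeline = create_pipeline(
    pipeline_name="s3-fs-ingest-pipeline",
    project_id="p-abc123example",
    project_name="my-project",
    data_wrangler_flow_s3_url="s3://my-bucket/flows/ingest.flow",
    flow_output_name="42eac0fe-e5da-467f-adfd-bb4c4fae57cb.default",
    input_data_s3_url="s3://my-bucket/input/features.csv",
    feature_group_name="my-feature-group",
    execution_role="arn:aws:iam::111122223333:role/MySageMakerExecutionRole",
)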
Example #6
def test_sklearn_with_all_parameters(
    exists_mock, isfile_mock, botocore_resolver, sklearn_version, sagemaker_session
):
    botocore_resolver.return_value.construct_endpoint.return_value = {"hostname": ECR_HOSTNAME}

    processor = SKLearnProcessor(
        role=ROLE,
        framework_version=sklearn_version,
        instance_type="ml.m4.xlarge",
        instance_count=1,
        volume_size_in_gb=100,
        volume_kms_key="arn:aws:kms:us-west-2:012345678901:key/volume-kms-key",
        output_kms_key="arn:aws:kms:us-west-2:012345678901:key/output-kms-key",
        max_runtime_in_seconds=3600,
        base_job_name="my_sklearn_processor",
        env={"my_env_variable": "my_env_variable_value"},
        tags=[{"Key": "my-tag", "Value": "my-tag-value"}],
        network_config=NetworkConfig(
            subnets=["my_subnet_id"],
            security_group_ids=["my_security_group_id"],
            enable_network_isolation=True,
            encrypt_inter_container_traffic=True,
        ),
        sagemaker_session=sagemaker_session,
    )

    processor.run(
        code="/local/path/to/processing_code.py",
        inputs=[
            ProcessingInput(
                source="s3://path/to/my/dataset/census.csv",
                destination="/container/path/",
                input_name="my_dataset",
                s3_data_type="S3Prefix",
                s3_input_mode="File",
                s3_data_distribution_type="FullyReplicated",
                s3_compression_type="None",
            )
        ],
        outputs=[
            ProcessingOutput(
                app_managed=True,
                feature_store_output=FeatureStoreOutput("Foo"),
                source="/container/path/",
                destination="s3://uri/",
                output_name="my_output",
                s3_upload_mode="EndOfJob",
            )
        ],
        arguments=["--drop-columns", "'SelfEmployed'"],
        wait=True,
        logs=False,
        job_name="my_job_name",
        experiment_config={"ExperimentName": "AnExperiment"},
    )

    expected_args = _get_expected_args_all_parameters(processor._current_job_name)
    sklearn_image_uri = (
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-py3"
    ).format(sklearn_version)
    expected_args["app_specification"]["ImageUri"] = sklearn_image_uri

    sagemaker_session.process.assert_called_with(**expected_args)
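The hard-coded scikit-learn image URI asserted above can also be resolved through the SDK's image URI lookup; a minimal sketch, assuming the standard sagemaker.image_uris module:

# Sketch: resolve the scikit-learn processing image instead of hard-coding it.
from sagemaker import image_uris

sklearn_image_uri = image_uris.retrieve(
    framework="sklearn",
    region="us-west-2",
    version=sklearn_version,  # the framework_version fixture, e.g. "0.23-1"
    py_version="py3",
    instance_type="ml.m4.xlarge",  # selects the CPU variant of the image
)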
Example #7
def test_one_step_ingestion_pipeline(sagemaker_session, feature_store_session,
                                     feature_definitions, role, pipeline_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.4xlarge")

    input_name = "features.csv"
    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
    input_data_uri = os.path.join("s3://", sagemaker_session.default_bucket(),
                                  "py-sdk-ingestion-test-input/features.csv")
    with open(input_file_path, "r") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    inputs = [
        ProcessingInput(
            input_name=input_name,
            source=input_data_uri,
            destination="/opt/ml/processing/features.csv",
        )
    ]

    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=feature_store_session,
    )

    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
        input_name,
        input_data_uri,
        s3_content_type="csv",
        s3_has_header=True,
    )

    outputs = [
        ProcessingOutput(
            output_name=output_name,
            app_managed=True,
            feature_store_output=FeatureStoreOutput(
                feature_group_name=feature_group_name),
        )
    ]

    temp_flow_path = "./ingestion.flow"
    with cleanup_feature_group(feature_group):
        with open(temp_flow_path, "w") as flow_file:
            json.dump(ingestion_only_flow, flow_file)

        data_wrangler_processor = DataWranglerProcessor(
            role=role,
            data_wrangler_flow_source=temp_flow_path,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            max_runtime_in_seconds=86400,
        )

        data_wrangler_step = ProcessingStep(name="ingestion-step",
                                            processor=data_wrangler_processor,
                                            inputs=inputs,
                                            outputs=outputs)

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_count, instance_type],
            steps=[data_wrangler_step],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            offline_store_s3_uri = os.path.join(
                "s3://", sagemaker_session.default_bucket(),
                feature_group_name)
            feature_group.create(
                s3_uri=offline_store_s3_uri,
                record_identifier_name="f11",
                event_time_feature_name="f10",
                role_arn=role,
                enable_online_store=False,
            )
            _wait_for_feature_group_create(feature_group)

            execution = pipeline.start()
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=60, max_attempts=10)
            except WaiterError:
                pass

            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            assert execution_steps[0]["StepName"] == "ingestion-step"
            assert execution_steps[0]["StepStatus"] == "Succeeded"

            athena_query = feature_group.athena_query()
            with timeout(minutes=10):
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")

                df = athena_query.as_dataframe()
                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
        finally:
            try:
                pipeline.delete()
            except Exception as e:
                print(f"Delete pipeline failed with error: {e}")
            os.remove(temp_flow_path)
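The cleanup_feature_group and _wait_for_feature_group_create helpers are defined elsewhere in the test suite; a minimal sketch of what they might look like, assuming only the public FeatureGroup.describe() and FeatureGroup.delete() calls:

import time
from contextlib import contextmanager


def _wait_for_feature_group_create(feature_group, poll_seconds=15):
    # Sketch: poll until the feature group is fully created (or fails).
    while True:
        status = feature_group.describe().get("FeatureGroupStatus")
        if status == "Created":
            return
        if status == "CreateFailed":
            raise RuntimeError(f"Feature group {feature_group.name} failed to create")
        time.sleep(poll_seconds)


@contextmanager
def cleanup_feature_group(feature_group):
    # Sketch: best-effort deletion of the feature group after the block runs.
    try:
        yield
    finally:
        try:
            feature_group.delete()
        except Exception as e:
            print(f"Delete feature group failed with error: {e}")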