def test_as_hive_ddl_with_default_values(
    create_table_ddl, feature_group_dummy_definitions, sagemaker_session_mock
):
    sagemaker_session_mock.describe_feature_group.return_value = {
        "OfflineStoreConfig": {
            "S3StorageConfig": {
                "S3Uri": "s3://some-bucket",
                "ResolvedOutputS3Uri": "s3://resolved_output_s3_uri",
            }
        }
    }
    sagemaker_session_mock.account_id.return_value = "1234"
    sagemaker_session_mock.boto_session.region_name = "us-west-2"

    feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    feature_group.feature_definitions = feature_group_dummy_definitions
    assert (
        create_table_ddl.format(
            database="sagemaker_featurestore",
            table_name="MyGroup",
            account="1234",
            region="us-west-2",
            feature_group_name="MyGroup",
        )
        == feature_group.as_hive_ddl()
    )
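
# The fixtures above (sagemaker_session_mock, feature_group_dummy_definitions,
# create_table_ddl) live in a conftest that is not shown here; a minimal
# sketch of plausible definitions (an assumption, not the original fixtures):
import pytest
from unittest.mock import Mock
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)


@pytest.fixture
def sagemaker_session_mock():
    return Mock()


@pytest.fixture
def feature_group_dummy_definitions():
    return [FeatureDefinition(feature_name="feature1",
                              feature_type=FeatureTypeEnum.FRACTIONAL)]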
Example #2
def test_ingest_with_profile_name(ingestion_manager_init,
                                  sagemaker_session_mock,
                                  fs_runtime_client_config_mock):
    sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = (
        fs_runtime_client_config_mock)

    feature_group = FeatureGroup(name="MyGroup",
                                 sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame(
        dict((f"float{i}", pd.Series([2.0], dtype="float64"))
             for i in range(300)))

    mock_ingestion_manager_instance = Mock()
    ingestion_manager_init.return_value = mock_ingestion_manager_instance
    feature_group.ingest(data_frame=df,
                         max_workers=10,
                         profile_name="profile_name")

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock,
        max_workers=10,
        max_processes=1,
        profile_name="profile_name",
    )
    mock_ingestion_manager_instance.run.assert_called_once_with(data_frame=df,
                                                                wait=True,
                                                                timeout=None)
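
# Sketch (assumption): ingestion_manager_init in these ingest tests is a
# patched IngestionManager class from sagemaker.feature_store.feature_group,
# wired up roughly like this:
from unittest.mock import Mock, patch


@patch("sagemaker.feature_store.feature_group.IngestionManager")
def _example_patched_ingest_test(ingestion_manager_init):
    ingestion_manager_init.return_value = Mock()
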
def test_ingest_zero_workers(sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    df = Mock()
    with pytest.raises(RuntimeError) as error:
        feature_group.ingest(data_frame=df, max_workers=0, max_processes=1)

    assert "max_workers must be greater than 0." in str(error)
Example #4
def test_feature_store_create(sagemaker_session_mock, role_arn,
                              feature_group_dummy_definitions, s3_uri):
    feature_group = FeatureGroup(name="MyFeatureGroup",
                                 sagemaker_session=sagemaker_session_mock)
    feature_group.feature_definitions = feature_group_dummy_definitions
    feature_group.create(
        s3_uri=s3_uri,
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        role_arn=role_arn,
        enable_online_store=True,
    )
    sagemaker_session_mock.create_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup",
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        feature_definitions=[
            fd.to_dict() for fd in feature_group_dummy_definitions
        ],
        role_arn=role_arn,
        description=None,
        tags=None,
        online_store_config={"EnableOnlineStore": True},
        offline_store_config={
            "DisableGlueTableCreation": False,
            "S3StorageConfig": {
                "S3Uri": s3_uri
            },
        },
    )
def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        # Ingest data
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(data_frame=pandas_data_frame,
                                                 max_workers=3,
                                                 wait=False)
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table.
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < 11:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                time.sleep(60)

        assert df.shape[0] == 11
        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8, 9])]["feature4"])
        assert nans.all()
        assert (create_table_ddl.format(
            feature_group_name=feature_group_name,
            region=feature_store_session.boto_session.region_name,
            account=feature_store_session.account_id(),
        ) == feature_group.as_hive_ddl())
    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")
Example #6
from contextlib import contextmanager


@contextmanager
def cleanup_feature_group(feature_group: FeatureGroup):
    try:
        yield
    finally:
        try:
            feature_group.delete()
        except Exception:
            pass
Example #7
@contextmanager  # from contextlib import contextmanager
def cleanup_feature_group(feature_group: FeatureGroup):
    try:
        yield
    finally:
        try:
            feature_group.delete()
            print("FeatureGroup cleaned up")
        except Exception as e:
            print(f"Delete FeatureGroup failed with error: {e}.")
@contextmanager  # from contextlib import contextmanager
def cleanup_feature_group(feature_group: FeatureGroup):
    try:
        yield
    finally:
        try:
            feature_group.delete()
        except Exception:
            raise RuntimeError(
                f"Failed to delete feature group with name {feature_group.name}"
            )
Example #9
def _wait_for_feature_group_create(feature_group: FeatureGroup):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        print(feature_group.describe())
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")
Example #10
def test_load_feature_definition_unsupported_types(sagemaker_session_mock):
    feature_group = FeatureGroup(name="FailedGroup",
                                 sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame({
        "float": pd.Series([2.0], dtype="float64"),
        "int": pd.Series([2], dtype="int64"),
        "object": pd.Series(["f1"], dtype="object"),
    })
    with pytest.raises(ValueError) as error:
        feature_group.load_feature_definitions(data_frame=df)
    assert "Failed to infer Feature type based on dtype object for column object." in str(
        error)
Example #11
def test_load_feature_definition(sagemaker_session_mock):
    feature_group = FeatureGroup(name="SomeGroup", sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame(
        {
            "float": pd.Series([2.0], dtype="float64"),
            "int": pd.Series([2], dtype="int64"),
            "string": pd.Series(["f1"], dtype="string"),
        }
    )
    feature_definitions = feature_group.load_feature_definitions(data_frame=df)
    names = [fd.feature_name for fd in feature_definitions]
    types = [fd.feature_type for fd in feature_definitions]
    assert names == ["float", "int", "string"]
    assert types == [FeatureTypeEnum.FRACTIONAL, FeatureTypeEnum.INTEGRAL, FeatureTypeEnum.STRING]
Example #12
def test_ingest_default_max_workers(ingestion_manager_init, sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")})

    mock_ingestion_manager_instance = Mock()
    ingestion_manager_init.return_value = mock_ingestion_manager_instance
    feature_group.ingest(data_frame=df)

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_session=sagemaker_session_mock,
        data_frame=df,
        max_workers=1,
    )
    mock_ingestion_manager_instance.run.assert_called_once_with(wait=True, timeout=None)
Example #13
def test_ingest(ingestion_manager_init, sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame(dict((f"float{i}", pd.Series([2.0], dtype="float64")) for i in range(300)))

    mock_ingestion_manager_instance = Mock()
    ingestion_manager_init.return_value = mock_ingestion_manager_instance
    feature_group.ingest(data_frame=df, max_workers=10)

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_session=sagemaker_session_mock,
        data_frame=df,
        max_workers=10,
    )
    mock_ingestion_manager_instance.run.assert_called_once_with(wait=True, timeout=None)
def create_feature_group(
    feature_group_name,
    feature_group_description,
    df,
    id_name,
    event_time_name,
    offline_feature_group_bucket,
    sagemaker_session,
    role,
):
    """
    Create a new FeatureGroup.

    :param feature_group_name: str
    :param feature_group_description: str
    :param df: pandas.DataFrame
    :param id_name: str
    :param event_time_name: str
    :param offline_feature_group_bucket: str
    :param sagemaker_session: sagemaker.Session()
    :param role: str
    :return: tuple(FeatureGroup, bool)
    """
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=sagemaker_session)
    feature_definitions = get_feature_definitions(df, feature_group)
    feature_group.feature_definitions = feature_definitions
    feature_group_already_exists = False
    try:
        print(f"Trying to create feature group {feature_group_description} \n")
        feature_group.create(
            description=feature_group_description,
            record_identifier_name=id_name,
            event_time_feature_name=event_time_name,
            role_arn=role,
            s3_uri=offline_feature_group_bucket,
            enable_online_store=True,
        )
        wait_for_feature_group_creation_complete(feature_group)
    except ClientError as e:  # from botocore.exceptions import ClientError
        code = e.response.get("Error").get("Code")
        if code == "ResourceInUse":
            print(f"Using existing feature group: {feature_group_name}")
            feature_group_already_exists = True
        else:
            raise
    return feature_group, feature_group_already_exists
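
# get_feature_definitions (called above) is not shown in this example; a
# minimal sketch, assuming it mirrors the dtype mapping that
# FeatureGroup.load_feature_definitions applies (float64 -> FRACTIONAL,
# int64 -> INTEGRAL, string -> STRING):
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)


def get_feature_definitions(df, feature_group):
    # feature_group is accepted to match the call above; unused in this sketch
    dtype_to_feature_type = {
        "float64": FeatureTypeEnum.FRACTIONAL,
        "int64": FeatureTypeEnum.INTEGRAL,
        "string": FeatureTypeEnum.STRING,
    }
    return [
        FeatureDefinition(
            feature_name=column,
            feature_type=dtype_to_feature_type[str(df.dtypes[column])],
        )
        for column in df.columns
    ]
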
def test_ingest_multi_process(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        feature_group.ingest(data_frame=pandas_data_frame,
                             max_workers=3,
                             max_processes=2,
                             wait=True)

    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")
Example #16
def test_feature_store_create(
    sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri
):
    feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock)
    feature_group.feature_definitions = feature_group_dummy_definitions
    feature_group.create(
        s3_uri=s3_uri,
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        role_arn=role_arn,
        enable_online_store=True,
    )
    sagemaker_session_mock.create_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup",
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        role_arn=role_arn,
        online_store_config={"EnableOnlineStore": True},
        feature_definitions=[fd.to_dict() for fd in feature_group_dummy_definitions],
    )
Example #17
def test_create_feature_store_online_only(
    feature_store_session,
    role,
    feature_group_name,
    pandas_data_frame,
):
    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=False,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

    assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}")
def test_ingest_without_string_feature(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame_without_string,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(
        data_frame=pandas_data_frame_without_string)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature2",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        ingestion_manager = feature_group.ingest(
            data_frame=pandas_data_frame_without_string,
            max_workers=3,
            wait=False)
        ingestion_manager.wait()

    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")
Example #19
    def __init__(
        self,
        name: str,
        s3_uri: str,
        record_identifier_name: str,
        event_time_name: str,
        query: str,
        description: str = None,
    ):

        region = boto3.Session().region_name
        boto_session = boto3.Session(region_name=region)

        sagemaker_client = boto_session.client(
            service_name="sagemaker", region_name=region
        )
        featurestore_runtime = boto_session.client(
            service_name="sagemaker-featurestore-runtime", region_name=region
        )

        feature_store_session = Session(
            boto_session=boto_session,
            sagemaker_client=sagemaker_client,
            sagemaker_featurestore_runtime_client=featurestore_runtime,
        )

        iam = boto3.client("iam")
        role = iam.get_role(RoleName="AmazonSageMaker-ExecutionRole")["Role"]["Arn"]

        # you can also suffix the feature group name with pipeline git version
        self._feature_group = FeatureGroup(
            name=name, sagemaker_session=feature_store_session
        )
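        # e.g. (hypothetical): name = f"{name}-{git_sha[:7]}" to keep one
        # feature group per pipeline code version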
        self._description = description
        self._s3_uri = s3_uri
        self._role = role
        self._record_identifier_name = record_identifier_name
        self._event_time_name = event_time_name
        self._query = query
Example #20
def create_or_load_feature_group(prefix, feature_group_name):

    # Feature Definitions for our records
    feature_definitions = [
        FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL),
        FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL),
        #        FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
    ]

    feature_group = FeatureGroup(
        name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session
    )

    print("Feature Group: {}".format(feature_group))

    try:
        print(
            "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print("Before CREATE FG wait exeption: {}".format(e))
    #        pass

    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"

        print("Creating Feature Group with role {}...".format(role))
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=True,
        )
        print("Creating Feature Group. Completed.")

        print("Waiting for new Feature Group to become available...")
        wait_for_feature_group_creation_complete(feature_group)
        print("Feature Group available.")
        feature_group.describe()

    except Exception as e:
        print("Exception: {}".format(e))

    return feature_group
# Setup the feature store
timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
print(timestamp)

prefix = 'reviews-feature-store-' + timestamp
print(prefix)

print('List Feature Groups: {}'.format(sm.list_feature_groups()))

from sagemaker.feature_store.feature_group import FeatureGroup

reviews_feature_group_name = 'reviews-feature-group-' + strftime(
    '%d-%H-%M-%S', gmtime())
print(reviews_feature_group_name)

reviews_feature_group = FeatureGroup(name=reviews_feature_group_name,
                                     sagemaker_session=sagemaker_session)
print(reviews_feature_group)

# record identifier and event time feature names
record_identifier_feature_name = "review_id"
event_time_feature_name = "date"


def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == 'object':
            data_frame[label] = data_frame[label].astype("str").astype(
                "string")


def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")
Example #22
def test_one_step_ingestion_pipeline(sagemaker_session, feature_store_session,
                                     feature_definitions, role, pipeline_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.4xlarge")

    input_name = "features.csv"
    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
    input_data_uri = os.path.join("s3://", sagemaker_session.default_bucket(),
                                  "py-sdk-ingestion-test-input/features.csv")
    with open(input_file_path, "r") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    inputs = [
        ProcessingInput(
            input_name=input_name,
            source=input_data_uri,
            destination="/opt/ml/processing/features.csv",
        )
    ]

    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=feature_store_session,
    )

    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
        input_name,
        input_data_uri,
        s3_content_type="csv",
        s3_has_header=True,
    )

    outputs = [
        ProcessingOutput(
            output_name=output_name,
            app_managed=True,
            feature_store_output=FeatureStoreOutput(
                feature_group_name=feature_group_name),
        )
    ]

    temp_flow_path = "./ingestion.flow"
    with cleanup_feature_group(feature_group):
        with open(temp_flow_path, "w") as f:
            json.dump(ingestion_only_flow, f)

        data_wrangler_processor = DataWranglerProcessor(
            role=role,
            data_wrangler_flow_source=temp_flow_path,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            max_runtime_in_seconds=86400,
        )

        data_wrangler_step = ProcessingStep(name="ingestion-step",
                                            processor=data_wrangler_processor,
                                            inputs=inputs,
                                            outputs=outputs)

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_count, instance_type],
            steps=[data_wrangler_step],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            offline_store_s3_uri = os.path.join(
                "s3://", sagemaker_session.default_bucket(),
                feature_group_name)
            feature_group.create(
                s3_uri=offline_store_s3_uri,
                record_identifier_name="f11",
                event_time_feature_name="f10",
                role_arn=role,
                enable_online_store=False,
            )
            _wait_for_feature_group_create(feature_group)

            execution = pipeline.start()
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=60, max_attempts=10)
            except WaiterError:
                pass

            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            assert execution_steps[0]["StepName"] == "ingestion-step"
            assert execution_steps[0]["StepStatus"] == "Succeeded"

            athena_query = feature_group.athena_query()
            with timeout(minutes=10):
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")

                df = athena_query.as_dataframe()
                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
        finally:
            try:
                pipeline.delete()
            except Exception as e:
                print(f"Delete pipeline failed with error: {e}")
            os.remove(temp_flow_path)
Example #23
def create_or_load_feature_group(prefix, feature_group_name):

    # Feature Definitions for the records
    feature_definitions = [
        FeatureDefinition(feature_name='review_id',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='date',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='sentiment',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='label_id',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='input_ids',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='review_body',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='split_type',
                          feature_type=FeatureTypeEnum.STRING)
    ]

    # setup the Feature Group
    feature_group = FeatureGroup(name=feature_group_name,
                                 feature_definitions=feature_definitions,
                                 sagemaker_session=sagemaker_session)

    print('Feature Group: {}'.format(feature_group))

    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print('Before CREATE FG wait exception: {}'.format(e))

    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"

        print('Creating Feature Group with role {}...'.format(role))

        # create Feature Group
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=False)
        print('Creating Feature Group. Completed.')

        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')

        # the information about the Feature Group
        feature_group.describe()

    except Exception as e:
        print('Exception: {}'.format(e))

    return feature_group
Example #24
def test_put_record(sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyFeatureGroup",
                                 sagemaker_session=sagemaker_session_mock)
    feature_group.put_record(record=[])
    sagemaker_session_mock.put_record.assert_called_with(
        feature_group_name="MyFeatureGroup", record=[])
Example #25
def test_feature_store_describe(sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyFeatureGroup",
                                 sagemaker_session=sagemaker_session_mock)
    feature_group.describe()
    sagemaker_session_mock.describe_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup", next_token=None)
Example #26
def test_feature_store_delete(sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyFeatureGroup",
                                 sagemaker_session=sagemaker_session_mock)
    feature_group.delete()
    sagemaker_session_mock.delete_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup")
Example #27
    bucket = args.bucket

    boto_session = boto3.Session(region_name=region)
    sagemaker_client = boto_session.client(service_name='sagemaker')
    featurestore_client = boto_session.client(
        service_name='sagemaker-featurestore-runtime')
    session = sagemaker.session.Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_client)

    # Read feature group name
    with open('/opt/ml/processing/input/feature_group_name.txt') as f:
        feature_group_name = f.read()

    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=session)

    feature_group_query = feature_group.athena_query()
    feature_group_table = feature_group_query.table_name
    print(feature_group_table)

    query_string = (
        f'SELECT label, review_body FROM "{feature_group_table}" '
        f'INNER JOIN ('
        f'SELECT product_id FROM ('
        f'SELECT product_id, avg(star_rating) AS avg_rating, count(*) AS review_count '
        f'FROM "{feature_group_table}" GROUP BY product_id'
        f') WHERE review_count > 1000'
        f') tmp ON "{feature_group_table}".product_id = tmp.product_id;'
    )
    print(query_string)

    dataset = pd.DataFrame()
    feature_group_query.run(query_string=query_string,
                            output_location=f's3://{bucket}/query_results')  # output path assumed
    feature_group_query.wait()
    dataset = feature_group_query.as_dataframe()
Example #28
featurestore_client = boto_session.client(service_name='sagemaker-featurestore-runtime')
session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_client)

# Load input data
input_data_path = '/opt/ml/processing/input/fs_data.tsv'
print('Reading input data from {}'.format(input_data_path))
data = pd.read_csv(input_data_path, sep='\t',
                   on_bad_lines='skip',  # error_bad_lines was removed in pandas 2.0
                   dtype='str')

# Define the feature group name
print('Creating feature group...')
from sagemaker.feature_store.feature_group import FeatureGroup
feature_group = FeatureGroup(name=fg_name, sagemaker_session=session)
# Define the name of the column storing a unique record id (e.g. primary key)
record_identifier_feature_name = 'review_id'
# Add a column to store feature timestamps
event_time_feature_name = 'event_time'
current_time_sec = int(round(time()))
data = data.assign(event_time=current_time_sec)
# Set the correct type for each column
data['review_id']     = data['review_id'].astype('str').astype('string')
data['product_id']    = data['product_id'].astype('str').astype('string')
data['review_body']   = data['review_body'].astype('str').astype('string')
data['label']         = data['label'].astype('str').astype('string')
data['star_rating']   = data['star_rating'].astype('int64')
data['event_time']    = data['event_time'].astype('float64')
# Load feature definitions
feature_group.load_feature_definitions(data_frame=data)
Example #29
featurestore_runtime = boto_session.client(
    service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime)

feature_group_names = [
    args.feature_group_name_ratings, args.feature_group_name_tracks,
    args.feature_group_name_user_preferences
]
feature_groups = []
for name in feature_group_names:
    feature_group = FeatureGroup(name=name,
                                 sagemaker_session=feature_store_session)
    feature_groups.append(feature_group)

feature_group_s3_prefixes = []
for feature_group in feature_groups:
    feature_group_table_name = feature_group.describe().get(
        "OfflineStoreConfig").get("DataCatalogConfig").get("TableName")
    feature_group_s3_prefix = f'{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}'
    feature_group_s3_prefixes.append(feature_group_s3_prefix)


# wait for data to be added to offline feature store
def wait_for_offline_store(feature_group_s3_prefix):
    print(feature_group_s3_prefix)
    offline_store_contents = None
    while offline_store_contents is None:
        # s3 client and offline-store bucket name are assumed helpers here
        objects = s3.list_objects(Bucket=offline_store_bucket,
                                  Prefix=feature_group_s3_prefix)
        if 'Contents' in objects and len(objects['Contents']) > 1:
            print('Data available.')
            offline_store_contents = objects['Contents']
        else:
            print('Waiting for data in offline store...')
            time.sleep(60)
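

# Usage sketch: block until every group's offline store has received data
for s3_prefix in feature_group_s3_prefixes:
    wait_for_offline_store(s3_prefix)
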
def create_or_load_feature_group(prefix, feature_group_name):

    # Feature Definitions for our records
    feature_definitions = [
        FeatureDefinition(feature_name='input_ids',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='input_mask',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='segment_ids',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='label_id',
                          feature_type=FeatureTypeEnum.INTEGRAL),
        FeatureDefinition(feature_name='review_id',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='date',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='label',
                          feature_type=FeatureTypeEnum.INTEGRAL),
        #        FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='split_type',
                          feature_type=FeatureTypeEnum.STRING)
    ]

    feature_group = FeatureGroup(name=feature_group_name,
                                 feature_definitions=feature_definitions,
                                 sagemaker_session=sagemaker_session)

    print('Feature Group: {}'.format(feature_group))

    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print('Before CREATE FG wait exception: {}'.format(e))
#        pass

    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"

        print('Creating Feature Group with role {}...'.format(role))
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=True)
        print('Creating Feature Group. Completed.')

        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')
        feature_group.describe()

    except Exception as e:
        print('Exception: {}'.format(e))


#        pass

#         print('FAILED - NOW Creating Feature Group with service-role {}...'.format('arn:aws:iam::231218423789:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole'))
#         feature_group.create(
#             s3_uri=f"s3://{bucket}/{prefix}",
#             record_identifier_name=record_identifier_feature_name,
#             event_time_feature_name=event_time_feature_name,
#             role_arn='arn:aws:iam::231218423789:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole',
#             enable_online_store=True
#         )
#         print('Creating Feature Group. Completed.')

#    feature_group.describe()

    return feature_group