def test_as_hive_ddl_with_default_values(
    create_table_ddl, feature_group_dummy_definitions, sagemaker_session_mock
):
    """Check ``as_hive_ddl()`` called without arguments.

    The mocked session reports an offline store S3 configuration, account
    "1234" and region "us-west-2".  The DDL returned by the feature group
    must equal the ``create_table_ddl`` template rendered with the database
    "sagemaker_featurestore" and the group name "MyGroup" as table name.
    """
    # Canned describe_feature_group response exposing the offline store URIs.
    s3_storage_config = {
        "S3Uri": "s3://some-bucket",
        "ResolvedOutputS3Uri": "s3://resolved_output_s3_uri",
    }
    describe_response = {"OfflineStoreConfig": {"S3StorageConfig": s3_storage_config}}
    sagemaker_session_mock.describe_feature_group.return_value = describe_response
    sagemaker_session_mock.account_id.return_value = "1234"
    sagemaker_session_mock.boto_session.region_name = "us-west-2"

    group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    group.feature_definitions = feature_group_dummy_definitions

    expected_ddl = create_table_ddl.format(
        database="sagemaker_featurestore",
        table_name="MyGroup",
        account="1234",
        region="us-west-2",
        feature_group_name="MyGroup",
    )
    actual_ddl = group.as_hive_ddl()

    assert expected_ddl == actual_ddl
def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    """Create a feature group, ingest data and verify it through Athena.

    Steps:
      1. Load feature definitions from ``pandas_data_frame`` and create the
         feature group with the online store enabled.
      2. Ingest one record via ``put_record`` and the data frame via
         ``ingest`` (3 workers), asserting that no rows failed.
      3. Poll Athena until the offline store table holds the expected number
         of rows, then check the row count, the NaN values in ``feature4``
         and the generated Hive DDL.
      4. Check that the ARN returned by ``create`` ends with the group name.
    """
    # NOTE(review): presumably the rows of ``pandas_data_frame`` plus the
    # single ``record`` -- confirm against the fixtures.
    expected_row_count = 11

    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        # Ingest data
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(
            data_frame=pandas_data_frame, max_workers=3, wait=False
        )
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table.
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < expected_row_count:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution"
                ).get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                # Only back off when another poll is needed; sleeping after
                # the final successful query just burns the timeout budget.
                if df.shape[0] < expected_row_count:
                    time.sleep(60)

        assert df.shape[0] == expected_row_count

        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8, 9])]["feature4"])
        # Assert on the boolean values themselves.  Iterating ``nans.items()``
        # yields (index, value) tuples, which are always truthy, so asserting
        # on them can never fail.
        assert nans.all()

        assert (
            create_table_ddl.format(
                feature_group_name=feature_group_name,
                region=feature_store_session.boto_session.region_name,
                account=feature_store_session.account_id(),
            )
            == feature_group.as_hive_ddl()
        )

    assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}")