def test_ingest_multi_process(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
):
    """Ingest a data frame using multiple worker processes, then check the ARN."""
    group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    group.load_feature_definitions(data_frame=pandas_data_frame)
    with cleanup_feature_group(group):
        create_output = group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(group)
        # Blocking ingest fanned out over 2 processes x 3 workers each.
        group.ingest(
            data_frame=pandas_data_frame,
            max_workers=3,
            max_processes=2,
            wait=True,
        )
        assert create_output["FeatureGroupArn"].endswith(
            f"feature-group/{feature_group_name}"
        )
def test_ingest_with_profile_name(
    ingestion_manager_init, sagemaker_session_mock, fs_runtime_client_config_mock
):
    """ingest() should forward profile_name to the IngestionManager constructor."""
    sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = (
        fs_runtime_client_config_mock
    )
    group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    # Wide frame (300 float columns) with a single row.
    wide_df = pd.DataFrame(
        {f"float{i}": pd.Series([2.0], dtype="float64") for i in range(300)}
    )
    manager = Mock()
    ingestion_manager_init.return_value = manager

    group.ingest(data_frame=wide_df, max_workers=10, profile_name="profile_name")

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock,
        max_workers=10,
        max_processes=1,
        profile_name="profile_name",
    )
    manager.run.assert_called_once_with(data_frame=wide_df, wait=True, timeout=None)
def test_ingest_zero_workers(sagemaker_session_mock):
    """ingest() must reject a non-positive max_workers with RuntimeError.

    Fix: `sagemaker_session_mock` is a pytest fixture and therefore must be
    declared as a parameter — the original body referenced it as an undefined
    global, which would raise NameError before the behavior under test ran.
    """
    feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    df = Mock()
    with pytest.raises(RuntimeError) as error:
        feature_group.ingest(data_frame=df, max_workers=0, max_processes=1)
    # Inspect the raised exception itself, not the ExceptionInfo wrapper.
    assert "max_workers must be greater than 0." in str(error.value)
def save_to_feature_store():
    """Load the CSV from S3 and ingest it into a SageMaker Feature Group.

    Creates the feature group when it does not exist yet, then ingests the
    frame. Relies on module-level globals (assumed defined elsewhere in the
    file): ``boto_session``, ``sagemaker_client``, ``featurestore_runtime``,
    ``feature_s3_url``, ``default_bucket``, ``prefix``, ``logger`` and the
    helpers ``feature_group_exist``, ``cast_object_to_string``,
    ``wait_for_feature_group_creation_complete``.
    """
    logger.info("Save to FeatureStore started")
    global feature_group

    df_data = pd.read_csv(feature_s3_url)
    logger.info("Read data from S3: %s", df_data.head())

    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime,
    )
    # You can modify the following to use a bucket of your choosing
    logger.info("Default bucket: %s", default_bucket)

    # record identifier and event time feature names
    record_identifier_feature_name = "IDpol"
    event_time_feature_name = "EventTime"
    current_time_sec = int(round(time.time()))
    # cast object dtype to string. The SageMaker FeatureStore Python SDK will
    # then map the string dtype to String feature type.
    cast_object_to_string(df_data)
    # append EventTime feature (same timestamp for every row)
    df_data[event_time_feature_name] = pd.Series(
        [current_time_sec] * len(df_data), dtype="float64"
    )

    feature_group_name = "insurance-policy-feature-group-" + strftime(
        "%d-%H-%M-%S", gmtime()
    )
    logger.info("Feature Group Name: %s", feature_group_name)

    # Check if feature group already exists. Create a feature group if it doesn't.
    # Fix: use `not ...` instead of `== False`, and log `feature_group_name`
    # instead of the global `feature_group`, which may be unset at this point.
    if not feature_group_exist(feature_group_name):
        logger.info(
            "Feature Group: %s doesn't exist. Create a new one.", feature_group_name
        )
        feature_group = FeatureGroup(
            name=feature_group_name, sagemaker_session=feature_store_session
        )
        # load feature definitions to the feature group. SageMaker FeatureStore
        # Python SDK will auto-detect the data schema based on input data.
        feature_group.load_feature_definitions(data_frame=df_data)  # output is suppressed
        feature_group.create(
            s3_uri=f"s3://{default_bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=get_execution_role(),
            enable_online_store=True,
        )
        wait_for_feature_group_creation_complete(feature_group=feature_group)
        feature_group.describe()
    else:
        # Fix: "exits" typo and wrong log argument.
        logger.info("Feature Group: %s exists", feature_group_name)
        # Init feature group object if already exists
        feature_group = FeatureGroup(
            name=feature_group_name, sagemaker_session=feature_store_session
        )

    # ingest data into feature store
    feature_group.ingest(data_frame=df_data, max_workers=5, wait=True)
def test_ingest_default_max_workers(ingestion_manager_init, sagemaker_session_mock):
    """Omitting max_workers should fall back to a single worker."""
    group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    frame = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")})
    manager = Mock()
    ingestion_manager_init.return_value = manager

    group.ingest(data_frame=frame)

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_session=sagemaker_session_mock,
        data_frame=frame,
        max_workers=1,
    )
    manager.run.assert_called_once_with(wait=True, timeout=None)
def test_ingest(ingestion_manager_init, sagemaker_session_mock):
    """ingest() should construct an IngestionManager and run it synchronously."""
    group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    # Wide single-row frame with 300 float columns.
    wide_df = pd.DataFrame(
        {f"float{i}": pd.Series([2.0], dtype="float64") for i in range(300)}
    )
    manager = Mock()
    ingestion_manager_init.return_value = manager

    group.ingest(data_frame=wide_df, max_workers=10)

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_session=sagemaker_session_mock,
        data_frame=wide_df,
        max_workers=10,
    )
    manager.run.assert_called_once_with(wait=True, timeout=None)
def test_ingest_without_string_feature(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame_without_string,
):
    """Asynchronous ingest of a frame with no string features, waited on via the manager."""
    group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    group.load_feature_definitions(data_frame=pandas_data_frame_without_string)
    with cleanup_feature_group(group):
        create_output = group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature2",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(group)
        # Non-blocking ingest: wait=False returns the manager to wait on.
        manager = group.ingest(
            data_frame=pandas_data_frame_without_string, max_workers=3, wait=False
        )
        manager.wait()
        assert create_output["FeatureGroupArn"].endswith(
            f"feature-group/{feature_group_name}"
        )
def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    """End-to-end: create a feature group, ingest, and query the offline store.

    Fix: the NaN check previously iterated ``nans.items()`` and asserted each
    ``(index, value)`` tuple — a non-empty tuple is always truthy, so the
    assertion could never fail. It now asserts the boolean values themselves.
    """
    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)
    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        # Ingest data: one single record via put_record, then the whole frame.
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(
            data_frame=pandas_data_frame, max_workers=3, wait=False
        )
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table; offline-store replication is
        # eventually consistent, so poll until all 11 rows are visible.
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < 11:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                time.sleep(60)

        assert df.shape[0] == 11
        # Rows with feature1 in 5..9 must have a missing feature4.
        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8, 9])]["feature4"])
        for _, is_na in nans.items():
            assert is_na
        assert (create_table_ddl.format(
            feature_group_name=feature_group_name,
            region=feature_store_session.boto_session.region_name,
            account=feature_store_session.account_id(),
        ) == feature_group.as_hive_ddl())
    assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}")
role_arn=role, enable_online_store=True, description="1.8M+ tokenized camera reviews from the Amazon Customer Reviews dataset", tags=[ { 'Key': 'Dataset', 'Value': 'amazon customer reviews' }, { 'Key': 'Subset', 'Value': 'cameras' }, { 'Key': 'Owner', 'Value': 'Julien Simon' } ] ) # Wait for feature group to be ready while feature_group.describe().get("FeatureGroupStatus") != 'Created': sleep(1) print('Feature group created') # Ingest data print('Ingesting data...') try: feature_group.ingest(data_frame=data, max_workers=max_workers, wait=True) except Exception: pass print('Waiting...') # Wait for 10 minutes to make sure data has flowed to the offline store # https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-offline.html sleep(600) # Save feature group name with open('/opt/ml/processing/output/feature_group_name.txt', 'w') as f: f.write(fg_name) print('Job complete')
class FeatureGroupDataSet(AbstractDataSet):
    """Data-set wrapper that saves a pandas frame to a SageMaker Feature Group
    and loads it back via an Athena query against the offline store.

    NOTE(review): the IAM role name "AmazonSageMaker-ExecutionRole" is
    hard-coded below — confirm it exists in the target account.
    """

    def __init__(
        self,
        name: str,
        s3_uri: str,
        record_identifier_name: str,
        event_time_name: str,
        query: str,
        description: str = None,
    ):
        """Build the boto/SageMaker sessions and the FeatureGroup handle.

        Args:
            name: feature group name.
            s3_uri: offline-store S3 location (also used for query results).
            record_identifier_name: record identifier feature name.
            event_time_name: event time feature name.
            query: Athena SQL template with a ``{table_name}`` placeholder.
            description: optional feature group description.
        """
        region = boto3.Session().region_name
        boto_session = boto3.Session(region_name=region)
        sagemaker_client = boto_session.client(
            service_name="sagemaker", region_name=region
        )
        featurestore_runtime = boto_session.client(
            service_name="sagemaker-featurestore-runtime", region_name=region
        )
        feature_store_session = Session(
            boto_session=boto_session,
            sagemaker_client=sagemaker_client,
            sagemaker_featurestore_runtime_client=featurestore_runtime,
        )
        iam = boto3.client("iam")
        role = iam.get_role(RoleName="AmazonSageMaker-ExecutionRole")["Role"]["Arn"]
        # you can also suffix the feature group name with pipeline git version
        self._feature_group = FeatureGroup(
            name=name, sagemaker_session=feature_store_session
        )
        self._description = description
        self._s3_uri = s3_uri
        self._role = role
        self._record_identifier_name = record_identifier_name
        self._event_time_name = event_time_name
        self._query = query

    def _wait_for_feature_group_creation_complete(self):
        """Poll describe() until the group leaves 'Creating'; raise on failure."""
        status = self._feature_group.describe().get("FeatureGroupStatus")
        while status == "Creating":
            logger.info("Waiting for Feature Group Creation")
            time.sleep(5)
            status = self._feature_group.describe().get("FeatureGroupStatus")
        if status != "Created":
            raise RuntimeError(
                f"Failed to create feature group {self._feature_group.name}"
            )
        logger.info("FeatureGroup %s successfully created.", self._feature_group.name)

    def _describe(self):
        """Return the descriptive attributes for this data set."""
        return dict(feature_group=self._feature_group)

    def _save(self, data):
        """Create the feature group (idempotently) and ingest ``data``."""
        # Schema is auto-detected from the incoming frame.
        self._feature_group.load_feature_definitions(data)
        try:
            self._feature_group.create(
                description=self._description,
                s3_uri=self._s3_uri,
                record_identifier_name=self._record_identifier_name,
                event_time_feature_name=self._event_time_name,
                role_arn=self._role,
                enable_online_store=True,
            )
            self._wait_for_feature_group_creation_complete()
        except Exception as exc:
            # Treat "already exists" as success; re-raise anything else.
            if (
                f"Resource Already Exists: FeatureGroup with name {self._feature_group.name} already exists"
                in str(exc)
            ):
                pass
            else:
                raise
        self._feature_group.ingest(data[:10])  # just for demo purpose

    def _load(self) -> pd.DataFrame:
        """Run the configured Athena query against the offline store."""
        query = self._feature_group.athena_query()
        print(self._query.format(table_name=query.table_name))
        query.run(
            self._query.format(table_name=query.table_name),
            output_location=f"{self._s3_uri}/query_results/",
        )
        query.wait()
        return query.as_dataframe()