def save_to_feature_store():
    """Read the insurance dataset from S3 and ingest it into a Feature Group.

    Uses module-level configuration (``boto_session``, ``sagemaker_client``,
    ``featurestore_runtime``, ``feature_s3_url``, ``default_bucket``,
    ``prefix``) and binds the created/loaded group to the module-level
    ``feature_group``. The group name is time-stamped, so a fresh group is
    normally created on each run.
    """
    logger.info("Save to FeatureStore started")
    global feature_group

    df_data = pd.read_csv(feature_s3_url)
    logger.info("Read data from S3: %s", df_data.head())

    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime)

    # You can modify the following to use a bucket of your choosing
    logger.info("Default bucket: %s", default_bucket)

    # record identifier and event time feature names
    record_identifier_feature_name = "IDpol"
    event_time_feature_name = "EventTime"
    current_time_sec = int(round(time.time()))

    # cast object dtype to string. The SageMaker FeatureStore Python SDK will
    # then map the string dtype to String feature type.
    cast_object_to_string(df_data)
    # append EventTime feature (same timestamp for every row of this batch)
    df_data[event_time_feature_name] = pd.Series(
        [current_time_sec] * len(df_data), dtype="float64")

    feature_group_name = 'insurance-policy-feature-group-' + strftime(
        '%d-%H-%M-%S', gmtime())
    logger.info("Feature Group Name: %s", feature_group_name)

    # Check if feature group already exists. Create a feature group if it
    # doesn't exist.
    if not feature_group_exist(feature_group_name):
        # BUG FIX: this log line referenced the module-level `feature_group`,
        # which may be unassigned at this point (NameError) and in any case is
        # not the name that was checked; log `feature_group_name` instead.
        logger.info("Feature Group: %s doesn't exist. Create a new one.",
                    feature_group_name)
        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)
        # load feature definitions to the feature group. SageMaker FeatureStore
        # Python SDK will auto-detect the data schema based on input data.
        feature_group.load_feature_definitions(data_frame=df_data)  # output is suppressed
        feature_group.create(
            s3_uri=f"s3://{default_bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=get_execution_role(),
            enable_online_store=True)
        wait_for_feature_group_creation_complete(feature_group=feature_group)
        feature_group.describe()
    else:
        # BUG FIX: message read "exits" and logged the group object; log the
        # checked name with the intended wording.
        logger.info("Feature Group: %s exists", feature_group_name)
        # Init feature group object if already exists
        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)

    # ingest data into feature store
    feature_group.ingest(data_frame=df_data, max_workers=5, wait=True)
def _wait_for_feature_group_create(feature_group: FeatureGroup): status = feature_group.describe().get("FeatureGroupStatus") while status == "Creating": print("Waiting for Feature Group Creation") time.sleep(5) status = feature_group.describe().get("FeatureGroupStatus") if status != "Created": print(feature_group.describe()) raise RuntimeError(f"Failed to create feature group {feature_group.name}") print(f"FeatureGroup {feature_group.name} successfully created.")
def create_or_load_feature_group(prefix, feature_group_name):
    """Create the tokenized-review Feature Group, or load it if it exists.

    Relies on module-level `sagemaker_session`, `role`, `bucket` and the
    `wait_for_feature_group_creation_complete` helper.
    """
    # Record schema: (feature name, feature type), in storage order.
    schema = [
        ("input_ids", FeatureTypeEnum.STRING),
        ("input_mask", FeatureTypeEnum.STRING),
        ("segment_ids", FeatureTypeEnum.STRING),
        ("label_id", FeatureTypeEnum.INTEGRAL),
        ("review_id", FeatureTypeEnum.STRING),
        ("date", FeatureTypeEnum.STRING),
        ("label", FeatureTypeEnum.INTEGRAL),
        ("split_type", FeatureTypeEnum.STRING),
    ]
    feature_definitions = [
        FeatureDefinition(feature_name=feature_name, feature_type=feature_type)
        for feature_name, feature_type in schema
    ]

    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=sagemaker_session,
    )
    print(f"Feature Group: {feature_group}")

    # Another instance in the cluster may be creating this group concurrently;
    # give it a chance to finish first. A failure here just means the group
    # does not exist yet.
    try:
        print(
            "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print(f"Before CREATE FG wait exeption: {e}")

    # Best-effort create: if the group already exists, create() raises and the
    # exception is printed, leaving the existing group object to be returned.
    try:
        print(f"Creating Feature Group with role {role}...")
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name="review_id",
            event_time_feature_name="date",
            role_arn=role,
            enable_online_store=True,
        )
        print("Creating Feature Group. Completed.")

        print("Waiting for new Feature Group to become available...")
        wait_for_feature_group_creation_complete(feature_group)
        print("Feature Group available.")
        feature_group.describe()
    except Exception as e:
        print(f"Exception: {e}")

    return feature_group
def test_feature_store_describe(sagemaker_session_mock):
    """describe() must delegate to the session with the group name and no pagination token."""
    group = FeatureGroup(name="MyFeatureGroup",
                         sagemaker_session=sagemaker_session_mock)
    group.describe()
    sagemaker_session_mock.describe_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup",
        next_token=None,
    )
def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    """End-to-end: create a feature group, ingest rows, and verify the offline
    store through an Athena query and the generated Hive DDL."""
    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)
    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)
        resolved_output_s3_uri = (
            feature_group.describe().get("OfflineStoreConfig").get(
                "S3StorageConfig").get("ResolvedOutputS3Uri"))

        # Ingest data
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(data_frame=pandas_data_frame,
                                                 max_workers=3,
                                                 wait=False)
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table. Offline-store replication is
        # asynchronous, so poll until all 11 rows (10 ingested + 1 put_record)
        # show up, bounded by a 10-minute timeout.
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < 11:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                time.sleep(60)

        assert df.shape[0] == 11
        # BUG FIX: the original looped `for is_na in nans.items(): assert is_na`.
        # Series.items() yields (index, value) tuples, and a non-empty tuple is
        # always truthy, so that assertion could never fail. Assert the values.
        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8, 9])]["feature4"])
        assert nans.all()
        assert (create_table_ddl.format(
            feature_group_name=feature_group_name,
            region=feature_store_session.boto_session.region_name,
            account=feature_store_session.account_id(),
            resolved_output_s3_uri=resolved_output_s3_uri,
        ) == feature_group.as_hive_ddl())
        assert output["FeatureGroupArn"].endswith(
            f"feature-group/{feature_group_name}")
# Create feature group
feature_group.create(
    s3_uri='s3://{}/{}'.format(bucket, prefix),
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
    description="1.8M+ tokenized camera reviews from the Amazon Customer Reviews dataset",
    tags=[
        {'Key': 'Dataset', 'Value': 'amazon customer reviews'},
        {'Key': 'Subset', 'Value': 'cameras'},
        {'Key': 'Owner', 'Value': 'Julien Simon'}
    ]
)

# Wait for feature group to be ready (1s poll; create is asynchronous)
while feature_group.describe().get("FeatureGroupStatus") != 'Created':
    sleep(1)
print('Feature group created')

# Ingest data
print('Ingesting data...')
try:
    feature_group.ingest(data_frame=data, max_workers=max_workers, wait=True)
except Exception as e:
    # BUG FIX: the original `except Exception: pass` silently swallowed
    # ingestion failures and then waited 10 minutes for data that may never
    # arrive. Keep the best-effort flow, but surface the error.
    print('Ingestion raised an exception (continuing): {}'.format(e))
print('Waiting...')

# Wait for 10 minutes to make sure data has flowed to the offline store
# https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-offline.html
sleep(600)
# NOTE(review): this chunk is truncated — it begins mid-call (the tail of a
# Session(...) construction whose first arguments are outside this view) and
# ends inside wait_for_offline_store's while-loop at a dangling `else:`.
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime)

# The three feature groups this job operates on, named via CLI arguments.
feature_group_names = [
    args.feature_group_name_ratings,
    args.feature_group_name_tracks,
    args.feature_group_name_user_preferences
]
feature_groups = []
for name in feature_group_names:
    feature_group = FeatureGroup(name=name,
                                 sagemaker_session=feature_store_session)
    feature_groups.append(feature_group)

# Derive each group's offline-store S3 prefix from its Glue catalog table name.
feature_group_s3_prefixes = []
for feature_group in feature_groups:
    feature_group_table_name = feature_group.describe().get(
        "OfflineStoreConfig").get("DataCatalogConfig").get("TableName")
    feature_group_s3_prefix = f'{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}'
    feature_group_s3_prefixes.append(feature_group_s3_prefix)

# wait for data to be added to offline feature store
def wait_for_offline_store(feature_group_s3_prefix):
    # Polls S3 under the given prefix until it contains data objects
    # (offline-store replication is asynchronous after ingest).
    print(feature_group_s3_prefix)
    offline_store_contents = None
    while (offline_store_contents is None):
        objects_in_bucket = s3_client.list_objects(
            Bucket=bucket, Prefix=feature_group_s3_prefix)
        # More than one key means real data has landed, not just placeholders.
        if ('Contents' in objects_in_bucket
                and len(objects_in_bucket['Contents']) > 1):
            offline_store_contents = objects_in_bucket['Contents']
        else:
            # (loop body truncated here — presumably sleeps and retries)
def create_or_load_feature_group(prefix, feature_group_name):
    '''Create the BERT-feature Feature Group, or load it if it already exists.

    Uses module-level `sagemaker_session`, `role`, `bucket` and the
    `wait_for_feature_group_creation_complete` helper.
    '''
    # Feature name -> type, in the order records are stored.
    feature_types = {
        'input_ids': FeatureTypeEnum.STRING,
        'input_mask': FeatureTypeEnum.STRING,
        'segment_ids': FeatureTypeEnum.STRING,
        'label_id': FeatureTypeEnum.INTEGRAL,
        'review_id': FeatureTypeEnum.STRING,
        'date': FeatureTypeEnum.STRING,
        'label': FeatureTypeEnum.INTEGRAL,
        'split_type': FeatureTypeEnum.STRING,
    }
    feature_definitions = [
        FeatureDefinition(feature_name=name, feature_type=ftype)
        for name, ftype in feature_types.items()
    ]

    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=sagemaker_session,
    )
    print('Feature Group: {}'.format(feature_group))

    # If another instance in the cluster is already creating this group, wait
    # for it; a failure here just means the group does not exist yet.
    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print('Before CREATE FG wait exeption: {}'.format(e))

    record_identifier_feature_name = "review_id"
    event_time_feature_name = "date"
    # Best-effort create: an already-existing group makes create() raise; the
    # exception is printed and the existing group object is returned unchanged.
    try:
        print('Creating Feature Group with role {}...'.format(role))
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=True,
        )
        print('Creating Feature Group. Completed.')
        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')
        feature_group.describe()
    except Exception as e:
        print('Exception: {}'.format(e))

    return feature_group
def create_or_load_feature_group(prefix, feature_group_name):
    '''Create the sentiment-review Feature Group (offline store only), or load it.'''
    # Every field in a record is stored as a STRING feature.
    feature_definitions = [
        FeatureDefinition(feature_name=field_name,
                          feature_type=FeatureTypeEnum.STRING)
        for field_name in (
            'review_id', 'date', 'sentiment', 'label_id',
            'input_ids', 'review_body', 'split_type',
        )
    ]

    # setup the Feature Group
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=sagemaker_session,
    )
    print('Feature Group: {}'.format(feature_group))

    # Wait for a concurrent creation by another cluster instance, if any; a
    # failure here just means the group does not exist yet.
    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print('Before CREATE FG wait exeption: {}'.format(e))

    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"
        print('Creating Feature Group with role {}...'.format(role))
        # create Feature Group — online store disabled, this group only backs
        # offline (Athena) queries.
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=False,
        )
        print('Creating Feature Group. Completed.')
        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')
        # the information about the Feature Group
        feature_group.describe()
    except Exception as e:
        print('Exception: {}'.format(e))

    return feature_group
def test_feature_store_describe(sagemaker_session_mock):
    """describe() must delegate to Session.describe_feature_group with the group name."""
    feature_group = FeatureGroup(name="MyFeatureGroup",
                                 sagemaker_session=sagemaker_session_mock)
    feature_group.describe()
    # BUG FIX: `called_with` is not a Mock assertion method — it just returns a
    # new (truthy) Mock, so `assert mock.called_with(...)` could never fail.
    # Use the real assertion, including the next_token=None the call passes.
    sagemaker_session_mock.describe_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup", next_token=None)
class FeatureGroupDataSet(AbstractDataSet):
    """Dataset that saves a pandas DataFrame to a SageMaker Feature Group and
    loads it back via an Athena query against the group's offline store.
    """

    def __init__(
        self,
        name: str,
        s3_uri: str,
        record_identifier_name: str,
        event_time_name: str,
        query: str,
        description: str = None,
        role_name: str = "AmazonSageMaker-ExecutionRole",
    ):
        """
        Args:
            name: feature group name (you can also suffix it with the
                pipeline git version).
            s3_uri: offline-store S3 location.
            record_identifier_name: feature used as the record identifier.
            event_time_name: feature used as the event time.
            query: Athena SQL template with a ``{table_name}`` placeholder.
            description: optional feature group description.
            role_name: IAM role to resolve into an ARN for feature group
                creation. GENERALIZED: previously hard-coded; the default
                preserves the old behavior for existing callers.
        """
        region = boto3.Session().region_name
        boto_session = boto3.Session(region_name=region)
        sagemaker_client = boto_session.client(
            service_name="sagemaker", region_name=region
        )
        featurestore_runtime = boto_session.client(
            service_name="sagemaker-featurestore-runtime", region_name=region
        )
        feature_store_session = Session(
            boto_session=boto_session,
            sagemaker_client=sagemaker_client,
            sagemaker_featurestore_runtime_client=featurestore_runtime,
        )
        iam = boto3.client("iam")
        role = iam.get_role(RoleName=role_name)["Role"]["Arn"]

        self._feature_group = FeatureGroup(
            name=name, sagemaker_session=feature_store_session
        )
        self._description = description
        self._s3_uri = s3_uri
        self._role = role
        self._record_identifier_name = record_identifier_name
        self._event_time_name = event_time_name
        self._query = query

    def _wait_for_feature_group_creation_complete(self):
        """Poll every 5s until creation finishes; raise if it did not succeed."""
        status = self._feature_group.describe().get("FeatureGroupStatus")
        while status == "Creating":
            logger.info("Waiting for Feature Group Creation")
            time.sleep(5)
            status = self._feature_group.describe().get("FeatureGroupStatus")
        if status != "Created":
            raise RuntimeError(
                f"Failed to create feature group {self._feature_group.name}"
            )
        logger.info("FeatureGroup %s successfully created.", self._feature_group.name)

    def _describe(self):
        return dict(feature_group=self._feature_group)

    def _save(self, data):
        """Create the feature group if needed, then ingest `data` into it."""
        self._feature_group.load_feature_definitions(data)
        try:
            self._feature_group.create(
                description=self._description,
                s3_uri=self._s3_uri,
                record_identifier_name=self._record_identifier_name,
                event_time_feature_name=self._event_time_name,
                role_arn=self._role,
                enable_online_store=True,
            )
            self._wait_for_feature_group_creation_complete()
        except Exception as exc:
            # An already-existing group is fine; anything else propagates.
            if (
                f"Resource Already Exists: FeatureGroup with name {self._feature_group.name} already exists"
                in str(exc)
            ):
                pass
            else:
                raise
        self._feature_group.ingest(data[:10])  # just for demo purpose

    def _load(self) -> pd.DataFrame:
        """Run the configured Athena query against the group's offline store."""
        query = self._feature_group.athena_query()
        print(self._query.format(table_name=query.table_name))
        query.run(
            self._query.format(table_name=query.table_name),
            output_location=f"{self._s3_uri}/query_results/",
        )
        query.wait()
        return query.as_dataframe()
args = parser.parse_args()
region = args.region
boto3.setup_default_session(region_name=region)
s3_client = boto3.client("s3")
account_id = boto3.client("sts").get_caller_identity()["Account"]
now = pd.to_datetime("now")

feature_store_session = sagemaker.Session()
claims_feature_group = FeatureGroup(name=args.claims_feature_group_name,
                                    sagemaker_session=feature_store_session)
customers_feature_group = FeatureGroup(
    name=args.customers_feature_group_name, sagemaker_session=feature_store_session
)

# PERF FIX: describe() is a remote API call; the original issued it three times
# (twice for the customers group). Fetch each group's catalog config once.
claims_catalog_config = (
    claims_feature_group.describe()["OfflineStoreConfig"]["DataCatalogConfig"]
)
customers_catalog_config = (
    customers_feature_group.describe()["OfflineStoreConfig"]["DataCatalogConfig"]
)
claims_table_name = claims_catalog_config["TableName"]
customers_table_name = customers_catalog_config["TableName"]
athena_database_name = customers_catalog_config["Database"]
print(f'claims_table_name: {claims_table_name}')
print(f'customers_table_name: {customers_table_name}')

# Offline-store objects are partitioned by account/region/table/year/month/day.
day_partition = f'year={now.year}/month={now.strftime("%m")}/day={now.strftime("%d")}'
claims_feature_group_s3_prefix = f'{args.bucket_prefix}/{account_id}/sagemaker/{region}/offline-store/{claims_table_name}/data/{day_partition}'
customers_feature_group_s3_prefix = f'{args.bucket_prefix}/{account_id}/sagemaker/{region}/offline-store/{customers_table_name}/data/{day_partition}'
print(f'claims_feature_group_s3_prefix: {claims_feature_group_s3_prefix}')
print(f'customers_feature_group_s3_prefix: {customers_feature_group_s3_prefix}')