def test_as_hive_ddl_with_default_values(
    create_table_ddl, feature_group_dummy_definitions, sagemaker_session_mock
):
    """The generated Hive DDL should use the default database and table names."""
    sagemaker_session_mock.describe_feature_group.return_value = {
        "OfflineStoreConfig": {
            "S3StorageConfig": {
                "S3Uri": "s3://some-bucket",
                "ResolvedOutputS3Uri": "s3://resolved_output_s3_uri",
            }
        }
    }
    sagemaker_session_mock.account_id.return_value = "1234"
    sagemaker_session_mock.boto_session.region_name = "us-west-2"

    group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    group.feature_definitions = feature_group_dummy_definitions

    expected_ddl = create_table_ddl.format(
        database="sagemaker_featurestore",
        table_name="MyGroup",
        account="1234",
        region="us-west-2",
        feature_group_name="MyGroup",
    )
    assert expected_ddl == group.as_hive_ddl()
def test_ingest_with_profile_name(
    ingestion_manager_init, sagemaker_session_mock, fs_runtime_client_config_mock
):
    """ingest() should forward profile_name through to the IngestionManager."""
    sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = (
        fs_runtime_client_config_mock
    )
    group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    # Wide frame (300 columns) to mirror a realistic ingestion payload.
    frame = pd.DataFrame(
        {f"float{i}": pd.Series([2.0], dtype="float64") for i in range(300)}
    )

    manager = Mock()
    ingestion_manager_init.return_value = manager

    group.ingest(data_frame=frame, max_workers=10, profile_name="profile_name")

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock,
        max_workers=10,
        max_processes=1,
        profile_name="profile_name",
    )
    manager.run.assert_called_once_with(data_frame=frame, wait=True, timeout=None)
def test_ingest_zero_workers(sagemaker_session_mock):
    """ingest() must reject max_workers=0 with a RuntimeError.

    Bug fix: ``sagemaker_session_mock`` was used in the body but not declared
    as a pytest fixture parameter, so the test referenced the module-level
    fixture function (or raised NameError) instead of the mock session.
    """
    feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    df = Mock()
    with pytest.raises(RuntimeError) as error:
        feature_group.ingest(data_frame=df, max_workers=0, max_processes=1)
    # Assert on the exception message itself, not the ExceptionInfo repr.
    assert "max_workers must be greater than 0." in str(error.value)
def test_feature_store_create(sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri):
    """create() should translate its arguments into the expected session call."""
    group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock)
    group.feature_definitions = feature_group_dummy_definitions

    group.create(
        s3_uri=s3_uri,
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        role_arn=role_arn,
        enable_online_store=True,
    )

    expected_definitions = [fd.to_dict() for fd in feature_group_dummy_definitions]
    sagemaker_session_mock.create_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup",
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        feature_definitions=expected_definitions,
        role_arn=role_arn,
        description=None,
        tags=None,
        online_store_config={"EnableOnlineStore": True},
        offline_store_config={
            "DisableGlueTableCreation": False,
            "S3StorageConfig": {"S3Uri": s3_uri},
        },
    )
def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    """End-to-end: create a feature group, ingest records, query via Athena.

    Bug fix: the NaN check previously did ``for is_na in nans.items(): assert
    is_na`` — ``Series.items()`` yields ``(index, value)`` tuples and a
    non-empty tuple is always truthy, so the assertion could never fail.
    It now unpacks the pair and asserts the boolean value itself.
    """
    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)
    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        # Ingest one record via put_record plus the full frame via ingest().
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(
            data_frame=pandas_data_frame, max_workers=3, wait=False
        )
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table. The offline store is eventually
        # consistent, so poll until all 11 rows show up (bounded by timeout).
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < 11:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                time.sleep(60)

        assert df.shape[0] == 11
        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8, 9])]["feature4"])
        # Fixed: assert the values, not the (index, value) tuples.
        for _, is_na in nans.items():
            assert is_na
        assert (
            create_table_ddl.format(
                feature_group_name=feature_group_name,
                region=feature_store_session.boto_session.region_name,
                account=feature_store_session.account_id(),
            )
            == feature_group.as_hive_ddl()
        )
        assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}")
def cleanup_feature_group(feature_group: FeatureGroup):
    """Yield to the caller, then silently best-effort delete the feature group.

    NOTE(review): generator-based context helper — presumably wrapped with
    ``@contextmanager`` at the definition site (it is used as
    ``with cleanup_feature_group(...)`` elsewhere in this file); the decorator
    is not visible in this chunk — confirm.
    """
    try:
        yield
    finally:
        try:
            feature_group.delete()
        except Exception:
            # Deliberate: cleanup must never raise over the test's own result.
            pass
def cleanup_feature_group(feature_group: FeatureGroup):
    """Yield to the caller, then best-effort delete the feature group,
    logging success or failure.

    Deletion failures are printed and swallowed so cleanup never masks the
    test outcome. NOTE(review): presumably decorated with ``@contextmanager``
    where defined — the decorator is outside this chunk; confirm.
    """
    try:
        yield
    finally:
        try:
            feature_group.delete()
            print("FeatureGroup cleaned up")
        except Exception as e:
            # Swallow on purpose: cleanup is best-effort.
            print(f"Delete FeatureGroup failed with error: {e}.")
            pass
def cleanup_feature_group(feature_group: FeatureGroup):
    """Yield to the caller, then delete the feature group on exit.

    Raises:
        RuntimeError: if deletion fails. Improvement: the underlying
        exception is now chained (``from e``) so the root cause of the
        failed delete is preserved instead of being discarded.
    """
    try:
        yield
    finally:
        try:
            feature_group.delete()
        except Exception as e:
            raise RuntimeError(
                f"Failed to delete feature group with name {feature_group.name}"
            ) from e
def _wait_for_feature_group_create(feature_group: FeatureGroup):
    """Poll describe() until the group leaves 'Creating'; fail loudly if it
    ends in any state other than 'Created'."""
    while True:
        status = feature_group.describe().get("FeatureGroupStatus")
        if status != "Creating":
            break
        print("Waiting for Feature Group Creation")
        time.sleep(5)
    if status != "Created":
        # Dump the full describe() payload to aid debugging before failing.
        print(feature_group.describe())
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")
def test_load_feature_definition_unsupported_types(sagemaker_session_mock):
    """Columns of dtype 'object' cannot be mapped and must raise ValueError."""
    group = FeatureGroup(name="FailedGroup", sagemaker_session=sagemaker_session_mock)
    frame = pd.DataFrame(
        {
            "float": pd.Series([2.0], dtype="float64"),
            "int": pd.Series([2], dtype="int64"),
            "object": pd.Series(["f1"], dtype="object"),
        }
    )
    with pytest.raises(ValueError) as error:
        group.load_feature_definitions(data_frame=frame)
    expected = "Failed to infer Feature type based on dtype object for column object."
    assert expected in str(error)
def test_load_feature_definition(sagemaker_session_mock):
    """float64/int64/string dtypes map to FRACTIONAL/INTEGRAL/STRING."""
    group = FeatureGroup(name="SomeGroup", sagemaker_session=sagemaker_session_mock)
    frame = pd.DataFrame(
        {
            "float": pd.Series([2.0], dtype="float64"),
            "int": pd.Series([2], dtype="int64"),
            "string": pd.Series(["f1"], dtype="string"),
        }
    )
    definitions = group.load_feature_definitions(data_frame=frame)
    observed = [(fd.feature_name, fd.feature_type) for fd in definitions]
    assert observed == [
        ("float", FeatureTypeEnum.FRACTIONAL),
        ("int", FeatureTypeEnum.INTEGRAL),
        ("string", FeatureTypeEnum.STRING),
    ]
def test_ingest_default_max_workers(ingestion_manager_init, sagemaker_session_mock):
    """With no max_workers argument, ingest() should default to one worker."""
    group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    frame = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")})

    manager = Mock()
    ingestion_manager_init.return_value = manager

    group.ingest(data_frame=frame)

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_session=sagemaker_session_mock,
        data_frame=frame,
        max_workers=1,
    )
    manager.run.assert_called_once_with(wait=True, timeout=None)
def test_ingest(ingestion_manager_init, sagemaker_session_mock):
    """ingest() with an explicit max_workers should pass everything through."""
    group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    # Wide frame (300 columns) to mirror a realistic ingestion payload.
    frame = pd.DataFrame(
        {f"float{i}": pd.Series([2.0], dtype="float64") for i in range(300)}
    )

    manager = Mock()
    ingestion_manager_init.return_value = manager

    group.ingest(data_frame=frame, max_workers=10)

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_session=sagemaker_session_mock,
        data_frame=frame,
        max_workers=10,
    )
    manager.run.assert_called_once_with(wait=True, timeout=None)
def create_feature_group(
    feature_group_name,
    feature_group_description,
    df,
    id_name,
    event_time_name,
    offline_feature_group_bucket,
    sagemaker_session,
    role,
):
    """
    Create a new FeatureGroup, tolerating one that already exists.

    :param feature_group_name: str
    :param feature_group_description: str
    :param df: pandas.DataFrame
    :param id_name: str
    :param event_time_name: str
    :param offline_feature_group_bucket: str
    :param sagemaker_session: sagemaker.Session()
    :param role: str
    :return: tuple(FeatureGroup, bool) -- the group and whether it pre-existed
    """
    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)
    feature_group.feature_definitions = get_feature_definitions(df, feature_group)
    feature_group_already_exists = False
    try:
        print(f"Trying to create feature group {feature_group_description} \n")
        feature_group.create(
            description=feature_group_description,
            record_identifier_name=id_name,
            event_time_feature_name=event_time_name,
            role_arn=role,
            s3_uri=offline_feature_group_bucket,
            enable_online_store=True,
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        # Bug fix: the original read ``e.response`` unconditionally, which
        # raised AttributeError (masking the real error) for any non-boto
        # exception; only botocore ClientErrors carry ``.response``.
        code = getattr(e, "response", {}).get("Error", {}).get("Code")
        if code == "ResourceInUse":
            print(f"Using existing feature group: {feature_group_name}")
            feature_group_already_exists = True
        else:
            # Bare ``raise`` preserves the original traceback (vs ``raise (e)``).
            raise
    return feature_group, feature_group_already_exists
def test_ingest_multi_process(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
):
    """Multi-process ingestion into a freshly created feature group."""
    group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(group):
        output = group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(group)

        group.ingest(
            data_frame=pandas_data_frame, max_workers=3, max_processes=2, wait=True
        )

        assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}")
def test_feature_store_create(
    sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri
):
    """create() should call Session.create_feature_group with these arguments.

    Bug fix: the original used ``assert mock.create_feature_group.called_with
    (...)``, which is not a real Mock assertion — accessing ``called_with``
    auto-creates a child mock whose call result is truthy, so the test could
    never fail. ``assert_called_with`` performs the actual check; the expected
    kwargs match the sibling test of the same name elsewhere in this file.
    """
    feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock)
    feature_group.feature_definitions = feature_group_dummy_definitions
    feature_group.create(
        s3_uri=s3_uri,
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        role_arn=role_arn,
        enable_online_store=True,
    )
    sagemaker_session_mock.create_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup",
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        feature_definitions=[fd.to_dict() for fd in feature_group_dummy_definitions],
        role_arn=role_arn,
        description=None,
        tags=None,
        online_store_config={"EnableOnlineStore": True},
        offline_store_config={
            "DisableGlueTableCreation": False,
            "S3StorageConfig": {"S3Uri": s3_uri},
        },
    )
def test_create_feature_store_online_only(
    feature_store_session,
    role,
    feature_group_name,
    pandas_data_frame,
):
    """Create an online-only feature group (no offline S3 store)."""
    group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(group):
        # s3_uri=False disables the offline store entirely.
        output = group.create(
            s3_uri=False,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(group)

        assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}")
def test_ingest_without_string_feature(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame_without_string,
):
    """Ingestion should work for frames with no string-typed columns."""
    group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    group.load_feature_definitions(data_frame=pandas_data_frame_without_string)

    with cleanup_feature_group(group):
        output = group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature2",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(group)

        manager = group.ingest(
            data_frame=pandas_data_frame_without_string, max_workers=3, wait=False
        )
        manager.wait()

        assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}")
def __init__(
    self,
    name: str,
    s3_uri: str,
    record_identifier_name: str,
    event_time_name: str,
    query: str,
    description: str = None,
):
    """Wire up boto/SageMaker sessions and remember the feature-group settings."""
    region = boto3.Session().region_name
    boto_session = boto3.Session(region_name=region)
    sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
    featurestore_runtime = boto_session.client(
        service_name="sagemaker-featurestore-runtime", region_name=region
    )
    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime,
    )

    # Resolve the execution role ARN from IAM by its well-known name.
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="AmazonSageMaker-ExecutionRole")["Role"]["Arn"]

    # you can also suffix the feature group name with pipeline git version
    self._feature_group = FeatureGroup(name=name, sagemaker_session=feature_store_session)

    self._description = description
    self._s3_uri = s3_uri
    self._role = role
    self._record_identifier_name = record_identifier_name
    self._event_time_name = event_time_name
    self._query = query
def create_or_load_feature_group(prefix, feature_group_name):
    """Create the BERT-features feature group, or reuse one that another
    cluster instance is already creating; returns the FeatureGroup handle."""
    # Feature Definitions for our records
    feature_definitions = [
        FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL),
        FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL),
        # 'review_body' intentionally excluded from the definitions.
        FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
    ]

    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=sagemaker_session,
    )
    print("Feature Group: {}".format(feature_group))

    # If another instance is mid-creation, wait for it before trying ourselves.
    try:
        print(
            "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print("Before CREATE FG wait exeption: {}".format(e))

    # Attempt creation; failures (e.g. already exists) are logged, not raised.
    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"
        print("Creating Feature Group with role {}...".format(role))
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=True,
        )
        print("Creating Feature Group. Completed.")

        print("Waiting for new Feature Group to become available...")
        wait_for_feature_group_creation_complete(feature_group)
        print("Feature Group available.")
        feature_group.describe()
    except Exception as e:
        print("Exception: {}".format(e))

    return feature_group
# Setup the feature store timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") print(timestamp) prefix = 'reviews-feature-store-' + timestamp print(prefix) print('List Feature Groups: {}'.format(sm.list_feature_groups())) from sagemaker.feature_store.feature_group import FeatureGroup reviews_feature_group_name = 'reviews-feature-group-' + strftime( '%d-%H-%M-%S', gmtime()) print(reviews_feature_group_name) reviews_feature_group = FeatureGroup(name=reviews_feature_group_name, sagemaker_session=sagemaker_session) print(reviews_feature_group) # record identifier and event time feature names record_identifier_feature_name = "review_id" event_time_feature_name = "date" def cast_object_to_string(data_frame): for label in data_frame.columns: if data_frame.dtypes[label] == 'object': data_frame[label] = data_frame[label].astype("str").astype( "string") def wait_for_feature_group_creation_complete(feature_group):
def test_one_step_ingestion_pipeline(sagemaker_session, feature_store_session,
                                     feature_definitions, role, pipeline_name):
    """End-to-end: build a one-step Data Wrangler ingestion pipeline that
    writes into a feature group, run it, then verify row counts via Athena."""
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")

    # Upload the test CSV so the processing job can read it from S3.
    input_name = "features.csv"
    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
    input_data_uri = os.path.join("s3://", sagemaker_session.default_bucket(),
                                  "py-sdk-ingestion-test-input/features.csv")
    with open(input_file_path, "r") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    inputs = [
        ProcessingInput(
            input_name=input_name,
            source=input_data_uri,
            destination="/opt/ml/processing/features.csv",
        )
    ]

    # Unique group name so concurrent/repeated test runs do not collide.
    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=feature_store_session,
    )

    # Generate a Data Wrangler flow that only ingests the S3 input.
    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
        input_name,
        input_data_uri,
        s3_content_type="csv",
        s3_has_header=True,
    )

    outputs = [
        ProcessingOutput(
            output_name=output_name,
            app_managed=True,
            feature_store_output=FeatureStoreOutput(
                feature_group_name=feature_group_name),
        )
    ]

    temp_flow_path = "./ingestion.flow"
    with cleanup_feature_group(feature_group):
        json.dump(ingestion_only_flow, open(temp_flow_path, "w"))

        data_wrangler_processor = DataWranglerProcessor(
            role=role,
            data_wrangler_flow_source=temp_flow_path,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            max_runtime_in_seconds=86400,
        )

        data_wrangler_step = ProcessingStep(name="ingestion-step",
                                            processor=data_wrangler_processor,
                                            inputs=inputs,
                                            outputs=outputs)

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_count, instance_type],
            steps=[data_wrangler_step],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            # Create the feature group the pipeline will write into.
            offline_store_s3_uri = os.path.join(
                "s3://", sagemaker_session.default_bucket(), feature_group_name)
            feature_group.create(
                s3_uri=offline_store_s3_uri,
                record_identifier_name="f11",
                event_time_feature_name="f10",
                role_arn=role,
                enable_online_store=False,
            )
            _wait_for_feature_group_create(feature_group)

            execution = pipeline.start()
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=60, max_attempts=10)
            except WaiterError:
                # Best-effort wait; step status is asserted explicitly below.
                pass
            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            assert execution_steps[0]["StepName"] == "ingestion-step"
            assert execution_steps[0]["StepStatus"] == "Succeeded"

            # Query the offline store to confirm every input row landed.
            athena_query = feature_group.athena_query()
            with timeout(minutes=10):
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")

                df = athena_query.as_dataframe()
                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
        finally:
            try:
                pipeline.delete()
            except Exception as e:
                print(f"Delete pipeline failed with error: {e}")
            os.remove(temp_flow_path)
def create_or_load_feature_group(prefix, feature_group_name):
    """Create the reviews feature group (offline store, no online store), or
    reuse one being created elsewhere; returns the FeatureGroup handle."""
    # Feature Definitions for the records -- all columns are strings here.
    feature_definitions = [
        FeatureDefinition(feature_name=column, feature_type=FeatureTypeEnum.STRING)
        for column in (
            'review_id', 'date', 'sentiment', 'label_id',
            'input_ids', 'review_body', 'split_type',
        )
    ]

    # setup the Feature Group
    feature_group = FeatureGroup(name=feature_group_name,
                                 feature_definitions=feature_definitions,
                                 sagemaker_session=sagemaker_session)
    print('Feature Group: {}'.format(feature_group))

    # If another instance is mid-creation, wait for it before trying ourselves.
    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print('Before CREATE FG wait exeption: {}'.format(e))

    # Attempt creation; failures (e.g. already exists) are logged, not raised.
    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"
        print('Creating Feature Group with role {}...'.format(role))
        # create Feature Group -- online store disabled for this variant
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=False)
        print('Creating Feature Group. Completed.')

        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')
        # the information about the Feature Group
        feature_group.describe()
    except Exception as e:
        print('Exception: {}'.format(e))

    return feature_group
def test_put_record(sagemaker_session_mock):
    """put_record() should delegate straight to Session.put_record."""
    group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock)
    group.put_record(record=[])
    sagemaker_session_mock.put_record.assert_called_with(
        feature_group_name="MyFeatureGroup", record=[]
    )
def test_feature_store_describe(sagemaker_session_mock):
    """describe() should delegate straight to Session.describe_feature_group."""
    group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock)
    group.describe()
    sagemaker_session_mock.describe_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup", next_token=None
    )
def test_feature_store_delete(sagemaker_session_mock):
    """delete() should delegate straight to Session.delete_feature_group."""
    group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock)
    group.delete()
    sagemaker_session_mock.delete_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup"
    )
bucket = args.bucket

# Build a SageMaker session wired to both the control-plane and the
# featurestore-runtime clients.
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker')
featurestore_client = boto_session.client(
    service_name='sagemaker-featurestore-runtime')
session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_client)

# Read feature group name
# (provided to this processing job via its input channel)
with open('/opt/ml/processing/input/feature_group_name.txt') as f:
    feature_group_name = f.read()

feature_group = FeatureGroup(name=feature_group_name,
                             sagemaker_session=session)

# Resolve the Glue/Athena table backing the group's offline store.
feature_group_query = feature_group.athena_query()
feature_group_table = feature_group_query.table_name
print(feature_group_table)

# Select label/review_body for products with more than 1000 reviews.
# NOTE(review): query assembled by string concatenation; the table name
# comes from the SDK rather than user input, so injection risk looks low,
# but confirm nothing user-controlled reaches it.
query_string = 'SELECT label,review_body FROM "' \
    + feature_group_table+'"' \
    + ' INNER JOIN (SELECT product_id FROM (SELECT product_id, avg(star_rating) as avg_rating, count(*) as review_count \
FROM "' + feature_group_table+'"' \
    + ' GROUP BY product_id) WHERE review_count > 1000) tmp ON "' \
    + feature_group_table+'"'+ '.product_id=tmp.product_id;'
print(query_string)

dataset = pd.DataFrame()
# This call continues beyond this chunk (remaining kwargs not visible here).
feature_group_query.run(query_string=query_string,
# Runtime client for the Feature Store data plane.
featurestore_client = boto_session.client(service_name='sagemaker-featurestore-runtime')
session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_client)

# Load input data
input_data_path = '/opt/ml/processing/input/fs_data.tsv'
print('Reading input data from {}'.format(input_data_path))
# NOTE(review): `error_bad_lines` is deprecated/removed in newer pandas
# (use on_bad_lines='skip') — confirm the pinned pandas version.
data = pd.read_csv(input_data_path, sep='\t', error_bad_lines=False, dtype='str')

# Define the feature group name
print('Creating feature group...')
from sagemaker.feature_store.feature_group import FeatureGroup
feature_group = FeatureGroup(name=fg_name, sagemaker_session=session)

# Define the name of the column storing a unique record id (e.g. primary key)
record_identifier_feature_name = 'review_id'

# Add a column to store feature timestamps
event_time_feature_name = 'event_time'
# NOTE(review): assumes `time` here is `time.time` (e.g. `from time import
# time` earlier in the file) — confirm.
current_time_sec = int(round(time()))
data = data.assign(event_time=current_time_sec)

# Set the correct type for each column
# (object -> nullable 'string' via an intermediate str cast)
data['review_id'] = data['review_id'].astype('str').astype('string')
data['product_id'] = data['product_id'].astype('str').astype('string')
data['review_body'] = data['review_body'].astype('str').astype('string')
data['label'] = data['label'].astype('str').astype('string')
data['star_rating'] = data['star_rating'].astype('int64')
data['event_time'] = data['event_time'].astype('float64')

# Load feature definitions
feature_group.load_feature_definitions(data_frame=data)
# Featurestore runtime client plus a Session that can talk to the data plane.
featurestore_runtime = boto_session.client(
    service_name='sagemaker-featurestore-runtime', region_name=region)
feature_store_session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime)

# One FeatureGroup handle per configured group name.
feature_group_names = [
    args.feature_group_name_ratings, args.feature_group_name_tracks,
    args.feature_group_name_user_preferences
]
feature_groups = []
for name in feature_group_names:
    feature_group = FeatureGroup(name=name,
                                 sagemaker_session=feature_store_session)
    feature_groups.append(feature_group)

# Derive each group's offline-store S3 prefix from its Glue table name.
feature_group_s3_prefixes = []
for feature_group in feature_groups:
    feature_group_table_name = feature_group.describe().get(
        "OfflineStoreConfig").get("DataCatalogConfig").get("TableName")
    feature_group_s3_prefix = f'{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}'
    feature_group_s3_prefixes.append(feature_group_s3_prefix)

# wait for data to be added to offline feature store
def wait_for_offline_store(feature_group_s3_prefix):
    # Polls until objects appear under the prefix; loop body continues
    # beyond this chunk.
    print(feature_group_s3_prefix)
    offline_store_contents = None
    while (offline_store_contents is None):
def create_or_load_feature_group(prefix, feature_group_name):
    """Create the BERT-features feature group (online store enabled), or reuse
    one being created by another cluster instance; returns the handle."""
    # Feature Definitions for our records
    feature_definitions = [
        FeatureDefinition(feature_name=column, feature_type=column_type)
        for column, column_type in (
            ('input_ids', FeatureTypeEnum.STRING),
            ('input_mask', FeatureTypeEnum.STRING),
            ('segment_ids', FeatureTypeEnum.STRING),
            ('label_id', FeatureTypeEnum.INTEGRAL),
            ('review_id', FeatureTypeEnum.STRING),
            ('date', FeatureTypeEnum.STRING),
            ('label', FeatureTypeEnum.INTEGRAL),
            # 'review_body' intentionally excluded from the definitions.
            ('split_type', FeatureTypeEnum.STRING),
        )
    ]

    feature_group = FeatureGroup(name=feature_group_name,
                                 feature_definitions=feature_definitions,
                                 sagemaker_session=sagemaker_session)
    print('Feature Group: {}'.format(feature_group))

    # If another instance is mid-creation, wait for it before trying ourselves.
    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print('Before CREATE FG wait exeption: {}'.format(e))

    # Attempt creation; failures (e.g. already exists) are logged, not raised.
    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"
        print('Creating Feature Group with role {}...'.format(role))
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=True)
        print('Creating Feature Group. Completed.')

        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')
        feature_group.describe()
    except Exception as e:
        print('Exception: {}'.format(e))

    return feature_group