def to_redshift(self, table_name: str) -> None:
    """Save dataset as a new Redshift table."""
    if self.on_demand_feature_views is not None:
        # On-demand transformations run in Python, so materialize the transformed
        # dataframe locally and upload it to Redshift directly.
        transformed_df = self.to_df()
        aws_utils.upload_df_to_redshift(
            self._redshift_client,
            self._config.offline_store.cluster_id,
            self._config.offline_store.database,
            self._config.offline_store.user,
            self._s3_resource,
            f"{self._config.offline_store.s3_staging_location}/features_df/{table_name}.parquet",
            self._config.offline_store.iam_role,
            table_name,
            transformed_df,
        )
        return

    with self._query_generator() as query:
        # Otherwise create the table with a plain CTAS and drop any helper columns afterwards.
        query = f'CREATE TABLE "{table_name}" AS ({query});\n'
        if self._drop_columns is not None:
            for column in self._drop_columns:
                query += f"ALTER TABLE {table_name} DROP COLUMN {column};\n"

        aws_utils.execute_redshift_statement(
            self._redshift_client,
            self._config.offline_store.cluster_id,
            self._config.offline_store.database,
            self._config.offline_store.user,
            query,
        )
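# A minimal usage sketch (assumption, not part of the original class): to_redshift() is invoked
# on the RetrievalJob returned by a historical retrieval against a Redshift offline store. The
# repo path, feature reference, and table name below are hypothetical, and the keyword used for
# the feature list varies across Feast releases.
from datetime import datetime

import pandas as pd
from feast import FeatureStore

store = FeatureStore(repo_path=".")  # hypothetical repo configured with a Redshift offline store
entity_df = pd.DataFrame({"driver_id": [1001], "event_timestamp": [datetime.utcnow()]})
job = store.get_historical_features(
    entity_df=entity_df,
    features=["driver_hourly_stats:conv_rate"],  # hypothetical feature reference
)
job.to_redshift(table_name="driver_training_data")  # persists the join result as a new table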
def create_data_source(
    self,
    df: pd.DataFrame,
    destination_name: str,
    suffix: Optional[str] = None,
    event_timestamp_column="ts",
    created_timestamp_column="created_ts",
    field_mapping: Optional[Dict[str, str]] = None,
) -> DataSource:
    destination_name = self.get_prefixed_table_name(destination_name)

    # Stage the dataframe to S3 and COPY it into a Redshift table named after the destination.
    aws_utils.upload_df_to_redshift(
        self.client,
        self.offline_store_config.cluster_id,
        self.offline_store_config.database,
        self.offline_store_config.user,
        self.s3,
        f"{self.offline_store_config.s3_staging_location}/copy/{destination_name}.parquet",
        self.offline_store_config.iam_role,
        destination_name,
        df,
    )

    # Remember the table name so it can be cleaned up later.
    self.tables.append(destination_name)

    return RedshiftSource(
        table=destination_name,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        date_partition_column="",
        field_mapping=field_mapping or {"ts_1": "ts"},
    )
def _upload_entity_df(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
):
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table out of it
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
    else:
        raise InvalidEntityType(type(entity_df))
def bootstrap():
    # bootstrap() is called automatically from init_repo() during `feast init`
    import pathlib
    from datetime import datetime, timedelta

    from feast.driver_test_data import create_driver_hourly_stats_df

    end_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    start_date = end_date - timedelta(days=15)

    driver_entities = [1001, 1002, 1003, 1004, 1005]
    driver_df = create_driver_hourly_stats_df(driver_entities, start_date, end_date)

    aws_region = click.prompt("AWS Region (e.g. us-west-2)")
    cluster_id = click.prompt("Redshift Cluster ID")
    database = click.prompt("Redshift Database Name")
    user = click.prompt("Redshift User Name")
    s3_staging_location = click.prompt("Redshift S3 Staging Location (s3://*)")
    iam_role = click.prompt("Redshift IAM Role for S3 (arn:aws:iam::*:role/*)")

    if click.confirm(
        "Should I upload example data to Redshift (overwriting 'feast_driver_hourly_stats' table)?",
        default=True,
    ):
        client = aws_utils.get_redshift_data_client(aws_region)
        s3 = aws_utils.get_s3_resource(aws_region)

        aws_utils.execute_redshift_statement(
            client,
            cluster_id,
            database,
            user,
            "DROP TABLE IF EXISTS feast_driver_hourly_stats",
        )

        aws_utils.upload_df_to_redshift(
            client,
            cluster_id,
            database,
            user,
            s3,
            f"{s3_staging_location}/data/feast_driver_hourly_stats.parquet",
            iam_role,
            "feast_driver_hourly_stats",
            driver_df,
        )

    repo_path = pathlib.Path(__file__).parent.absolute()
    config_file = repo_path / "feature_store.yaml"

    replace_str_in_file(config_file, "%AWS_REGION%", aws_region)
    replace_str_in_file(config_file, "%REDSHIFT_CLUSTER_ID%", cluster_id)
    replace_str_in_file(config_file, "%REDSHIFT_DATABASE%", database)
    replace_str_in_file(config_file, "%REDSHIFT_USER%", user)
    replace_str_in_file(config_file, "%REDSHIFT_S3_STAGING_LOCATION%", s3_staging_location)
    replace_str_in_file(config_file, "%REDSHIFT_IAM_ROLE%", iam_role)
def _upload_entity_df_and_get_entity_schema(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        # and construct the schema from the original entity_df dataframe
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
        return dict(zip(entity_df.columns, entity_df.dtypes))
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table out of it,
        # get a pandas dataframe consisting of 1 row (LIMIT 1) and generate the schema out of it
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
        limited_entity_df = RedshiftRetrievalJob(
            f"SELECT * FROM {table_name} LIMIT 1",
            redshift_client,
            s3_resource,
            config,
            full_feature_names=False,
            on_demand_feature_views=None,
        ).to_df()
        return dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))
def prep_redshift_fs_and_fv(
    source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = aws_utils.get_redshift_data_client("us-west-2")
    s3 = aws_utils.get_s3_resource("us-west-2")
    df = create_dataset()

    # Unique table name per run so concurrent test executions cannot collide.
    table_name = f"test_ingestion_{source_type}_correctness_{int(time.time_ns())}_{random.randint(1000, 9999)}"

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region="us-west-2",
        user="******",
        database="feast",
        s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    aws_utils.upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{table_name}.parquet",
        offline_store.iam_role,
        table_name,
        df,
    )

    redshift_source = RedshiftSource(
        table=table_name if source_type == "table" else None,
        query=f"SELECT * FROM {table_name}" if source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={"ts_1": "ts", "id": "driver_id"},
    )

    fv = driver_feature_view(redshift_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )

    # Build a throwaway repo: local registry and SQLite online store, Redshift offline store.
    with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="local",
            online_store=SqliteOnlineStoreConfig(
                path=str(Path(data_dir_name) / "online_store.db")
            ),
            offline_store=offline_store,
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()

    # Clean up the uploaded Redshift table
    aws_utils.execute_redshift_statement(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        f"DROP TABLE {table_name}",
    )
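# A minimal sketch (assumption, not from the original test module) of how the generator above
# could be wired into pytest: the fixture name, parametrization, and the smoke test are
# hypothetical, but they illustrate the intended setup/teardown flow around the yield.
from datetime import datetime, timedelta

import pytest


@pytest.fixture(params=["table", "query"])
def redshift_fs_and_fv(request):
    # Delegating to the generator lets pytest run everything after `yield fs, fv` as teardown.
    yield from prep_redshift_fs_and_fv(request.param)


def test_redshift_ingestion_smoke(redshift_fs_and_fv):
    fs, fv = redshift_fs_and_fv
    # Hypothetical smoke test: materialize the last day of data into the local online store.
    end = datetime.utcnow()
    fs.materialize(start_date=end - timedelta(days=1), end_date=end)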