Example #1
def _upload_entity_df(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
):
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table out of it
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
    else:
        raise InvalidEntityType(type(entity_df))
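
A minimal usage sketch for _upload_entity_df, assuming `config` is a RepoConfig whose offline_store is a RedshiftOfflineStoreConfig; the client factories are the same aws_utils helpers that appear in the bootstrap example below, and the table names here are illustrative only:

import pandas as pd

# `config` is assumed to exist (RepoConfig with a Redshift offline store).
redshift_client = aws_utils.get_redshift_data_client(config.offline_store.region)
s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

entity_df = pd.DataFrame(
    {"driver_id": [1001], "event_timestamp": [pd.Timestamp.utcnow()]}
)

# A pandas dataframe is staged to S3 and copied into Redshift ...
_upload_entity_df(entity_df, redshift_client, config, s3_resource, "tmp_entity_df")

# ... while a SQL string becomes a CREATE TABLE ... AS (...) statement.
_upload_entity_df(
    "SELECT * FROM my_entities", redshift_client, config, s3_resource, "tmp_entity_df_sql"
)
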
Example #2
File: redshift.py  Project: dmatrix/feast
    def to_redshift(self, table_name: str) -> None:
        """ Save dataset as a new Redshift table """
        if self.on_demand_feature_views is not None:
            transformed_df = self.to_df()
            aws_utils.upload_df_to_redshift(
                self._redshift_client,
                self._config.offline_store.cluster_id,
                self._config.offline_store.database,
                self._config.offline_store.user,
                self._s3_resource,
                f"{self._config.offline_store.s3_staging_location}/features_df/{table_name}.parquet",
                self._config.offline_store.iam_role,
                table_name,
                transformed_df,
            )
            return

        with self._query_generator() as query:
            query = f'CREATE TABLE "{table_name}" AS ({query});\n'
            if self._drop_columns is not None:
                for column in self._drop_columns:
                    query += f"ALTER TABLE {table_name} DROP COLUMN {column};\n"

            aws_utils.execute_redshift_statement(
                self._redshift_client,
                self._config.offline_store.cluster_id,
                self._config.offline_store.database,
                self._config.offline_store.user,
                query,
            )
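
When on-demand feature views are attached, the job materializes a dataframe first via self.to_df(), presumably because those transformations run in pandas rather than in Redshift SQL, and only then uploads it. A hedged usage sketch, assuming `job` is a RedshiftRetrievalJob returned by the offline store and the table name is illustrative:

# `job` is assumed to be a RedshiftRetrievalJob, e.g. from a historical retrieval.
job.to_redshift(table_name="my_training_dataset")
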
Example #3
def bootstrap():
    # bootstrap() will automatically be called by init_repo() during `feast init`

    import pathlib
    from datetime import datetime, timedelta

    from feast.driver_test_data import create_driver_hourly_stats_df

    end_date = datetime.now().replace(microsecond=0, second=0, minute=0)
    start_date = end_date - timedelta(days=15)

    driver_entities = [1001, 1002, 1003, 1004, 1005]
    driver_df = create_driver_hourly_stats_df(driver_entities, start_date,
                                              end_date)

    aws_region = click.prompt("AWS Region (e.g. us-west-2)")
    cluster_id = click.prompt("Redshift Cluster ID")
    database = click.prompt("Redshift Database Name")
    user = click.prompt("Redshift User Name")
    s3_staging_location = click.prompt("Redshift S3 Staging Location (s3://*)")
    iam_role = click.prompt("Redshift IAM Role for S3 (arn:aws:iam::*:role/*)")

    if click.confirm(
            "Should I upload example data to Redshift (overwriting 'feast_driver_hourly_stats' table)?",
            default=True,
    ):
        client = aws_utils.get_redshift_data_client(aws_region)
        s3 = aws_utils.get_s3_resource(aws_region)

        aws_utils.execute_redshift_statement(
            client,
            cluster_id,
            database,
            user,
            "DROP TABLE IF EXISTS feast_driver_hourly_stats",
        )

        aws_utils.upload_df_to_redshift(
            client,
            cluster_id,
            database,
            user,
            s3,
            f"{s3_staging_location}/data/feast_driver_hourly_stats.parquet",
            iam_role,
            "feast_driver_hourly_stats",
            driver_df,
        )

    repo_path = pathlib.Path(__file__).parent.absolute()
    config_file = repo_path / "feature_store.yaml"

    replace_str_in_file(config_file, "%AWS_REGION%", aws_region)
    replace_str_in_file(config_file, "%REDSHIFT_CLUSTER_ID%", cluster_id)
    replace_str_in_file(config_file, "%REDSHIFT_DATABASE%", database)
    replace_str_in_file(config_file, "%REDSHIFT_USER%", user)
    replace_str_in_file(config_file, "%REDSHIFT_S3_STAGING_LOCATION%",
                        s3_staging_location)
    replace_str_in_file(config_file, "%REDSHIFT_IAM_ROLE%", iam_role)
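
The placeholders such as %AWS_REGION% live in the template's feature_store.yaml. A minimal sketch of what a helper like replace_str_in_file might do, assuming it simply rewrites the file in place (the actual helper shipped with the template may differ):

def replace_str_in_file(file_path, match_str, sub_str):
    # Hypothetical minimal implementation: read the file, substitute the
    # placeholder, and write the result back in place.
    with open(file_path, "r") as f:
        contents = f.read()
    with open(file_path, "w") as f:
        f.write(contents.replace(match_str, sub_str))
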
Example #4
 def teardown(self):
     for table in self.tables:
         aws_utils.execute_redshift_statement(
             self.client,
             self.offline_store_config.cluster_id,
             self.offline_store_config.database,
             self.offline_store_config.user,
             f"DROP TABLE IF EXISTS {table}",
         )
Example #5
        def query_generator() -> Iterator[str]:
            table_name = offline_utils.get_temp_entity_table_name()

            entity_schema = _upload_entity_df_and_get_entity_schema(
                entity_df, redshift_client, config, s3_resource, table_name)

            entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df(
                entity_schema)

            expected_join_keys = offline_utils.get_expected_join_keys(
                project, feature_views, registry)

            offline_utils.assert_expected_columns_in_entity_df(
                entity_schema, expected_join_keys,
                entity_df_event_timestamp_col)

            entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
                entity_df,
                entity_df_event_timestamp_col,
                redshift_client,
                config,
                table_name,
            )

            # Build a query context containing all information required to template the Redshift SQL query
            query_context = offline_utils.get_feature_view_query_context(
                feature_refs,
                feature_views,
                registry,
                project,
                entity_df_event_timestamp_range,
            )

            # Generate the Redshift SQL query from the query context
            query = offline_utils.build_point_in_time_query(
                query_context,
                left_table_query_string=table_name,
                entity_df_event_timestamp_col=entity_df_event_timestamp_col,
                entity_df_columns=entity_schema.keys(),
                query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN,
                full_feature_names=full_feature_names,
            )

            try:
                yield query
            finally:
                # Always clean up the uploaded Redshift table
                aws_utils.execute_redshift_statement(
                    redshift_client,
                    config.offline_store.cluster_id,
                    config.offline_store.database,
                    config.offline_store.user,
                    f"DROP TABLE IF EXISTS {table_name}",
                )
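
query_generator yields the templated SQL and always drops the temporary entity table afterwards. The `with self._query_generator() as query:` usage in the to_redshift examples suggests it is wrapped as a context manager; below is a simplified, self-contained analogue of that pattern (the decorator wiring is an assumption, not necessarily how Feast wires it):

import contextlib
from typing import Iterator

@contextlib.contextmanager
def temp_table_query(table_name: str) -> Iterator[str]:
    # Hand the caller a query against a temporary table and guarantee cleanup,
    # even if the caller raises while consuming the query.
    try:
        yield f"SELECT * FROM {table_name}"
    finally:
        print(f"DROP TABLE IF EXISTS {table_name}")  # stand-in for execute_redshift_statement

with temp_table_query("feast_entity_df_tmp") as query:
    print(query)
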
Example #6
    def to_redshift(self, table_name: str) -> None:
        """ Save dataset as a new Redshift table """
        with self._query_generator() as query:
            query = f'CREATE TABLE "{table_name}" AS ({query});\n'
            if self._drop_columns is not None:
                for column in self._drop_columns:
                    query += f"ALTER TABLE {table_name} DROP COLUMN {column};\n"

            aws_utils.execute_redshift_statement(
                self._redshift_client,
                self._config.offline_store.cluster_id,
                self._config.offline_store.database,
                self._config.offline_store.user,
                query,
            )
Example #7
File: redshift.py  Project: dmatrix/feast
def _upload_entity_df_and_get_entity_schema(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        # and construct the schema from the original entity_df dataframe
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
        return dict(zip(entity_df.columns, entity_df.dtypes))
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table out of it,
        # then fetch a single row (LIMIT 1) into a pandas dataframe and derive the schema from it
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
        limited_entity_df = RedshiftRetrievalJob(
            f"SELECT * FROM {table_name} LIMIT 1",
            redshift_client,
            s3_resource,
            config,
            full_feature_names=False,
            on_demand_feature_views=None,
        ).to_df()
        return dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))
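
The returned schema is just a mapping from column name to NumPy dtype, built with the same dict(zip(...)) idiom in both branches; a small self-contained illustration:

import pandas as pd

df = pd.DataFrame(
    {"driver_id": [1001], "event_timestamp": [pd.Timestamp("2021-08-01", tz="UTC")]}
)
schema = dict(zip(df.columns, df.dtypes))
print(schema)  # e.g. {'driver_id': dtype('int64'), 'event_timestamp': datetime64[ns, UTC]}
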
Example #8
    def get_table_column_names_and_types(
            self, config: RepoConfig) -> Iterable[Tuple[str, str]]:
        """
        Returns a mapping of column names to types for this Redshift source.

        Args:
            config: A RepoConfig describing the feature repo
        """
        from botocore.exceptions import ClientError

        from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig
        from feast.infra.utils import aws_utils

        assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

        client = aws_utils.get_redshift_data_client(
            config.offline_store.region)
        if self.table is not None:
            try:
                table = client.describe_table(
                    ClusterIdentifier=config.offline_store.cluster_id,
                    Database=(self.database if self.database else
                              config.offline_store.database),
                    DbUser=config.offline_store.user,
                    Table=self.table,
                    Schema=self.schema,
                )
            except ClientError as e:
                if e.response["Error"]["Code"] == "ValidationException":
                    raise RedshiftCredentialsError() from e
                raise

            # The API returns valid JSON with empty column list when the table doesn't exist
            if len(table["ColumnList"]) == 0:
                raise DataSourceNotFoundException(self.table)

            columns = table["ColumnList"]
        else:
            statement_id = aws_utils.execute_redshift_statement(
                client,
                config.offline_store.cluster_id,
                self.database
                if self.database else config.offline_store.database,
                config.offline_store.user,
                f"SELECT * FROM ({self.query}) LIMIT 1",
            )
            columns = aws_utils.get_redshift_statement_result(
                client, statement_id)["ColumnMetadata"]

        return [(column["name"], column["typeName"].upper())
                for column in columns]
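
A hedged sketch of calling this method on a table-backed source; the constructor arguments mirror Example #9 below, `repo_config` is assumed to be a RepoConfig with a Redshift offline store, and the printed type names are illustrative:

source = RedshiftSource(
    table="feast_driver_hourly_stats",
    event_timestamp_column="event_timestamp",
)
for name, type_name in source.get_table_column_names_and_types(repo_config):
    print(name, type_name)  # e.g. driver_id INT8
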
Example #9
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    redshift_client,
    config: RepoConfig,
    table_name: str,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True)
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min(),
            entity_df_event_timestamp.max(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the timestamp
        # range directly from the Redshift table
        statement_id = aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max FROM {table_name}",
        )
        res = aws_utils.get_redshift_statement_result(
            redshift_client, statement_id)["Records"][0]
        entity_df_event_timestamp_range = (
            parser.parse(res[0]["stringValue"]),
            parser.parse(res[1]["stringValue"]),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
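
A self-contained illustration of the dataframe branch above, showing how a string-typed timestamp column is coerced before taking the range:

import pandas as pd

entity_df = pd.DataFrame({
    "driver_id": [1001, 1002],
    "event_timestamp": ["2021-08-01 00:00:00", "2021-08-02 12:00:00"],
})
ts = entity_df.loc[:, "event_timestamp"].infer_objects()
if pd.api.types.is_string_dtype(ts):
    ts = pd.to_datetime(ts, utc=True)
print((ts.min(), ts.max()))
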
def prep_redshift_fs_and_fv(
    source_type: str,
) -> Iterator[Tuple[FeatureStore, FeatureView]]:
    client = aws_utils.get_redshift_data_client("us-west-2")
    s3 = aws_utils.get_s3_resource("us-west-2")

    df = create_dataset()

    table_name = f"test_ingestion_{source_type}_correctness_{int(time.time_ns())}_{random.randint(1000, 9999)}"

    offline_store = RedshiftOfflineStoreConfig(
        cluster_id="feast-integration-tests",
        region="us-west-2",
        user="******",
        database="feast",
        s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion",
        iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
    )

    aws_utils.upload_df_to_redshift(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        s3,
        f"{offline_store.s3_staging_location}/copy/{table_name}.parquet",
        offline_store.iam_role,
        table_name,
        df,
    )

    redshift_source = RedshiftSource(
        table=table_name if source_type == "table" else None,
        query=f"SELECT * FROM {table_name}"
        if source_type == "query" else None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        date_partition_column="",
        field_mapping={
            "ts_1": "ts",
            "id": "driver_id"
        },
    )

    fv = driver_feature_view(redshift_source)
    e = Entity(
        name="driver",
        description="id for driver",
        join_key="driver_id",
        value_type=ValueType.INT32,
    )
    with tempfile.TemporaryDirectory() as repo_dir_name, \
            tempfile.TemporaryDirectory() as data_dir_name:
        config = RepoConfig(
            registry=str(Path(repo_dir_name) / "registry.db"),
            project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
            provider="local",
            online_store=SqliteOnlineStoreConfig(
                path=str(Path(data_dir_name) / "online_store.db")),
            offline_store=offline_store,
        )
        fs = FeatureStore(config=config)
        fs.apply([fv, e])

        yield fs, fv

        fs.teardown()

    # Clean up the uploaded Redshift table
    aws_utils.execute_redshift_statement(
        client,
        offline_store.cluster_id,
        offline_store.database,
        offline_store.user,
        f"DROP TABLE {table_name}",
    )
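
Because prep_redshift_fs_and_fv yields a (FeatureStore, FeatureView) pair and then tears everything down, it is shaped like a pytest fixture body; a hedged sketch of one way it might be wired up (the fixture name, parametrization, and test are assumptions, not how the actual suite registers it):

import pytest

@pytest.fixture(params=["table", "query"])
def redshift_fs_and_fv(request):
    # Delegate to the generator above; pytest resumes it for teardown.
    yield from prep_redshift_fs_and_fv(request.param)

def test_redshift_feature_view_applied(redshift_fs_and_fv):
    fs, fv = redshift_fs_and_fv
    assert fv.name in {view.name for view in fs.list_feature_views()}
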