def validate(self, config: RepoConfig):
    """Validate that this data source is reachable.

    Query-based sources are accepted as-is; table-based sources are
    checked for existence against BigQuery.

    Args:
        config: A RepoConfig describing the feature repo (unused here).

    Raises:
        DataSourceNotFoundException: If the referenced table does not exist.
    """
    if self.query:
        # Nothing to verify for a query-based source.
        return

    from google.api_core.exceptions import NotFound
    from google.cloud import bigquery

    bq_client = bigquery.Client()
    try:
        bq_client.get_table(self.table_ref)
    except NotFound:
        raise DataSourceNotFoundException(self.table_ref)
def get_table_column_names_and_types(
    self, config: RepoConfig
) -> Iterable[Tuple[str, str]]:
    """
    Returns a mapping of column names to types for this Redshift source.

    Args:
        config: A RepoConfig describing the feature repo
    """
    from botocore.exceptions import ClientError

    from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig
    from feast.infra.utils import aws_utils

    assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

    data_client = aws_utils.get_redshift_data_client(config.offline_store.region)

    if self.table is None:
        # Query-based source: run the query once with LIMIT 1 and read the
        # column metadata off the result set.
        stmt_id = aws_utils.execute_redshift_statement(
            data_client,
            config.offline_store.cluster_id,
            self.database if self.database else config.offline_store.database,
            config.offline_store.user,
            f"SELECT * FROM ({self.query}) LIMIT 1",
        )
        column_info = aws_utils.get_redshift_statement_result(data_client, stmt_id)[
            "ColumnMetadata"
        ]
    else:
        # Table-based source: ask the Data API to describe the table directly.
        try:
            description = data_client.describe_table(
                ClusterIdentifier=config.offline_store.cluster_id,
                Database=(
                    self.database if self.database else config.offline_store.database
                ),
                DbUser=config.offline_store.user,
                Table=self.table,
                Schema=self.schema,
            )
        except ClientError as err:
            # A ValidationException here indicates bad credentials/identity;
            # surface it as a Feast-specific error. Everything else bubbles up.
            if err.response["Error"]["Code"] == "ValidationException":
                raise RedshiftCredentialsError() from err
            raise
        # The API returns valid JSON with empty column list when the table doesn't exist
        if len(description["ColumnList"]) == 0:
            raise DataSourceNotFoundException(self.table)
        column_info = description["ColumnList"]

    return [(col["name"], col["typeName"].upper()) for col in column_info]
def delete_data_source(self, name: str, project: str, commit: bool = True):
    """
    Deletes a data source or raises an exception if not found.

    Args:
        name: Name of data source
        project: Feast project that this data source belongs to
        commit: Whether the change should be persisted immediately
    """
    self._prepare_registry_for_changes()
    assert self.cached_registry_proto

    data_sources = self.cached_registry_proto.data_sources
    for position, existing in enumerate(data_sources):
        if existing.name != name:
            continue
        del data_sources[position]
        if commit:
            self.commit()
        return

    # No data source with that name was registered.
    raise DataSourceNotFoundException(name)
def get_table_column_names_and_types(
    self, config: RepoConfig
) -> Iterable[Tuple[str, str]]:
    """Return (column name, Redshift type name) pairs for this source.

    Table-based sources are described directly via the Redshift Data API;
    query-based sources are executed once (wrapped in ``LIMIT 1``) and the
    column metadata is read off the result set.

    Args:
        config: A RepoConfig whose offline store must be Redshift.

    Raises:
        DataSourceNotFoundException: If the named table has no columns
            (how the API signals a missing table).
        RedshiftCredentialsError: If the Data API rejects the identity
            with a ValidationException.
        RedshiftQueryError: If the query-based probe statement fails.
    """
    import boto3
    from botocore.config import Config
    from botocore.exceptions import ClientError
    from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig

    assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

    client = boto3.client(
        "redshift-data", config=Config(region_name=config.offline_store.region)
    )
    try:
        if self.table is not None:
            # Table-based source: describe the table directly.
            table = client.describe_table(
                ClusterIdentifier=config.offline_store.cluster_id,
                Database=config.offline_store.database,
                DbUser=config.offline_store.user,
                Table=self.table,
            )
            # The API returns valid JSON with empty column list when the table doesn't exist
            if len(table["ColumnList"]) == 0:
                raise DataSourceNotFoundException(self.table)
            columns = table["ColumnList"]
        else:
            # Query-based source: run the query once to obtain metadata.
            statement = client.execute_statement(
                ClusterIdentifier=config.offline_store.cluster_id,
                Database=config.offline_store.database,
                DbUser=config.offline_store.user,
                Sql=f"SELECT * FROM ({self.query}) LIMIT 1",
            )

            # Need to retry client.describe_statement(...) until the task is finished. We don't want to bombard
            # Redshift with queries, and neither do we want to wait for a long time on the initial call.
            # The solution is exponential backoff. The backoff starts with 0.1 seconds and doubles exponentially
            # until reaching 30 seconds, at which point the backoff is fixed.
            # NOTE: raising a bare Exception below is deliberate — tenacity's
            # retry_unless_exception_type(RedshiftQueryError) retries on any
            # exception EXCEPT RedshiftQueryError, which terminates the wait.
            @retry(
                wait=wait_exponential(multiplier=0.1, max=30),
                retry=retry_unless_exception_type(RedshiftQueryError),
            )
            def wait_for_statement():
                desc = client.describe_statement(Id=statement["Id"])
                if desc["Status"] in ("SUBMITTED", "STARTED", "PICKED"):
                    raise Exception  # Retry
                if desc["Status"] != "FINISHED":
                    raise RedshiftQueryError(desc)  # Don't retry. Raise exception.

            # Blocks until the statement reaches a terminal state.
            wait_for_statement()
            result = client.get_statement_result(Id=statement["Id"])
            columns = result["ColumnMetadata"]
    except ClientError as e:
        # A ValidationException signals bad credentials/identity; map it to a
        # Feast-specific error. Any other client error propagates unchanged.
        if e.response["Error"]["Code"] == "ValidationException":
            raise RedshiftCredentialsError() from e
        raise
    return [(column["name"], column["typeName"].upper()) for column in columns]