Example #1
0
    def validate(self, config: RepoConfig):
        """Check that the configured BigQuery table exists.

        A query-based source needs no validation here, so the check only
        runs when ``self.query`` is unset.

        Args:
            config: A RepoConfig describing the feature repo.

        Raises:
            DataSourceNotFoundException: If ``self.table_ref`` does not
                exist in BigQuery.
        """
        if self.query:
            return

        from google.api_core.exceptions import NotFound
        from google.cloud import bigquery

        bq_client = bigquery.Client()
        try:
            bq_client.get_table(self.table_ref)
        except NotFound:
            raise DataSourceNotFoundException(self.table_ref)
Example #2
0
    def get_table_column_names_and_types(
            self, config: RepoConfig) -> Iterable[Tuple[str, str]]:
        """
        Returns a mapping of column names to types for this Redshift source.

        Args:
            config: A RepoConfig describing the feature repo
        """
        from botocore.exceptions import ClientError

        from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig
        from feast.infra.utils import aws_utils

        assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

        data_client = aws_utils.get_redshift_data_client(
            config.offline_store.region)
        # Per-source database overrides the offline-store default.
        database = self.database if self.database else config.offline_store.database

        if self.table is None:
            # Query-based source: infer the schema by running the query
            # with LIMIT 1 and reading the result's column metadata.
            statement_id = aws_utils.execute_redshift_statement(
                data_client,
                config.offline_store.cluster_id,
                database,
                config.offline_store.user,
                f"SELECT * FROM ({self.query}) LIMIT 1",
            )
            result = aws_utils.get_redshift_statement_result(
                data_client, statement_id)
            columns = result["ColumnMetadata"]
        else:
            try:
                described = data_client.describe_table(
                    ClusterIdentifier=config.offline_store.cluster_id,
                    Database=database,
                    DbUser=config.offline_store.user,
                    Table=self.table,
                    Schema=self.schema,
                )
            except ClientError as e:
                if e.response["Error"]["Code"] == "ValidationException":
                    raise RedshiftCredentialsError() from e
                raise

            # The API returns valid JSON with empty column list when the table doesn't exist
            if not described["ColumnList"]:
                raise DataSourceNotFoundException(self.table)

            columns = described["ColumnList"]

        return [(col["name"], col["typeName"].upper()) for col in columns]
Example #3
0
    def delete_data_source(self, name: str, project: str, commit: bool = True):
        """
        Deletes a data source or raises an exception if not found.

        Args:
            name: Name of data source
            project: Feast project that this data source belongs to
            commit: Whether the change should be persisted immediately

        Raises:
            DataSourceNotFoundException: If no data source named ``name``
                exists in the cached registry.
        """
        self._prepare_registry_for_changes()
        assert self.cached_registry_proto

        data_sources = self.cached_registry_proto.data_sources
        # Locate the first matching entry; None means not found.
        match_idx = next(
            (i for i, ds in enumerate(data_sources) if ds.name == name),
            None,
        )
        if match_idx is None:
            raise DataSourceNotFoundException(name)

        del data_sources[match_idx]
        if commit:
            self.commit()
Example #4
0
    def get_table_column_names_and_types(
            self, config: RepoConfig) -> Iterable[Tuple[str, str]]:
        """
        Returns pairs of (column name, Redshift type name) for this source.

        If ``self.table`` is set, columns come from the Redshift Data API
        ``describe_table`` call; otherwise the schema is inferred by running
        ``self.query`` with ``LIMIT 1`` and reading the result metadata.

        Args:
            config: A RepoConfig describing the feature repo; its offline
                store config must be a RedshiftOfflineStoreConfig.

        Raises:
            DataSourceNotFoundException: If ``self.table`` does not exist
                (the API signals this with an empty column list).
            RedshiftCredentialsError: If the Data API rejects the request
                with a ValidationException.
            RedshiftQueryError: If the schema-inference query fails.
        """
        import boto3
        from botocore.config import Config
        from botocore.exceptions import ClientError

        from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig

        assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

        client = boto3.client(
            "redshift-data",
            config=Config(region_name=config.offline_store.region))

        try:
            if self.table is not None:
                table = client.describe_table(
                    ClusterIdentifier=config.offline_store.cluster_id,
                    Database=config.offline_store.database,
                    DbUser=config.offline_store.user,
                    Table=self.table,
                )
                # The API returns valid JSON with empty column list when the table doesn't exist
                if len(table["ColumnList"]) == 0:
                    raise DataSourceNotFoundException(self.table)

                columns = table["ColumnList"]
            else:
                # Asynchronous Data API call: execute_statement only submits
                # the query; completion must be polled for below.
                statement = client.execute_statement(
                    ClusterIdentifier=config.offline_store.cluster_id,
                    Database=config.offline_store.database,
                    DbUser=config.offline_store.user,
                    Sql=f"SELECT * FROM ({self.query}) LIMIT 1",
                )

                # Need to retry client.describe_statement(...) until the task is finished. We don't want to bombard
                # Redshift with queries, and neither do we want to wait for a long time on the initial call.
                # The solution is exponential backoff. The backoff starts with 0.1 seconds and doubles exponentially
                # until reaching 30 seconds, at which point the backoff is fixed.
                # retry_unless_exception_type: retry on ANY exception except
                # RedshiftQueryError, which propagates immediately.
                @retry(
                    wait=wait_exponential(multiplier=0.1, max=30),
                    retry=retry_unless_exception_type(RedshiftQueryError),
                )
                def wait_for_statement():
                    desc = client.describe_statement(Id=statement["Id"])
                    # Any in-flight status: raise a generic exception purely
                    # to trigger another retry iteration.
                    if desc["Status"] in ("SUBMITTED", "STARTED", "PICKED"):
                        raise Exception  # Retry
                    if desc["Status"] != "FINISHED":
                        raise RedshiftQueryError(
                            desc)  # Don't retry. Raise exception.

                wait_for_statement()

                result = client.get_statement_result(Id=statement["Id"])

                columns = result["ColumnMetadata"]
        except ClientError as e:
            # ValidationException here indicates bad credentials / cluster
            # identity rather than a missing table; everything else re-raises.
            if e.response["Error"]["Code"] == "ValidationException":
                raise RedshiftCredentialsError() from e
            raise

        return [(column["name"], column["typeName"].upper())
                for column in columns]