예제 #1
0
    def _get_data_asset_class(self, data_asset_type):
        """Resolve the class used to generate a data_asset from this datasource.

        Args:
            data_asset_type: Either a string naming an attribute of the legacy
                ``custom_data_assets`` module (deprecated), or a ``ClassConfig``
                providing ``class_name`` and, optionally, ``module_name``. A
                ``ClassConfig`` whose ``module_name`` is None has it set, on
                the passed-in object, to ``"great_expectations.dataset"``.

        Returns:
            The attribute found under the requested name in the imported module.

        Raises:
            InvalidConfigError: If the module cannot be imported, the requested
                name is missing from it, or ``data_asset_type`` is neither a
                string nor a ``ClassConfig``.
        """
        if isinstance(data_asset_type, string_types):
            # We have a custom type, but it is defined with only a string
            try:
                # The two literals are concatenated by the parser, so the first
                # one must end with a space to keep the sentence readable.
                logger.warning(
                    "Use of custom_data_assets module is deprecated. Please define data_asset_type "
                    "using a module_name and class_name.")
                # FOR LEGACY REASONS support the fixed "custom_data_assets" name
                # FIXME: this option should be removed in a future release
                custom_data_assets_module = __import__(
                    "custom_data_assets", fromlist=["custom_data_assets"])
                return getattr(custom_data_assets_module, data_asset_type)
            except ImportError:
                # Build the message once so the log and the exception always agree.
                message = (
                    "Unable to import custom_data_asset module. "
                    "Check the plugins directory for 'custom_data_assets'.")
                logger.error(message)
                raise InvalidConfigError(message)
            except AttributeError:
                message = "Unable to find data_asset_type: '%s'." % data_asset_type
                logger.error(message)
                raise InvalidConfigError(message)
        elif isinstance(data_asset_type, ClassConfig):
            try:
                if data_asset_type.module_name is None:
                    # Default module; note this updates the caller's config object.
                    data_asset_type.module_name = "great_expectations.dataset"

                loaded_module = import_module(data_asset_type.module_name)
                return getattr(loaded_module, data_asset_type.class_name)
            except ImportError:
                message = ("Unable to find module '%s'." %
                           data_asset_type.module_name)
                logger.error(message)
                raise InvalidConfigError(message)
            except AttributeError:
                message = (
                    "Unable to find data_asset_type: '%s' in module '%s'." %
                    (data_asset_type.class_name, data_asset_type.module_name))
                logger.error(message)
                raise InvalidConfigError(message)
        else:
            raise InvalidConfigError(
                "Invalid configuration for data_asset_type")
예제 #2
0
def substitute_config_variable(template_str, config_variables_dict):
    """
    Substitute the first ${SOME_VARIABLE} or $SOME_VARIABLE pattern found in a string.

    The ``${...}`` form is searched for first; the bare ``$name`` form (letters of
    either case, digits and underscores, not starting with a digit) is only tried
    when no ``${...}`` pattern is present. Only the first match is replaced.

    The value is looked up in the environment first. If the environment variable is
    unset or empty, the value is looked up in ``config_variables_dict``. Any value
    other than None counts as found, so falsy config values such as ``0``, ``False``
    or ``""`` are substituted.

    :param template_str: a string that might or might not contain ${SOME_VARIABLE}
            or $SOME_VARIABLE. None and non-string values (e.g., a boolean) are
            returned unchanged.
    :param config_variables_dict: a dictionary of config variables. It is loaded from the
            config variables store (by default, "uncommitted/config_variables.yml file).
            None is treated as an empty dictionary.
    :return: ``template_str`` unchanged when it contains no pattern; the looked-up value
            itself (type preserved) when the pattern spans the whole string; otherwise
            the string with the pattern replaced by the value (converted with ``str``
            when it is not already a string).
    :raises InvalidConfigError: when a pattern is present but its variable is found
            neither in the environment nor in ``config_variables_dict``.
    """
    if template_str is None:
        return template_str

    try:
        match = re.search(r'\$\{(.*?)\}', template_str) or re.search(
            r'\$([_a-zA-Z][_a-zA-Z0-9]*)', template_str)
    except TypeError:
        # If the value is not a string (e.g., a boolean), we should return it as is
        return template_str

    if not match:
        return template_str

    variable_name = match.group(1)

    # An empty environment variable is treated like an unset one and falls
    # through to the config variables store.
    config_variable_value = os.getenv(variable_name)
    if not config_variable_value:
        config_variable_value = (config_variables_dict or {}).get(variable_name)

    if config_variable_value is None:
        raise InvalidConfigError(
            "Unable to find match for config variable {:s}. See https://great-expectations.readthedocs.io/en/latest/reference/data_context_reference.html#managing-environment-and-secrets"
            .format(variable_name))

    if match.start() == 0 and match.end() == len(template_str):
        # The pattern is the whole string: hand back the value with its own type.
        return config_variable_value

    prefix = template_str[:match.start()]
    suffix = template_str[match.end():]
    try:
        return prefix + config_variable_value + suffix
    except TypeError:
        # Non-string values (e.g., a port number) are rendered as text when
        # they are embedded in a longer string.
        return prefix + str(config_variable_value) + suffix
예제 #3
0
    def __init__(
        self,
        name: Optional[str] = None,
        credentials: Optional[dict] = None,
        data_context: Optional[Any] = None,
        engine=None,
        connection_string: Optional[str] = None,
        url: Optional[str] = None,
        batch_data_dict: Optional[dict] = None,
        create_temp_table: bool = True,
        concurrency: Optional[ConcurrencyConfig] = None,
        **kwargs,  # These will be passed as optional parameters to the SQLAlchemy engine, **not** the ExecutionEngine
    ) -> None:
        """Builds a SqlAlchemyExecutionEngine, using a provided connection string/url/engine/credentials to access the
        desired database. Also initializes the dialect to be used and configures usage statistics.

        The connection source is chosen in this order of precedence: engine, credentials, connection_string, url.

            Args:
                name (str):
                    The name of the SqlAlchemyExecutionEngine
                credentials:
                    If the Execution Engine is not provided, the credentials can be used to build the Execution
                    Engine. If the Engine is provided, it will be used instead
                data_context (DataContext):
                    An object representing a Great Expectations project that can be used to access Expectation
                    Suites and the Project Data itself
                engine (Engine):
                    A SqlAlchemy Engine used to set the SqlAlchemyExecutionEngine being configured, useful if an
                    Engine has already been configured and should be reused. Will override Credentials
                    if provided.
                connection_string (string):
                    If neither the engines nor the credentials have been provided, a connection string can be used
                    to access the data. This will be overridden by both the engine and credentials if those are
                    provided.
                url (string):
                    If neither the engines, the credentials, nor the connection_string have been provided,
                    a url can be used to access the data. This will be overridden by all other configuration
                    options if any are provided.
                batch_data_dict (dict):
                    Forwarded unchanged to the parent class initializer.
                create_temp_table (bool):
                    Stored as the temp-table setting of this engine. It is forced to False for the "trino" and
                    "awsathena" dialects.
                concurrency (ConcurrencyConfig):
                    Concurrency config used to configure the sqlalchemy engine. When omitted, the concurrency
                    config of data_context is used if available, otherwise a default ConcurrencyConfig. It is only
                    consulted when this constructor creates the engine itself (i.e. no engine was passed in).
                **kwargs:
                    Passed as optional parameters to the SQLAlchemy engine creation and merged into the stored
                    configuration of this object.

            Raises:
                InvalidConfigError: If none of engine, credentials, connection_string or url is provided.
        """
        super().__init__(name=name, batch_data_dict=batch_data_dict)
        self._name = name

        self._credentials = credentials
        self._connection_string = connection_string
        self._url = url
        self._create_temp_table = create_temp_table

        if engine is not None:
            if credentials is not None:
                logger.warning(
                    "Both credentials and engine were provided during initialization of SqlAlchemyExecutionEngine. "
                    "Ignoring credentials."
                )
            self.engine = engine
        else:
            # Honor an explicitly supplied concurrency config; only fall back to the data context (and then to
            # the defaults) when the caller did not provide one.
            if concurrency is None:
                if data_context is None or data_context.concurrency is None:
                    concurrency = ConcurrencyConfig()
                else:
                    concurrency = data_context.concurrency

            # Let the concurrency config contribute its create_engine parameters to kwargs before the engine
            # is created below.
            concurrency.add_sqlalchemy_create_engine_parameters(kwargs)

            if credentials is not None:
                self.engine = self._build_engine(credentials=credentials, **kwargs)
            elif connection_string is not None:
                self.engine = sa.create_engine(connection_string, **kwargs)
            elif url is not None:
                parsed_url = make_url(url)
                self.drivername = parsed_url.drivername
                self.engine = sa.create_engine(url, **kwargs)
            else:
                raise InvalidConfigError(
                    "Credentials or an engine are required for a SqlAlchemyExecutionEngine."
                )

        # The lower-cased dialect name drives every backend-specific decision below.
        dialect_name = self.engine.dialect.name.lower()

        # Temp table creation is not supported on these backends, so the setting is forced to False.
        if dialect_name in (
            "trino",
            "awsathena",  # WKS 202201 - AWS Athena currently doesn't support temp_tables.
        ):
            self._create_temp_table = False

        # Get the dialect **for purposes of identifying types**
        if dialect_name in ("postgresql", "mysql", "sqlite", "oracle", "mssql"):
            # These are the officially included and supported dialects by sqlalchemy
            self.dialect_module = import_library_module(
                module_name=f"sqlalchemy.dialects.{self.engine.dialect.name}"
            )
        elif dialect_name == "snowflake":
            self.dialect_module = import_library_module(
                module_name="snowflake.sqlalchemy.snowdialect"
            )
        elif dialect_name == "dremio":
            # WARNING: Dremio Support is experimental, functionality is not fully under test
            self.dialect_module = import_library_module(
                module_name="sqlalchemy_dremio.pyodbc"
            )
        elif dialect_name == "redshift":
            self.dialect_module = import_library_module(
                module_name="sqlalchemy_redshift.dialect"
            )
        elif dialect_name == "bigquery":
            self.dialect_module = import_library_module(
                module_name=_BIGQUERY_MODULE_NAME
            )
        elif dialect_name == "teradatasql":
            # WARNING: Teradata Support is experimental, functionality is not fully under test
            self.dialect_module = import_library_module(
                module_name="teradatasqlalchemy.dialect"
            )
        else:
            self.dialect_module = None

        # <WILL> 20210726 - engine_backup is used by the snowflake connector, which requires connection and engine
        # to be closed and disposed separately. Currently self.engine can refer to either a Connection or Engine,
        # depending on the backend. This will need to be cleaned up in an upcoming refactor, so that Engine and
        # Connection can be handled separately.
        self._engine_backup = None
        if self.engine and dialect_name in ("sqlite", "mssql", "snowflake", "mysql"):
            self._engine_backup = self.engine
            # sqlite/mssql temp tables only persist within a connection so override the engine
            self.engine = self.engine.connect()

        # Send a connect event to provide dialect type
        if data_context is not None and getattr(
            data_context, "_usage_statistics_handler", None
        ):
            handler = data_context._usage_statistics_handler
            handler.send_usage_message(
                event=UsageStatsEvents.EXECUTION_ENGINE_SQLALCHEMY_CONNECT.value,
                event_payload={
                    "anonymized_name": handler.anonymizer.anonymize(self.name),
                    "sqlalchemy_dialect": self.engine.name,
                },
                success=True,
            )

        # Gather the call arguments of the present function (and add the "class_name"), filter out the Falsy values,
        # and set the instance "_config" variable equal to the resulting dictionary.
        self._config = {
            "name": name,
            "credentials": credentials,
            "data_context": data_context,
            "engine": engine,
            "connection_string": connection_string,
            "url": url,
            "batch_data_dict": batch_data_dict,
            "module_name": self.__class__.__module__,
            "class_name": self.__class__.__name__,
        }
        self._config.update(kwargs)
        filter_properties_dict(properties=self._config, clean_falsy=True, inplace=True)

        self._data_splitter = SqlAlchemyDataSplitter()
        self._data_sampler = SqlAlchemyDataSampler()
예제 #4
0
    def __init__(
        self,
        name=None,
        credentials=None,
        data_context=None,
        engine=None,
        connection_string=None,
        url=None,
        batch_data_dict=None,
        **kwargs,  # These will be passed as optional parameters to the SQLAlchemy engine, **not** the ExecutionEngine
    ):
        """Set up a SqlAlchemyExecutionEngine from an engine, credentials, a connection string or a url.

        Exactly one connection source is used, chosen in this order of precedence: engine, credentials,
        connection_string, url. The dialect module used for identifying types is then resolved, and a usage
        statistics "connect" event is sent when the data context carries a usage statistics handler.

            Args:
                name (str):
                    The name of the SqlAlchemyExecutionEngine
                credentials:
                    Used to build the engine when no engine is passed in. Ignored (with a warning) when an
                    engine is also provided.
                data_context (DataContext):
                    An object representing a Great Expectations project that can be used to access Expectation
                    Suites and the Project Data itself
                engine (Engine):
                    An already configured SqlAlchemy Engine to reuse. Takes precedence over every other
                    connection option.
                connection_string (string):
                    Used to create the engine when neither an engine nor credentials are provided.
                url (string):
                    Used to create the engine when none of engine, credentials or connection_string is
                    provided. Its scheme is stored as ``drivername``.
                batch_data_dict (dict):
                    Forwarded unchanged to the parent class initializer.
                **kwargs:
                    Passed as optional parameters to the SQLAlchemy engine creation.

            Raises:
                InvalidConfigError: If none of engine, credentials, connection_string or url is provided.
        """
        super().__init__(name=name, batch_data_dict=batch_data_dict)  # , **kwargs)
        self._name = name

        self._credentials = credentials
        self._connection_string = connection_string
        self._url = url

        # Pick the connection source; the first option that was provided wins.
        if engine is not None:
            if credentials is not None:
                logger.warning(
                    "Both credentials and engine were provided during initialization of SqlAlchemyExecutionEngine. "
                    "Ignoring credentials."
                )
            self.engine = engine
        elif credentials is not None:
            self.engine = self._build_engine(credentials=credentials, **kwargs)
        elif connection_string is not None:
            self.engine = sa.create_engine(connection_string, **kwargs)
        elif url is not None:
            self.drivername = urlparse(url).scheme
            self.engine = sa.create_engine(url, **kwargs)
        else:
            raise InvalidConfigError(
                "Credentials or an engine are required for a SqlAlchemyExecutionEngine."
            )

        # Resolve the dialect module **for purposes of identifying types**.
        dialect_name = self.engine.dialect.name.lower()
        bundled_dialects = ("postgresql", "mysql", "sqlite", "oracle", "mssql")
        third_party_dialect_modules = {
            "snowflake": "snowflake.sqlalchemy.snowdialect",
            "redshift": "sqlalchemy_redshift.dialect",
            "bigquery": "pybigquery.sqlalchemy_bigquery",
        }

        if dialect_name in bundled_dialects:
            # These dialects ship with sqlalchemy itself; the module path uses the dialect name as reported
            # by the engine (not the lower-cased copy).
            dialect_module_name = "sqlalchemy.dialects." + self.engine.dialect.name
        else:
            dialect_module_name = third_party_dialect_modules.get(dialect_name)

        if dialect_module_name is None:
            self.dialect = None
        else:
            self.dialect = import_library_module(module_name=dialect_module_name)

        if self.engine and dialect_name in ("sqlite", "mssql", "snowflake"):
            # sqlite/mssql temp tables only persist within a connection so override the engine
            self.engine = self.engine.connect()

        # Send a connect event to provide dialect type
        usage_handler = (
            getattr(data_context, "_usage_statistics_handler", None)
            if data_context is not None
            else None
        )
        if usage_handler:
            anonymized_name = usage_handler._execution_engine_anonymizer.anonymize(
                self.name
            )
            usage_handler.send_usage_message(
                event="execution_engine.sqlalchemy.connect",
                event_payload={
                    "anonymized_name": anonymized_name,
                    "sqlalchemy_dialect": self.engine.name,
                },
                success=True,
            )

        # Record the arguments this constructor was called with (plus "class_name") as the instance "_config",
        # then let filter_properties_dict clean the dictionary in place.
        # NOTE(review): the helper's name suggests it inspects the calling frame, so this call presumably has to
        # stay directly inside __init__ -- confirm before moving it into another function.
        self._config = get_currently_executing_function_call_arguments(
            class_name=self.__class__.__name__
        )
        filter_properties_dict(properties=self._config, inplace=True)