def _get_data_asset_class(self, data_asset_type):
    """Return the class to be used to generate a data_asset from this datasource.

    Args:
        data_asset_type: either a string naming a class inside the legacy
            ``custom_data_assets`` plugin module, or a ``ClassConfig``
            carrying ``module_name`` and ``class_name``.

    Returns:
        The resolved data asset class object.

    Raises:
        InvalidConfigError: if the module or attribute cannot be imported or
            found, or if ``data_asset_type`` is neither a string nor a
            ``ClassConfig``.
    """
    if isinstance(data_asset_type, string_types):
        # We have a custom type, but it is defined with only a string
        try:
            # BUG FIX: the two message fragments were previously concatenated
            # without a separating space ("...data_asset_typeusing...").
            logger.warning(
                "Use of custom_data_assets module is deprecated. Please define data_asset_type "
                "using a module_name and class_name."
            )
            # FOR LEGACY REASONS support the fixed "custom_data_assets" name
            # FIXME: this option should be removed in a future release
            custom_data_assets_module = __import__(
                "custom_data_assets", fromlist=["custom_data_assets"]
            )
            data_asset_type_class = getattr(custom_data_assets_module, data_asset_type)
            return data_asset_type_class
        except ImportError:
            logger.error(
                "Unable to import custom_data_asset module. "
                "Check the plugins directory for 'custom_data_assets'."
            )
            raise InvalidConfigError(
                "Unable to import custom_data_asset module. "
                "Check the plugins directory for 'custom_data_assets'."
            )
        except AttributeError:
            logger.error("Unable to find data_asset_type: '%s'." % data_asset_type)
            raise InvalidConfigError(
                "Unable to find data_asset_type: '%s'." % data_asset_type
            )
    elif isinstance(data_asset_type, ClassConfig):
        try:
            if data_asset_type.module_name is None:
                # NOTE: mutates the passed ClassConfig in place to record the
                # default module; callers may observe this after the call.
                data_asset_type.module_name = "great_expectations.dataset"

            loaded_module = import_module(data_asset_type.module_name)
            data_asset_type_class = getattr(loaded_module, data_asset_type.class_name)
            return data_asset_type_class
        except ImportError:
            logger.error("Unable to find module '%s'." % data_asset_type.module_name)
            raise InvalidConfigError(
                "Unable to find module '%s'." % data_asset_type.module_name
            )
        except AttributeError:
            logger.error(
                "Unable to find data_asset_type: '%s' in module '%s'."
                % (data_asset_type.class_name, data_asset_type.module_name)
            )
            raise InvalidConfigError(
                "Unable to find data_asset_type: '%s' in module '%s'."
                % (data_asset_type.class_name, data_asset_type.module_name)
            )
    else:
        raise InvalidConfigError("Invalid configuration for data_asset_type")
def substitute_config_variable(template_str, config_variables_dict):
    """Substitute a config-variable reference found in *template_str*.

    If *template_str* contains a pattern ``${SOME_VARIABLE}`` or
    ``$SOME_VARIABLE``, return a string where the pattern is replaced with the
    value of SOME_VARIABLE; otherwise return the input unchanged.

    Lookup order: if the environment variable SOME_VARIABLE is set, its value
    is used for substitution; otherwise the value is looked up in the config
    variables store (file).

    :param template_str: a string that might or might not be of the form
        ``${SOME_VARIABLE}`` or ``$SOME_VARIABLE`` (non-string values,
        e.g. booleans, are returned unchanged)
    :param config_variables_dict: a dictionary of config variables. It is
        loaded from the config variables store (by default,
        "uncommitted/config_variables.yml" file)
    :return: the input with the variable reference substituted, or the input
        unchanged when no reference is present
    :raises InvalidConfigError: if a variable reference is found but no value
        is available from the environment or from *config_variables_dict*
    """
    if template_str is None:
        return template_str

    try:
        # BUG FIX: the bare "$VAR" fallback pattern previously accepted only
        # lowercase names ([_a-z][_a-z0-9]*), contradicting the documented
        # $SOME_VARIABLE convention; uppercase letters are now matched too.
        match = re.search(r"\$\{(.*?)\}", template_str) or re.search(
            r"\$([_a-zA-Z][_a-zA-Z0-9]*)", template_str
        )
    except TypeError:
        # If the value is not a string (e.g., a boolean), we should return it as is
        return template_str

    if match:
        # Environment variables take precedence over the config variables store.
        config_variable_value = os.getenv(match.group(1))
        if not config_variable_value:
            config_variable_value = config_variables_dict.get(match.group(1))

        if config_variable_value:
            if match.start() == 0 and match.end() == len(template_str):
                # The whole string is the reference: return the raw value
                # (it need not be a string).
                return config_variable_value
            return (
                template_str[: match.start()]
                + config_variable_value
                + template_str[match.end() :]
            )

        raise InvalidConfigError(
            "Unable to find match for config variable {:s}. See https://great-expectations.readthedocs.io/en/latest/reference/data_context_reference.html#managing-environment-and-secrets"
            .format(match.group(1))
        )

    return template_str
def __init__(
    self,
    name: Optional[str] = None,
    credentials: Optional[dict] = None,
    data_context: Optional[Any] = None,
    engine=None,
    connection_string: Optional[str] = None,
    url: Optional[str] = None,
    batch_data_dict: Optional[dict] = None,
    create_temp_table: bool = True,
    concurrency: Optional[ConcurrencyConfig] = None,
    **kwargs,  # These will be passed as optional parameters to the SQLAlchemy engine, **not** the ExecutionEngine
) -> None:
    """Builds a SqlAlchemyExecutionEngine, using a provided connection string/url/engine/credentials to access the
    desired database. Also initializes the dialect to be used and configures usage statistics.

    Args:
        name (str): \
            The name of the SqlAlchemyExecutionEngine
        credentials: \
            If the Execution Engine is not provided, the credentials can be used to build the Execution
            Engine. If the Engine is provided, it will be used instead
        data_context (DataContext): \
            An object representing a Great Expectations project that can be used to access Expectation
            Suites and the Project Data itself
        engine (Engine): \
            A SqlAlchemy Engine used to set the SqlAlchemyExecutionEngine being configured, useful if an
            Engine has already been configured and should be reused. Will override Credentials
            if provided.
        connection_string (string): \
            If neither the engines nor the credentials have been provided, a connection string can be used
            to access the data. This will be overridden by both the engine and credentials if those are provided.
        url (string): \
            If neither the engines, the credentials, nor the connection_string have been provided,
            a url can be used to access the data. This will be overridden by all other configuration
            options if any are provided.
        batch_data_dict (dict): \
            Batch data to pass through to the base ExecutionEngine constructor.
        create_temp_table (bool): \
            Whether temp tables may be created; forced to False for the trino
            and awsathena dialects below.
        concurrency (ConcurrencyConfig): Concurrency config used to configure the sqlalchemy engine.
    """
    super().__init__(name=name, batch_data_dict=batch_data_dict)
    self._name = name

    self._credentials = credentials
    self._connection_string = connection_string
    self._url = url
    self._create_temp_table = create_temp_table

    if engine is not None:
        if credentials is not None:
            # An explicit engine always wins over credentials.
            logger.warning(
                "Both credentials and engine were provided during initialization of SqlAlchemyExecutionEngine. "
                "Ignoring credentials."
            )
        self.engine = engine
    else:
        # NOTE(review): this local annotation shadows the `concurrency`
        # constructor argument, which is therefore never read in this method
        # — the effective config always comes from the data_context (or a
        # fresh default). Confirm whether the parameter should be honored.
        concurrency: ConcurrencyConfig
        if data_context is None or data_context.concurrency is None:
            concurrency = ConcurrencyConfig()
        else:
            concurrency = data_context.concurrency

        # Lets the concurrency config inject extra create_engine kwargs.
        concurrency.add_sqlalchemy_create_engine_parameters(kwargs)

        # Precedence for building the engine: credentials > connection_string > url.
        if credentials is not None:
            self.engine = self._build_engine(credentials=credentials, **kwargs)
        elif connection_string is not None:
            self.engine = sa.create_engine(connection_string, **kwargs)
        elif url is not None:
            parsed_url = make_url(url)
            self.drivername = parsed_url.drivername
            self.engine = sa.create_engine(url, **kwargs)
        else:
            raise InvalidConfigError(
                "Credentials or an engine are required for a SqlAlchemyExecutionEngine."
            )

    # these are two backends where temp_table_creation is not supported we set the default value to False.
    if self.engine.dialect.name.lower() in [
        "trino",
        "awsathena",  # WKS 202201 - AWS Athena currently doesn't support temp_tables.
    ]:
        self._create_temp_table = False

    # Get the dialect **for purposes of identifying types**
    if self.engine.dialect.name.lower() in [
        "postgresql",
        "mysql",
        "sqlite",
        "oracle",
        "mssql",
    ]:
        # These are the officially included and supported dialects by sqlalchemy
        self.dialect_module = import_library_module(
            module_name=f"sqlalchemy.dialects.{self.engine.dialect.name}"
        )
    elif self.engine.dialect.name.lower() == "snowflake":
        self.dialect_module = import_library_module(
            module_name="snowflake.sqlalchemy.snowdialect"
        )
    elif self.engine.dialect.name.lower() == "dremio":
        # WARNING: Dremio Support is experimental, functionality is not fully under test
        self.dialect_module = import_library_module(
            module_name="sqlalchemy_dremio.pyodbc"
        )
    elif self.engine.dialect.name.lower() == "redshift":
        self.dialect_module = import_library_module(
            module_name="sqlalchemy_redshift.dialect"
        )
    elif self.engine.dialect.name.lower() == "bigquery":
        self.dialect_module = import_library_module(
            module_name=_BIGQUERY_MODULE_NAME
        )
    elif self.engine.dialect.name.lower() == "teradatasql":
        # WARNING: Teradata Support is experimental, functionality is not fully under test
        self.dialect_module = import_library_module(
            module_name="teradatasqlalchemy.dialect"
        )
    else:
        # Unknown dialect: no type-identification module is available.
        self.dialect_module = None

    # <WILL> 20210726 - engine_backup is used by the snowflake connector, which requires connection and engine
    # to be closed and disposed separately. Currently self.engine can refer to either a Connection or Engine,
    # depending on the backend. This will need to be cleaned up in an upcoming refactor, so that Engine and
    # Connection can be handled separately.
    self._engine_backup = None
    if self.engine and self.engine.dialect.name.lower() in [
        "sqlite",
        "mssql",
        "snowflake",
        "mysql",
    ]:
        self._engine_backup = self.engine
        # sqlite/mssql temp tables only persist within a connection so override the engine
        self.engine = self.engine.connect()

    # Send a connect event to provide dialect type
    if data_context is not None and getattr(
        data_context, "_usage_statistics_handler", None
    ):
        handler = data_context._usage_statistics_handler
        handler.send_usage_message(
            event=UsageStatsEvents.EXECUTION_ENGINE_SQLALCHEMY_CONNECT.value,
            event_payload={
                "anonymized_name": handler.anonymizer.anonymize(self.name),
                "sqlalchemy_dialect": self.engine.name,
            },
            success=True,
        )

    # Gather the call arguments of the present function (and add the "class_name"), filter out the Falsy values,
    # and set the instance "_config" variable equal to the resulting dictionary.
    # NOTE(review): `create_temp_table` and `concurrency` are not captured
    # here — confirm whether that omission is intentional.
    self._config = {
        "name": name,
        "credentials": credentials,
        "data_context": data_context,
        "engine": engine,
        "connection_string": connection_string,
        "url": url,
        "batch_data_dict": batch_data_dict,
        "module_name": self.__class__.__module__,
        "class_name": self.__class__.__name__,
    }
    self._config.update(kwargs)
    filter_properties_dict(properties=self._config, clean_falsy=True, inplace=True)

    self._data_splitter = SqlAlchemyDataSplitter()
    self._data_sampler = SqlAlchemyDataSampler()
def __init__(
    self,
    name=None,
    credentials=None,
    data_context=None,
    engine=None,
    connection_string=None,
    url=None,
    batch_data_dict=None,
    **kwargs,  # These will be passed as optional parameters to the SQLAlchemy engine, **not** the ExecutionEngine
):
    """Builds a SqlAlchemyExecutionEngine, using a provided connection string/url/engine/credentials to access the
    desired database. Also initializes the dialect to be used and configures usage statistics.

    Args:
        name (str): \
            The name of the SqlAlchemyExecutionEngine
        credentials: \
            If the Execution Engine is not provided, the credentials can be used to build the Execution
            Engine. If the Engine is provided, it will be used instead
        data_context (DataContext): \
            An object representing a Great Expectations project that can be used to access Expectation
            Suites and the Project Data itself
        engine (Engine): \
            A SqlAlchemy Engine used to set the SqlAlchemyExecutionEngine being configured, useful if an
            Engine has already been configured and should be reused. Will override Credentials
            if provided.
        connection_string (string): \
            If neither the engines nor the credentials have been provided, a connection string can be used
            to access the data. This will be overridden by both the engine and credentials if those are provided.
        url (string): \
            If neither the engines, the credentials, nor the connection_string have been provided,
            a url can be used to access the data. This will be overridden by all other configuration
            options if any are provided.
        batch_data_dict (dict): \
            Batch data to pass through to the base ExecutionEngine constructor.

    Raises:
        InvalidConfigError: if none of engine, credentials, connection_string,
            or url is provided.
    """
    super().__init__(name=name, batch_data_dict=batch_data_dict)
    self._name = name
    self._credentials = credentials
    self._connection_string = connection_string
    self._url = url

    # Precedence for obtaining the engine:
    # engine > credentials > connection_string > url.
    if engine is not None:
        if credentials is not None:
            logger.warning(
                "Both credentials and engine were provided during initialization of SqlAlchemyExecutionEngine. "
                "Ignoring credentials."
            )
        self.engine = engine
    elif credentials is not None:
        self.engine = self._build_engine(credentials=credentials, **kwargs)
    elif connection_string is not None:
        self.engine = sa.create_engine(connection_string, **kwargs)
    elif url is not None:
        self.drivername = urlparse(url).scheme
        self.engine = sa.create_engine(url, **kwargs)
    else:
        raise InvalidConfigError(
            "Credentials or an engine are required for a SqlAlchemyExecutionEngine."
        )

    # Get the dialect **for purposes of identifying types**
    # BUG FIX: the membership list previously contained "oracle" twice.
    if self.engine.dialect.name.lower() in [
        "postgresql",
        "mysql",
        "sqlite",
        "oracle",
        "mssql",
    ]:
        # These are the officially included and supported dialects by sqlalchemy
        self.dialect = import_library_module(
            module_name="sqlalchemy.dialects." + self.engine.dialect.name
        )
    elif self.engine.dialect.name.lower() == "snowflake":
        self.dialect = import_library_module(
            module_name="snowflake.sqlalchemy.snowdialect"
        )
    elif self.engine.dialect.name.lower() == "redshift":
        self.dialect = import_library_module(
            module_name="sqlalchemy_redshift.dialect"
        )
    elif self.engine.dialect.name.lower() == "bigquery":
        self.dialect = import_library_module(
            module_name="pybigquery.sqlalchemy_bigquery"
        )
    else:
        # Unknown dialect: no type-identification module is available.
        self.dialect = None

    if self.engine and self.engine.dialect.name.lower() in [
        "sqlite",
        "mssql",
        "snowflake",
    ]:
        # sqlite/mssql temp tables only persist within a connection so override the engine
        self.engine = self.engine.connect()

    # Send a connect event to provide dialect type
    if data_context is not None and getattr(
        data_context, "_usage_statistics_handler", None
    ):
        handler = data_context._usage_statistics_handler
        handler.send_usage_message(
            event="execution_engine.sqlalchemy.connect",
            event_payload={
                "anonymized_name": handler._execution_engine_anonymizer.anonymize(
                    self.name
                ),
                "sqlalchemy_dialect": self.engine.name,
            },
            success=True,
        )

    # Gather the call arguments of the present function (and add the "class_name"), filter out the Falsy values,
    # and set the instance "_config" variable equal to the resulting dictionary.
    self._config = get_currently_executing_function_call_arguments(
        **{"class_name": self.__class__.__name__}
    )
    filter_properties_dict(
        properties=self._config,
        inplace=True,
    )