def test_standalone_sqlalchemy_datasource(test_db_connection_string, sa):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy",
        connection_string=test_db_connection_string,
        echo=False,
        batch_kwargs_generators={"default": {"class_name": "TableBatchKwargsGenerator"}},
    )

    assert set(datasource.get_available_data_asset_names()["default"]["names"]) == {
        ("main.table_1", "table"),
        ("main.table_2", "table"),
    }

    batch_kwargs = datasource.build_batch_kwargs("default", "main.table_1")
    batch = datasource.get_batch(batch_kwargs=batch_kwargs)
    assert isinstance(batch, Batch)
    batch_data = batch.data
    assert isinstance(
        batch_data,
        great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference,
    )
    dataset = SqlAlchemyDataset(**batch.data.get_init_kwargs())
    assert len(dataset.head(10)) == 5

def test_sqlalchemy_source_limit(sqlitedb_engine): df1 = pd.DataFrame({ "col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"] }) df2 = pd.DataFrame({ "col_1": [0, 1, 2, 3, 4], "col_2": ["b", "c", "d", "e", "f"] }) df1.to_sql("table_1", con=sqlitedb_engine, index=True) df2.to_sql("table_2", con=sqlitedb_engine, index=True, schema="main") datasource = SqlAlchemyDatasource("SqlAlchemy", engine=sqlitedb_engine) limited_batch = datasource.get_batch({ "table": "table_1", "limit": 1, "offset": 2 }) assert isinstance(limited_batch, Batch) limited_dataset = Validator( limited_batch, expectation_suite=ExpectationSuite("test"), expectation_engine=SqlAlchemyDataset, ).get_dataset() assert limited_dataset._table.name.startswith( "ge_tmp_") # we have generated a temporary table assert len(limited_dataset.head(10)) == 1 # and it is only one row long assert limited_dataset.head( 10)["col_1"][0] == 3 # offset should have been applied
def test_sqlalchemy_source_limit(sqlitedb_engine):
    df1 = pd.DataFrame({'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']})
    df2 = pd.DataFrame({'col_1': [0, 1, 2, 3, 4], 'col_2': ['b', 'c', 'd', 'e', 'f']})
    df1.to_sql('table_1', con=sqlitedb_engine, index=True)
    df2.to_sql('table_2', con=sqlitedb_engine, index=True, schema='main')
    datasource = SqlAlchemyDatasource('SqlAlchemy', engine=sqlitedb_engine)
    limited_batch = datasource.get_batch({"table": "table_1", "limit": 1, "offset": 2})
    assert isinstance(limited_batch, Batch)
    limited_dataset = Validator(
        limited_batch,
        expectation_suite=ExpectationSuite("test"),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    assert limited_dataset._table.name.startswith("ge_tmp_")  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(10)['col_1'][0] == 3  # offset should have been applied

def test_sqlalchemy_source_templating(sqlitedb_engine, empty_data_context):
    context: DataContext = empty_data_context
    datasource = SqlAlchemyDatasource(
        engine=sqlitedb_engine,
        batch_kwargs_generators={"foo": {"class_name": "QueryBatchKwargsGenerator"}},
    )
    generator = datasource.get_batch_kwargs_generator("foo")
    generator.add_query(data_asset_name="test", query="select 'cat' as ${col_name};")
    batch = datasource.get_batch(
        generator.build_batch_kwargs("test", query_parameters={"col_name": "animal_name"})
    )
    dataset = BridgeValidator(
        batch,
        expectation_suite=ExpectationSuite("test", data_context=context),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    res = dataset.expect_column_to_exist("animal_name")
    assert res.success is True
    res = dataset.expect_column_values_to_be_in_set("animal_name", ["cat"])
    assert res.success is True

def test_sqlalchemysource_templating(sqlitedb_engine):
    datasource = SqlAlchemyDatasource(engine=sqlitedb_engine)
    generator = datasource.get_generator()
    generator.add_query("test", "select 'cat' as ${col_name};")
    df = datasource.get_batch("test", col_name="animal_name")
    res = df.expect_column_to_exist("animal_name")
    assert res["success"] == True

def test_standalone_sqlalchemy_datasource(test_db_connection_string):
    datasource = SqlAlchemyDatasource(
        'SqlAlchemy', connection_string=test_db_connection_string, echo=False
    )

    assert datasource.get_available_data_asset_names() == {
        "default": {"main.table_1", "main.table_2"}
    }
    dataset1 = datasource.get_batch("main.table_1")
    dataset2 = datasource.get_batch("main.table_2")
    assert isinstance(dataset1, SqlAlchemyDataset)
    assert isinstance(dataset2, SqlAlchemyDataset)

def test_query_generator_view(sqlite_view_engine):
    # Build a datasource with a TableBatchKwargsGenerator to introspect our database, which contains a view
    datasource = SqlAlchemyDatasource(
        engine=sqlite_view_engine,
        batch_kwargs_generators={"table": {"class_name": "TableBatchKwargsGenerator"}},
    )
    names = set(datasource.get_available_data_asset_names()["table"]["names"])
    # We should see both the table *and* the primary view, but *not* the temp view
    assert names == {("main.test_table", "table"), ("main.test_view", "view")}

def test_query_generator_view(sqlite_view_engine):
    # Build a datasource with a queries generator to introspect our database with a view
    datasource = SqlAlchemyDatasource(
        engine=sqlite_view_engine,
        generators={"query": {"type": "queries"}},
    )
    names = set(datasource.get_available_data_asset_names()["query"])
    # We should see both the table *and* the primary view, but *not* the temp view
    assert names == {"main.test_table", "main.test_view"}

def test_sqlalchemy_source_templating(sqlitedb_engine): datasource = SqlAlchemyDatasource( engine=sqlitedb_engine, generators={"foo": { "class_name": "QueryGenerator" }}) generator = datasource.get_generator("foo") generator.add_query("test", "select 'cat' as ${col_name};") df = datasource.get_batch( "test", "my_suite", generator.yield_batch_kwargs("test", col_name="animal_name")) res = df.expect_column_to_exist("animal_name") assert res["success"] is True
def test_sqlalchemy_datasource_query_and_table_handling(sqlitedb_engine):
    # MANUALLY SET DIALECT NAME FOR TEST
    datasource = SqlAlchemyDatasource('SqlAlchemy', engine=sqlitedb_engine)

    with mock.patch(
        "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference.__init__",
        return_value=None,
    ) as mock_batch:
        datasource.get_batch({"query": "select * from foo;"})
        mock_batch.assert_called_once_with(
            engine=sqlitedb_engine,
            schema=None,
            query="select * from foo;",
            table_name=None,
        )

    # Normally, we do not allow both query and table_name
    with mock.patch(
        "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference.__init__",
        return_value=None,
    ) as mock_batch:
        datasource.get_batch({"query": "select * from foo;", "table_name": "bar"})
        mock_batch.assert_called_once_with(
            engine=sqlitedb_engine,
            schema=None,
            query="select * from foo;",
            table_name=None,
        )

    # Snowflake should allow *both* query *and* table_name
    sqlitedb_engine.dialect.name = "snowflake"
    with mock.patch(
        "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference.__init__",
        return_value=None,
    ) as mock_batch:
        datasource.get_batch({"query": "select * from foo;", "table_name": "bar"})
        mock_batch.assert_called_once_with(
            engine=sqlitedb_engine,
            schema=None,
            query="select * from foo;",
            table_name="bar",
        )

def test_sqlalchemy_source_templating(sqlitedb_engine): datasource = SqlAlchemyDatasource(engine=sqlitedb_engine, generators={ "foo": { "class_name": "QueryBatchKwargsGenerator" } }) generator = datasource.get_generator("foo") generator.add_query("test", "select 'cat' as ${col_name};") batch = datasource.get_batch(generator.build_batch_kwargs("test", query_parameters={'col_name': "animal_name"})) dataset = Validator(batch, expectation_suite=ExpectationSuite("test"), expectation_engine=SqlAlchemyDataset).get_dataset() res = dataset.expect_column_to_exist("animal_name") assert res.success is True res = dataset.expect_column_values_to_be_in_set('animal_name', ['cat']) assert res.success is True
def test_sqlalchemy_datasource_processes_dataset_options(test_db_connection_string):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy", credentials={"url": test_db_connection_string}
    )
    batch_kwargs = datasource.process_batch_parameters(dataset_options={"caching": False})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False

    batch_kwargs = datasource.process_batch_parameters(dataset_options={"caching": True})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is True

    batch_kwargs = {
        "query": "select * from table_1;",
        "dataset_options": {"caching": False},
    }
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False

def test_sqlalchemy_source_limit(sqlitedb_engine):
    df1 = pd.DataFrame({'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']})
    df2 = pd.DataFrame({'col_1': [0, 1, 2, 3, 4], 'col_2': ['b', 'c', 'd', 'e', 'f']})
    df1.to_sql('table_1', con=sqlitedb_engine, index=True)
    df2.to_sql('table_2', con=sqlitedb_engine, index=True, schema='main')
    datasource = SqlAlchemyDatasource('SqlAlchemy', engine=sqlitedb_engine)
    limited_dataset = datasource.get_data_asset("table_1", "default", limit=1, offset=2)
    assert isinstance(limited_dataset, SqlAlchemyDataset)
    assert limited_dataset._table.name.startswith("ge_tmp_")  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(10)['col_1'][0] == 3  # offset should have been applied

def _add_sqlalchemy_datasource(context, prompt_for_datasource_name=True):
    msg_success_database = (
        "\n<green>Great Expectations connected to your database!</green>"
    )

    if not _verify_sqlalchemy_dependent_modules():
        return None

    db_choices = [str(x) for x in list(range(1, 1 + len(SupportedDatabases)))]
    selected_database = (
        int(
            click.prompt(
                msg_prompt_choose_database,
                type=click.Choice(db_choices),
                show_choices=False,
            )
        )
        - 1
    )  # don't show user a zero index list :)
    selected_database = list(SupportedDatabases)[selected_database]

    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "sqlalchemy", "db": selected_database.name},
        success=True,
    )

    datasource_name = "my_{}_db".format(selected_database.value.lower())
    if selected_database == SupportedDatabases.OTHER:
        datasource_name = "my_database"
    if prompt_for_datasource_name:
        datasource_name = click.prompt(
            msg_prompt_datasource_name, default=datasource_name
        )

    credentials = {}
    # Since we don't want to save the database credentials in the config file that will be
    # committed in the repo, we will use our Variable Substitution feature to store the credentials
    # in the credentials file (that will not be committed, since it is in the uncommitted directory)
    # with the datasource's name as the variable name.
    # The value of the datasource's "credentials" key in the config file (great_expectations.yml)
    # will be ${datasource name}.
    # Great Expectations will replace the ${datasource name} with the value from the credentials
    # file at runtime. (An illustrative sketch of this flow follows this function.)
    while True:
        cli_message(msg_db_config.format(datasource_name))

        if selected_database == SupportedDatabases.MYSQL:
            if not _verify_mysql_dependent_modules():
                return None
            credentials = _collect_mysql_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.POSTGRES:
            if not _verify_postgresql_dependent_modules():
                return None
            credentials = _collect_postgres_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.REDSHIFT:
            if not _verify_redshift_dependent_modules():
                return None
            credentials = _collect_redshift_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.SNOWFLAKE:
            if not _verify_snowflake_dependent_modules():
                return None
            credentials = _collect_snowflake_credentials(
                default_credentials=credentials
            )
        elif selected_database == SupportedDatabases.BIGQUERY:
            if not _verify_bigquery_dependent_modules():
                return None
            credentials = _collect_bigquery_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.OTHER:
            sqlalchemy_url = click.prompt(
                """What is the url/connection string for the sqlalchemy connection?
(reference: https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls)
""",
                show_default=False,
            ).strip()
            credentials = {"url": sqlalchemy_url}

        context.save_config_variable(datasource_name, credentials)

        message = """
<red>Cannot connect to the database.</red>
  - Please check your environment and the configuration you provided.
  - Database Error: {0:s}"""
        try:
            cli_message(
                "<cyan>Attempting to connect to your database. This may take a moment...</cyan>"
            )

            configuration = SqlAlchemyDatasource.build_configuration(
                credentials="${" + datasource_name + "}"
            )

            configuration["class_name"] = "SqlAlchemyDatasource"
            configuration["module_name"] = "great_expectations.datasource"
            errors = DatasourceConfigSchema().validate(configuration)
            if len(errors) != 0:
                raise ge_exceptions.GreatExpectationsError(
                    "Invalid Datasource configuration: {:s}".format(errors)
                )

            cli_message(
                """
Great Expectations will now add a new Datasource '{0:s}' to your deployment, by adding this entry to your great_expectations.yml:

{1:s}
The credentials will be saved in uncommitted/config_variables.yml under the key '{0:s}'
""".format(
                    datasource_name,
                    textwrap.indent(
                        toolkit.yaml.dump({datasource_name: configuration}), "  "
                    ),
                )
            )

            toolkit.confirm_proceed_or_exit()
            context.add_datasource(name=datasource_name, **configuration)
            cli_message(msg_success_database)
            break
        except ModuleNotFoundError as de:
            cli_message(message.format(str(de)))
            return None
        except DatasourceInitializationError as de:
            cli_message(message.format(str(de)))
            if not click.confirm("Enter the credentials again?", default=True):
                context.add_datasource(
                    datasource_name,
                    initialize=False,
                    module_name="great_expectations.datasource",
                    class_name="SqlAlchemyDatasource",
                    data_asset_type={"class_name": "SqlAlchemyDataset"},
                    credentials="${" + datasource_name + "}",
                )
                # TODO this message about continuing may not be accurate
                cli_message(
                    """
We saved datasource {:s} in {:s} and the credentials you entered in {:s}.
Since we could not connect to the database, you can complete troubleshooting in the configuration files documented in the how-to guides here:
<blue>https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_datasources.html?utm_source=cli&utm_medium=init&utm_campaign={:s}#{:s}</blue> .

After you connect to the datasource, run great_expectations init to continue.
""".format(
                        datasource_name,
                        DataContext.GE_YML,
                        context.get_config()["config_variables_file_path"],
                        rtd_url_ge_version,
                        selected_database.value.lower(),
                    )
                )
                return None

    return datasource_name

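# Illustrative sketch (not part of the CLI module above): how the credential
# substitution described in the comments of _add_sqlalchemy_datasource plays out.
# The datasource name "my_postgres_db" and the credential values below are
# hypothetical; the calls (save_config_variable, build_configuration,
# add_datasource) are the same ones used in the function above.
def _credential_substitution_sketch(context):
    credentials = {
        "drivername": "postgresql",
        "host": "localhost",
        "port": "5432",
        "username": "ge_user",
        "password": "***",
        "database": "ge_db",
    }
    # The credentials are written to uncommitted/config_variables.yml under the
    # datasource's name, so they never land in the committed great_expectations.yml.
    context.save_config_variable("my_postgres_db", credentials)
    # great_expectations.yml only stores the reference "${my_postgres_db}", which
    # Great Expectations resolves against config_variables.yml at runtime.
    configuration = SqlAlchemyDatasource.build_configuration(
        credentials="${my_postgres_db}"
    )
    configuration["class_name"] = "SqlAlchemyDatasource"
    configuration["module_name"] = "great_expectations.datasource"
    context.add_datasource(name="my_postgres_db", **configuration)
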
def postgresql_sqlalchemy_datasource(postgresql_engine):
    return SqlAlchemyDatasource(
        "postgresql_sqlalchemy_datasource", engine=postgresql_engine
    )

def basic_sqlalchemy_datasource(sqlitedb_engine):
    return SqlAlchemyDatasource("basic_sqlalchemy_datasource", engine=sqlitedb_engine)

def _add_sqlalchemy_datasource(context):
    if not load_library("sqlalchemy"):
        return None

    db_choices = [str(x) for x in list(range(1, 1 + len(SupportedDatabases)))]
    selected_database = int(
        click.prompt(
            msg_prompt_choose_database,
            type=click.Choice(db_choices),
            show_choices=False,
        )
    ) - 1  # don't show user a zero index list :)
    selected_database = list(SupportedDatabases)[selected_database]

    data_source_name = click.prompt(
        msg_prompt_datasource_name,
        default="my_{}_db".format(selected_database.value.lower()),
        show_default=True,
    )

    credentials = {}
    # Since we don't want to save the database credentials in the config file that will be
    # committed in the repo, we will use our Variable Substitution feature to store the credentials
    # in the credentials file (that will not be committed, since it is in the uncommitted directory)
    # with the datasource's name as the variable name.
    # The value of the datasource's "credentials" key in the config file (great_expectations.yml)
    # will be ${datasource name}.
    # GE will replace the ${datasource name} with the value from the credentials file at runtime.
    while True:
        cli_message(msg_db_config.format(data_source_name))

        if selected_database == SupportedDatabases.MYSQL:
            if not load_library("pymysql"):
                return None
            credentials = _collect_mysql_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.POSTGRES:
            credentials = _collect_postgres_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.REDSHIFT:
            if not load_library("psycopg2"):
                return None
            credentials = _collect_redshift_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.SNOWFLAKE:
            if not load_library(
                "snowflake",
                install_instructions_string="pip install snowflake-sqlalchemy",
            ):
                return None
            credentials = _collect_snowflake_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.OTHER:
            sqlalchemy_url = click.prompt(
                """What is the url/connection string for the sqlalchemy connection?
(reference: https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls)
""",
                show_default=False,
            )
            credentials = {"url": sqlalchemy_url}

        context.save_config_variable(data_source_name, credentials)

        message = """
<red>Cannot connect to the database.</red>
  - Please check your environment and the configuration you provided.
  - Database Error: {0:s}"""
        try:
            configuration = SqlAlchemyDatasource.build_configuration(
                credentials="${" + data_source_name + "}"
            )
            context.add_datasource(
                name=data_source_name,
                class_name='SqlAlchemyDatasource',
                **configuration
            )
            break
        except ModuleNotFoundError as de:
            cli_message(message.format(str(de)))
            return None
        except DatasourceInitializationError as de:
            cli_message(message.format(str(de)))
            if not click.confirm("Enter the credentials again?", default=True):
                context.add_datasource(
                    data_source_name,
                    initialize=False,
                    module_name="great_expectations.datasource",
                    class_name="SqlAlchemyDatasource",
                    data_asset_type={"class_name": "SqlAlchemyDataset"},
                    credentials="${" + data_source_name + "}",
                    generators={"default": {"class_name": "TableGenerator"}},
                )
                cli_message(
                    """
We saved datasource {0:s} in {1:s} and the credentials you entered in {2:s}.
Since we could not connect to the database, you can complete troubleshooting in the configuration files. Read here:
<blue>https://docs.greatexpectations.io/en/latest/tutorials/add-sqlalchemy-datasource.html?utm_source=cli&utm_medium=init&utm_campaign={3:s}#{4:s}</blue> .

After you connect to the datasource, run great_expectations profile to continue.
""".format(
                        data_source_name,
                        DataContext.GE_YML,
                        context.get_project_config().get("config_variables_file_path"),
                        rtd_url_ge_version,
                        selected_database.value.lower(),
                    )
                )
                return None

    return data_source_name