import os
import uuid

import click
import pandas as pd
import pytest

from great_expectations.core.batch import Batch
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.data_context import DataContext
from great_expectations.dataset import SqlAlchemyDataset
from great_expectations.datasource import (
    PandasDatasource,
    SparkDFDatasource,
    SqlAlchemyDatasource,
)
from great_expectations.datasource.batch_kwargs_generator import (
    TableBatchKwargsGenerator,
)
from great_expectations.validator.validator import BridgeValidator


def test_spark_datasource_processes_dataset_options(
    test_folder_connection_path_csv, test_backends, empty_data_context
):
    context: DataContext = empty_data_context
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", data_asset_name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(
        batch, ExpectationSuite(expectation_suite_name="foo", data_context=context)
    )
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False


def test_sqlalchemy_source_limit(sqlitedb_engine, empty_data_context):
    context: DataContext = empty_data_context
    df1 = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    df2 = pd.DataFrame({"col_1": [0, 1, 2, 3, 4], "col_2": ["b", "c", "d", "e", "f"]})
    df1.to_sql(name="table_1", con=sqlitedb_engine, index=True)
    df2.to_sql(name="table_2", con=sqlitedb_engine, index=True, schema="main")
    datasource = SqlAlchemyDatasource("SqlAlchemy", engine=sqlitedb_engine)
    limited_batch = datasource.get_batch({"table": "table_1", "limit": 1, "offset": 2})
    assert isinstance(limited_batch, Batch)
    limited_dataset = BridgeValidator(
        limited_batch,
        expectation_suite=ExpectationSuite("test", data_context=context),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    assert limited_dataset._table.name.startswith(
        "ge_temp_"
    )  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(10)["col_1"][0] == 3  # offset should have been applied


def test_sqlalchemy_source_templating(sqlitedb_engine, empty_data_context):
    context: DataContext = empty_data_context
    datasource = SqlAlchemyDatasource(
        engine=sqlitedb_engine,
        batch_kwargs_generators={"foo": {"class_name": "QueryBatchKwargsGenerator"}},
    )
    generator = datasource.get_batch_kwargs_generator("foo")
    generator.add_query(data_asset_name="test", query="select 'cat' as ${col_name};")
    batch = datasource.get_batch(
        generator.build_batch_kwargs(
            "test", query_parameters={"col_name": "animal_name"}
        )
    )
    dataset = BridgeValidator(
        batch,
        expectation_suite=ExpectationSuite("test", data_context=context),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    res = dataset.expect_column_to_exist("animal_name")
    assert res.success is True
    res = dataset.expect_column_values_to_be_in_set("animal_name", ["cat"])
    assert res.success is True


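# Both SQLAlchemy tests above assume a `sqlitedb_engine` fixture. A minimal
# sketch of such a fixture -- a fresh in-memory SQLite engine per test -- is
# given below; the project's actual conftest fixture may differ (e.g. it may
# skip on non-sqlite backends).
@pytest.fixture
def sqlitedb_engine():
    import sqlalchemy as sa

    # SQLAlchemy reuses a single connection for in-memory SQLite, so tables
    # created by the test remain visible for the duration of the test.
    return sa.create_engine("sqlite://")

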
def test_sqlalchemy_datasource_processes_dataset_options(test_db_connection_string):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy", credentials={"url": test_db_connection_string}
    )
    # Case 1: dataset_options passed through process_batch_parameters
    batch_kwargs = datasource.process_batch_parameters(dataset_options={"caching": False})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False

    # Case 2: same path, but with caching enabled
    batch_kwargs = datasource.process_batch_parameters(dataset_options={"caching": True})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is True

    # Case 3: dataset_options supplied directly in batch_kwargs
    batch_kwargs = {
        "query": "select * from table_1;",
        "dataset_options": {"caching": False},
    }
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False


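# `test_db_connection_string` is assumed to point at a database that already
# contains table_1. A minimal sketch of such a fixture, using a file-backed
# SQLite database (the real conftest fixture may differ):
@pytest.fixture
def test_db_connection_string(tmp_path_factory):
    import sqlalchemy as sa

    df1 = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    basepath = str(tmp_path_factory.mktemp("db_context"))
    path = os.path.join(basepath, "test.db")
    engine = sa.create_engine("sqlite:///" + path)
    df1.to_sql(name="table_1", con=engine, index=True)
    # Hand back a connection string to the newly created database.
    return "sqlite:///" + path

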
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path_csv):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", data_asset_name="test")
    batch_kwargs["dataset_options"] = {"caching": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False


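# The Spark and Pandas tests above assume a `test_folder_connection_path_csv`
# fixture: a directory containing a CSV named after the "test" data asset so
# that SubdirReaderBatchKwargsGenerator can discover it. A minimal sketch
# under that assumption (the real conftest fixture may differ):
@pytest.fixture
def test_folder_connection_path_csv(tmp_path_factory):
    df = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    path = str(tmp_path_factory.mktemp("test_folder_connection_path_csv"))
    df.to_csv(os.path.join(path, "test.csv"), index=False)
    return path

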
def _get_batch_kwargs_for_sqlalchemy_datasource(
    context, datasource_name, additional_batch_kwargs=None
):
    data_asset_name = None
    sql_query = None
    datasource = context.get_datasource(datasource_name)
    msg_prompt_how_to_connect_to_data = """
You have selected a datasource that is a SQL database. How would you like to specify the data?
1. Enter a table name and schema
2. Enter a custom SQL query
3. List all tables in the database (this may take a very long time)
"""
    default_schema = _get_default_schema(datasource)
    temp_generator = TableBatchKwargsGenerator(name="temp", datasource=datasource)

    while data_asset_name is None:
        single_or_multiple_data_asset_selection = click.prompt(
            msg_prompt_how_to_connect_to_data,
            type=click.Choice(["1", "2", "3"]),
            show_choices=False,
        )
        if single_or_multiple_data_asset_selection == "1":  # name the table and schema
            schema_name = click.prompt(
                "Please provide the schema name of the table (this is optional)",
                default=default_schema,
            )
            table_name = click.prompt("Please provide the table name (this is required)")
            data_asset_name = f"{schema_name}.{table_name}"

        elif single_or_multiple_data_asset_selection == "2":  # SQL query
            sql_query = click.prompt("Please provide the SQL query")
            data_asset_name = "custom_sql_query"

        elif single_or_multiple_data_asset_selection == "3":  # list it all
            msg_prompt_warning = """Warning: If you have a large number of tables in your datasource, this may take a very long time.
Would you like to continue?"""
            confirmation = click.prompt(
                msg_prompt_warning, type=click.Choice(["y", "n"]), show_choices=True
            )
            if confirmation == "y":
                # avoid this call until necessary
                available_data_asset_names = temp_generator.get_available_data_asset_names()[
                    "names"
                ]
                available_data_asset_names_str = [
                    "{} ({})".format(name[0], name[1])
                    for name in available_data_asset_names
                ]
                data_asset_names_to_display = available_data_asset_names_str
                choices = "\n".join(
                    [
                        "    {}. {}".format(i, name)
                        for i, name in enumerate(data_asset_names_to_display, 1)
                    ]
                )
                msg_prompt_enter_data_asset_name = (
                    "\nWhich table would you like to use? (Choose one)\n"
                )
                prompt = msg_prompt_enter_data_asset_name + choices + os.linesep
                selection = click.prompt(prompt, show_default=False)
                selection = selection.strip()
                try:
                    data_asset_index = int(selection) - 1
                    try:
                        data_asset_name = [
                            name[0] for name in available_data_asset_names
                        ][data_asset_index]
                    except IndexError:
                        print(
                            f"You have specified {selection}, which is an incorrect index"
                        )
                except ValueError:
                    print(
                        f"You have specified {selection}, which is an incorrect value"
                    )

    if additional_batch_kwargs is None:
        additional_batch_kwargs = {}

    # Some backends require named temporary table parameters. We specifically elicit
    # those and add them where appropriate.
    temp_table_kwargs = {}
    datasource = context.get_datasource(datasource_name)

    if datasource.engine.dialect.name.lower() == "bigquery":
        # bigquery also requires special handling
        bigquery_temp_table = click.prompt(
            "Great Expectations will create a table to use for validation."
            + os.linesep
            + "Please enter a name for this table: ",
            default="SOME_PROJECT.SOME_DATASET.ge_tmp_" + str(uuid.uuid4())[:8],
        )
        temp_table_kwargs = {
            "bigquery_temp_table": bigquery_temp_table,
        }

    # now building the actual batch_kwargs
    if sql_query is None:
        batch_kwargs = temp_generator.build_batch_kwargs(
            data_asset_name, **additional_batch_kwargs
        )
        batch_kwargs.update(temp_table_kwargs)
    else:
        batch_kwargs = {"query": sql_query, "datasource": datasource_name}
        batch_kwargs.update(temp_table_kwargs)
        # Validate the custom query by building a throwaway dataset from it.
        BridgeValidator(
            batch=datasource.get_batch(batch_kwargs),
            expectation_suite=ExpectationSuite("throwaway"),
        ).get_dataset()

    batch_kwargs["data_asset_name"] = data_asset_name
    return data_asset_name, batch_kwargs
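

# A hypothetical invocation of the helper above from CLI code that already
# holds a DataContext; the datasource name "my_sqlite_db" is illustrative
# only:
#
#     data_asset_name, batch_kwargs = _get_batch_kwargs_for_sqlalchemy_datasource(
#         context, "my_sqlite_db"
#     )
#     batch = context.get_datasource("my_sqlite_db").get_batch(batch_kwargs)
#
# The helper prompts interactively (table/schema, custom SQL, or a listing of
# all tables) and returns batch_kwargs ready for get_batch().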