def test_sampling_method__mod(
    sampler_method_name_prefix,
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "batch_identifiers": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
                "sampling_method": f"{sampler_method_name_prefix}sample_using_mod",
                "sampling_kwargs": {
                    "column_name": "id",
                    "mod": 10,
                    "value": 8,
                },
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 12

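# Sanity arithmetic (sketch): the fixture table holds 120 rows with consecutive
# integer ids (an assumption; any 120 consecutive ids give the same count), and
# sample_using_mod keeps rows where id % 10 == 8, i.e. one id per block of ten,
# hence the 12 rows asserted above.
sampled_ids = [i for i in range(120) if i % 10 == 8]
assert len(sampled_ids) == 12  # 8, 18, 28, ..., 118
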
def test_parse_validation_graph_with_bad_metrics_args():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    validator = Validator(execution_engine=engine)
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            execution_engine=engine,
        )

        for metric_configuration in validation_dependencies["metrics"].values():
            validator.build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = validator._parse_validation_graph(
        validation_graph=graph, metrics=("nonexistent", "NONE")
    )
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9

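# Note (sketch of intent): _parse_validation_graph splits the graph's metrics
# into "ready" (all dependencies already resolved) and "needed" (still waiting
# on dependencies). The tuple passed above is deliberately bad input; since
# nothing in it matches a real metric id, the split is the same as for an
# empty metrics dict (see test_parse_validation_graph below): 2 ready, 9 needed.
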
def test_validator_default_expectation_args__pandas(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "batch_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    my_validator = Validator(execution_engine=PandasExecutionEngine(), batches=[batch])

    print(my_validator.get_default_expectation_arguments())

def test_sqlalchemy_source_limit(sqlitedb_engine):
    df1 = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    df2 = pd.DataFrame({"col_1": [0, 1, 2, 3, 4], "col_2": ["b", "c", "d", "e", "f"]})
    df1.to_sql("table_1", con=sqlitedb_engine, index=True)
    df2.to_sql("table_2", con=sqlitedb_engine, index=True, schema="main")
    datasource = SqlAlchemyDatasource("SqlAlchemy", engine=sqlitedb_engine)
    limited_batch = datasource.get_batch({"table": "table_1", "limit": 1, "offset": 2})
    assert isinstance(limited_batch, Batch)
    limited_dataset = Validator(
        limited_batch,
        expectation_suite=ExpectationSuite("test"),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    assert limited_dataset._table.name.startswith(
        "ge_tmp_"
    )  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(10)["col_1"][0] == 3  # offset should have been applied

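# Sanity note: with col_1 == [1, 2, 3, 4, 5], an offset of 2 skips the first
# two rows and the limit of 1 keeps a single row, so the surviving col_1 value
# is 3, which is exactly what the final assertion checks.
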
def test_validator_default_expectation_args__pandas(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    # noinspection PyUnusedLocal
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "b",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "partition_identifiers": {
                            "pipeline_stage_name": 0,
                            "run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    my_validator = Validator(execution_engine=PandasExecutionEngine(), batches=[batch])

    print(my_validator.get_default_expectation_arguments())

def test_sqlalchemy_source_limit(sqlitedb_engine):
    df1 = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    df2 = pd.DataFrame({"col_1": [0, 1, 2, 3, 4], "col_2": ["b", "c", "d", "e", "f"]})
    df1.to_sql("table_1", con=sqlitedb_engine, index=True)
    df2.to_sql("table_2", con=sqlitedb_engine, index=True, schema="main")
    datasource = SqlAlchemyDatasource("SqlAlchemy", engine=sqlitedb_engine)
    limited_batch = datasource.get_batch({"table": "table_1", "limit": 1, "offset": 2})
    assert isinstance(limited_batch, Batch)
    limited_dataset = Validator(
        limited_batch,
        expectation_suite=ExpectationSuite("test"),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    assert limited_dataset._table.name.startswith(
        "ge_tmp_"
    )  # we have generated a temporary table
    assert len(limited_dataset.head(10)) == 1  # and it is only one row long
    assert limited_dataset.head(10)["col_1"][0] == 3  # offset should have been applied

def test_instantiation_via_url_and_retrieve_data_with_other_dialect(sa):
    """Ensure that we can still retrieve data when the dialect is not recognized."""

    # 1. Create engine with sqlite db
    db_file = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    my_execution_engine = SqlAlchemyExecutionEngine(url="sqlite:///" + db_file)
    assert my_execution_engine.connection_string is None
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url[-36:] == "test_cases_for_sql_data_connector.db"

    # 2. Change dialect to one not listed in GESqlDialect
    my_execution_engine.engine.dialect.name = "other_dialect"

    # 3. Get data
    num_rows_in_sample: int = 10
    batch_data, _ = my_execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            table_name="table_partitioned_by_date_column__A",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": num_rows_in_sample},
        )
    )

    # 4. Assert dialect and data are as expected
    assert batch_data.dialect == GESqlDialect.OTHER
    my_execution_engine.load_batch_data("__", batch_data)
    validator = Validator(my_execution_engine)
    assert len(validator.head(fetch_all=True)) == num_rows_in_sample

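# Hedged note: the assertion above relies on GESqlDialect.OTHER being the
# catch-all member for dialect names GE does not recognize. Overwriting
# engine.dialect.name is sufficient here because, as the test demonstrates,
# the batch data derives its dialect from that name rather than from the
# driver class.
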
def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
                "sampling_method": "_sample_using_limit",
                "sampling_kwargs": {"n": 20},
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 20
    assert not validator.expect_column_values_to_be_in_set(
        "date", value_set=["2020-01-02"]
    ).success

def test_parse_validation_graph():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(configuration, engine)

        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = Validator(engine)._parse_validation_graph(
        validation_graph=graph, metrics=dict()
    )
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9

def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "batch_identifiers": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
                "sampling_method": "_sample_using_limit",
                "sampling_kwargs": {"n": 20},
            }
        )
    )

    batch = Batch(data=batch_data)

    validator = Validator(execution_engine, batches=[batch])
    assert len(validator.head(fetch_all=True)) == 20
    assert not validator.expect_column_values_to_be_in_set(
        "date", value_set=["2020-01-02"]
    ).success

def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    selectable, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    df = None
    table_name = getattr(selectable, "name", None)
    if table_name is not None:
        try:
            if metric_value_kwargs["fetch_all"]:
                df = pd.read_sql_table(
                    table_name=table_name,
                    schema=getattr(selectable, "schema", None),
                    con=execution_engine.engine,
                )
            else:
                df = next(
                    pd.read_sql_table(
                        table_name=table_name,
                        schema=getattr(selectable, "schema", None),
                        con=execution_engine.engine,
                        chunksize=metric_value_kwargs["n_rows"],
                    )
                )
        except (ValueError, NotImplementedError):
            # The MetaData reflection used by pd.read_sql_table cannot work on
            # a temp table. If it fails, fall back to pd.read_sql below.
            df = None
        except StopIteration:
            # The table exists but is empty: build an empty frame with the
            # correct columns via the table.columns metric.
            validator = Validator(execution_engine=execution_engine)
            columns = validator.get_metric(
                MetricConfiguration("table.columns", metric_domain_kwargs)
            )
            df = pd.DataFrame(columns=columns)
    if df is None:
        # We want to compile our selectable into literal SQL.
        stmt = sa.select(["*"]).select_from(selectable)
        if not metric_value_kwargs["fetch_all"]:
            stmt = stmt.limit(metric_value_kwargs["n_rows"])
        sql = stmt.compile(
            dialect=execution_engine.engine.dialect,
            compile_kwargs={"literal_binds": True},
        )
        df = pd.read_sql(sql, con=execution_engine.engine)
    return df

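# Hedged usage sketch: the provider above backs the "table.head" metric, so
# callers typically resolve it through a Validator rather than invoking
# _sqlalchemy directly. The helper name fetch_head and the batch_id "my_id"
# are assumed, illustrative names; compare _self_check_fetch_batch further
# down, which uses the same MetricConfiguration shape.
def fetch_head(execution_engine, batch_id="my_id"):
    validator = Validator(execution_engine=execution_engine)
    return validator.get_metric(
        MetricConfiguration(
            "table.head",
            {"batch_id": batch_id},
            {"n_rows": 5, "fetch_all": False},
        )
    )
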
def test_sqlalchemy_datasource_processes_dataset_options(test_db_connection_string):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy", credentials={"url": test_db_connection_string}
    )
    batch_kwargs = datasource.process_batch_parameters(
        dataset_options={"caching": False}
    )
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False

    batch_kwargs = datasource.process_batch_parameters(
        dataset_options={"caching": True}
    )
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is True

    batch_kwargs = {
        "query": "select * from table_1;",
        "dataset_options": {"caching": False},
    }
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False

def _self_check_fetch_batch(
    self,
    pretty_print: bool,
    example_data_reference: Any,
    data_asset_name: str,
):
    """
    Helper function for self_check() to retrieve a batch using
    example_data_reference and data_asset_name, while printing helpful
    messages. The first 5 rows of batch_data are printed by default.

    Args:
        pretty_print (bool): print to console?
        example_data_reference (Any): data_reference to retrieve
        data_asset_name (str): data_asset_name to retrieve
    """
    if pretty_print:
        print("\n\t\tFetching batch data...")

    batch_definition_list = self._map_data_reference_to_batch_definition_list(
        data_reference=example_data_reference,
        data_asset_name=data_asset_name,
    )
    assert len(batch_definition_list) == 1

    batch_definition = batch_definition_list[0]

    # _execution_engine might be None for some tests
    if batch_definition is None or self._execution_engine is None:
        return {}

    batch_data, batch_spec, _ = self.get_batch_data_and_metadata(
        batch_definition=batch_definition
    )

    # Note: get_batch_data_and_metadata will have loaded the data into the
    # currently-defined execution engine. Consequently, when we build a
    # Validator, we do not need to specifically load the batch into it to
    # resolve metrics.
    validator = Validator(execution_engine=batch_data.execution_engine)
    df = validator.get_metric(
        MetricConfiguration(
            "table.head", {"batch_id": batch_definition.id}, {"n_rows": 5}
        )
    )
    n_rows = validator.get_metric(
        MetricConfiguration("table.row_count", {"batch_id": batch_definition.id})
    )

    if pretty_print and df is not None:
        print("\n\t\tShowing 5 rows")
        print(df)

    return {
        "batch_spec": batch_spec,
        "n_rows": n_rows,
    }

def test_validator_progress_bar_config_disabled(
    mock_tqdm, mock_validation_graph, mock_data_context
):
    data_context = mock_data_context()
    data_context.progress_bars = ProgressBarsConfig(metric_calculations=False)
    engine = PandasExecutionEngine()
    validator = Validator(engine, data_context=data_context)

    # ValidationGraph is a complex object; the mocked edge count must be at
    # least 3, otherwise resolve_validation_graph disables tqdm on its own.
    mock_validation_graph.edges.__len__ = lambda _: 3
    validator.resolve_validation_graph(mock_validation_graph, {})

    # tqdm is still constructed, but the config flips its disable flag on.
    assert mock_tqdm.called is True
    assert mock_tqdm.call_args[1]["disable"] is True

def test_validator_progress_bar_config_enabled(
    mock_tqdm, mock_validation_graph, mock_data_context
):
    data_context = mock_data_context()
    engine = PandasExecutionEngine()
    validator = Validator(engine, data_context=data_context)

    # ValidationGraph is a complex object; the mocked edge count must be at
    # least 3, otherwise resolve_validation_graph disables tqdm on its own.
    mock_validation_graph.edges.__len__ = lambda _: 3
    validator.resolve_validation_graph(mock_validation_graph, {})

    # With the default config, tqdm is invoked with disable=False.
    assert mock_tqdm.called is True
    assert mock_tqdm.call_args[1]["disable"] is False

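# Taken together, the two progress-bar tests above pin down the contract:
# tqdm is always constructed, and ProgressBarsConfig(metric_calculations=False)
# only flips its disable flag rather than suppressing the call entirely.
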
def test_sqlalchemy_source_templating(sqlitedb_engine):
    datasource = SqlAlchemyDatasource(
        engine=sqlitedb_engine,
        generators={"foo": {"class_name": "QueryBatchKwargsGenerator"}},
    )
    generator = datasource.get_generator("foo")
    generator.add_query("test", "select 'cat' as ${col_name};")
    batch = datasource.get_batch(
        generator.build_batch_kwargs(
            "test", query_parameters={"col_name": "animal_name"}
        )
    )
    dataset = Validator(
        batch,
        expectation_suite=ExpectationSuite("test"),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()
    res = dataset.expect_column_to_exist("animal_name")
    assert res.success is True
    res = dataset.expect_column_values_to_be_in_set("animal_name", ["cat"])
    assert res.success is True

def test_pandas_datasource_processes_dataset_options(test_folder_connection_path):
    datasource = SparkDFDatasource(
        "PandasCSV",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False

def test_sa_expect_column_value_z_scores_to_be_less_than_impl(postgresql_engine):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    df.to_sql(
        name="z_score_test_data",
        con=postgresql_engine,
        index=False,
        if_exists="replace",
    )
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectationConfiguration)
    engine = SqlAlchemyExecutionEngine(engine=postgresql_engine)
    engine.load_batch_data(
        "my_id",
        SqlAlchemyBatchData(execution_engine=engine, table_name="z_score_test_data"),
    )
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(
        success=True,
    )

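# A quick pandas sanity check (sketch) of why the expectation above succeeds:
# the largest |z-score| in [1, 5, 22, 3, 5, 10] is roughly 1.9 (sample std)
# or 2.1 (population std), comfortably under the threshold of 4, so the
# double-sided check passes even with mostly=0.9.
s = pd.Series([1, 5, 22, 3, 5, 10])
z = (s - s.mean()) / s.std()  # ddof=1; use s.std(ddof=0) for population std
assert (z.abs() < 4).mean() >= 0.9
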
def test_pandas_specify_not_include_unexpected_rows(
    dataframe_for_unexpected_rows, expected_evr_without_unexpected_rows
):
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "a",
            "value_set": [1, 5, 22],
            "result_format": {
                "result_format": "COMPLETE",
                "include_unexpected_rows": False,
            },
        },
    )
    expectation = ExpectColumnValuesToBeInSet(expectationConfiguration)
    batch: Batch = Batch(data=dataframe_for_unexpected_rows)
    engine = PandasExecutionEngine()
    validator = Validator(
        execution_engine=engine,
        batches=[batch],
    )
    result = expectation.validate(validator)
    assert result.result == expected_evr_without_unexpected_rows.result

def test_spark_expect_column_value_z_scores_to_be_less_than_impl(
    spark_session, basic_spark_df_execution_engine
):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    spark = get_or_create_spark_application(
        spark_config={
            "spark.sql.catalogImplementation": "hive",
            "spark.executor.memory": "450m",
            # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
        }
    )
    df = spark.createDataFrame(df)
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectationConfiguration)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="my_id", batch_data=df)
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(
        success=True,
    )

def test_graph_validate_with_bad_config(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        RuntimeBatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "runtime_parameters": {
                    "batch_data": df,
                },
                "batch_identifiers": {
                    "pipeline_stage_name": 0,
                    "airflow_run_id": 0,
                    "custom_key_0": 0,
                },
            }
        )
    )
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={"column": "not_in_table", "min_value": 1, "max_value": 29},
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError) as eee:
        # noinspection PyUnusedLocal
        result = Validator(
            execution_engine=PandasExecutionEngine(), batches=[batch]
        ).graph_validate(configurations=[expectation_configuration])
    assert (
        str(eee.value)
        == 'Error: The column "not_in_table" in BatchData does not exist.'
    )

def test_populate_dependencies():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectationConfiguration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectationConfiguration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph,
                metric_configuration,
                configuration,
                execution_engine=engine,
            )
    assert len(graph.edges) == 10

def test_graph_validate_with_bad_config(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={"column": "not_in_table", "min_value": 1, "max_value": 29},
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnMaxToBeBetween(expectationConfiguration)
    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "partition_identifiers": {
                            "pipeline_stage_name": 0,
                            "run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )
    try:
        result = Validator(
            execution_engine=PandasExecutionEngine(), batches=[batch]
        ).graph_validate(configurations=[expectationConfiguration])
    except KeyError as e:
        result = e
    assert isinstance(result, KeyError)

def ge_validator_sqlalchemy() -> Validator:
    validator = Validator(
        execution_engine=SqlAlchemyExecutionEngine(
            connection_string="postgresql://localhost:5432/test"
        ),
        batches=[
            Batch(
                data=None,
                batch_request=BatchRequest(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                ),
                batch_definition=BatchDefinition(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                    batch_identifiers=IDDict(),
                ),
                batch_spec=SqlAlchemyDatasourceBatchSpec(
                    {
                        "data_asset_name": "foo2",
                        "table_name": "foo2",
                        "batch_identifiers": {},
                        "schema_name": "public",
                        "type": "table",
                    }
                ),
            )
        ],
    )
    return validator

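# Hypothetical usage (sketch): assuming the factory above is registered as a
# pytest fixture elsewhere, a test can consume the pre-wired Validator without
# touching a live database, since the Batch carries data=None. The test name
# below is illustrative only.
def test_validator_is_wired(ge_validator_sqlalchemy):
    validator = ge_validator_sqlalchemy
    assert len(validator.batches) == 1
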
def test_graph_validate_with_runtime_config(basic_datasource):
    df = pd.DataFrame(
        {"a": [1, 5, 22, 3, 5, 10, 2, 3], "b": [97, 332, 3, 4, 5, 6, 7, None]}
    )

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "batch_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={"column": "b", "mostly": 1, "threshold": 2, "double_sided": True},
    )
    try:
        result = Validator(
            execution_engine=PandasExecutionEngine(), batches=(batch,)
        ).graph_validate(
            configurations=[expectation_configuration],
            runtime_configuration={"result_format": "COMPLETE"},
        )
    except AssertionError as e:
        result = e
    assert result == [
        ExpectationValidationResult(
            success=False,
            meta={},
            result={
                "element_count": 8,
                "unexpected_count": 1,
                "unexpected_percent": 12.5,
                "partial_unexpected_list": [332.0],
                "missing_count": 1,
                "missing_percent": 12.5,
                "unexpected_percent_nonmissing": 14.285714285714285,
                "partial_unexpected_index_list": None,
                "partial_unexpected_counts": [{"value": 332.0, "count": 1}],
                "unexpected_list": [332.0],
                "unexpected_index_list": None,
            },
            expectation_config=None,
            exception_info=None,
        )
    ]

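# Sanity arithmetic for the expected result above: column "b" has 8 elements,
# one of which is missing (None), so missing_percent = 1/8 = 12.5%. Of the 7
# non-missing values, only 332.0 has |z-score| >= 2, giving
# unexpected_percent = 1/8 = 12.5% and
# unexpected_percent_nonmissing = 1/7 = 14.285714285714285%.
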
def test_pandas_datasource_processes_dataset_options(
    test_folder_connection_path, test_backends
):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False

def test_to_make_sure_splitter_and_sampler_methods_are_optional(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {},
                "sampling_method": "_sample_using_mod",
                "sampling_kwargs": {
                    "column_name": "id",
                    "mod": 10,
                    "value": 8,
                },
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 12

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {},
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 120

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 120

def _run_suite(
    self,
    dataset_name: str,
    dataset_path: Optional[str],
    df: Any,
    target_expectation_suite_name: str,
    run_id: str,
):
    target_suite = self.expectation_context.get_expectation_suite(
        target_expectation_suite_name
    )
    batch_markers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    batch_kwargs = {"datasource": generate_datasource_name(dataset_name)}
    if dataset_path:
        data_asset_name, _ = os.path.splitext(os.path.basename(dataset_path))
        batch_kwargs["path"] = str(dataset_path)
        batch_kwargs["data_asset_name"] = data_asset_name

    batch = Batch(
        "kedro",
        batch_kwargs=BatchKwargs(batch_kwargs),
        data=df,
        batch_parameters=None,
        batch_markers=batch_markers,
        data_context=self.expectation_context,
    )

    try:
        v = Validator(
            batch=batch,
            expectation_suite=target_suite,
        )
    except ValueError:
        raise UnsupportedDataSet

    validator_dataset_batch = v.get_dataset()

    return self.expectation_context.run_validation_operator(
        "action_list_operator", [validator_dataset_batch], run_id=run_id
    )

def test_sampling_method__a_list(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
                "sampling_method": "_sample_using_a_list",
                "sampling_kwargs": {
                    "column_name": "id",
                    "value_list": [10, 20, 30, 40],
                },
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 4

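# Hedged note: _sample_using_a_list amounts to an IN-list filter on the named
# column, so the batch above corresponds to SQL along the lines of
#   SELECT * FROM table_partitioned_by_date_column__A WHERE id IN (10, 20, 30, 40);
# which is why exactly 4 rows survive.
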
def test_graph_validate(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "batch_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "b",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    result = Validator(
        execution_engine=PandasExecutionEngine(), batches=[batch]
    ).graph_validate(configurations=[expectation_configuration])
    assert result == [
        ExpectationValidationResult(
            success=True,
            expectation_config=None,
            meta={},
            result={
                "element_count": 6,
                "unexpected_count": 0,
                "unexpected_percent": 0.0,
                "partial_unexpected_list": [],
                "missing_count": 1,
                "missing_percent": 16.666666666666664,
                "unexpected_percent_nonmissing": 0.0,
            },
            exception_info=None,
        )
    ]

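# Sanity arithmetic for the expected result above: column "b" has 6 elements,
# one of which is None, so missing_percent = 1/6 = 16.666...%. All 5 non-missing
# values sit well inside |z| < 4, so unexpected_count is 0 and the expectation
# succeeds with mostly=0.9.
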