def test_validator_default_expectation_args__pandas(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "batch_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    my_validator = Validator(execution_engine=PandasExecutionEngine(), batches=[batch])

    print(my_validator.get_default_expectation_arguments())

def test_graph_validate_with_runtime_config(basic_datasource):
    df = pd.DataFrame(
        {"a": [1, 5, 22, 3, 5, 10, 2, 3], "b": [97, 332, 3, 4, 5, 6, 7, None]}
    )

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "batch_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={"column": "b", "mostly": 1, "threshold": 2, "double_sided": True},
    )
    try:
        result = Validator(
            execution_engine=PandasExecutionEngine(), batches=(batch,)
        ).graph_validate(
            configurations=[expectation_configuration],
            runtime_configuration={"result_format": "COMPLETE"},
        )
    except AssertionError as e:
        result = e
    assert result == [
        ExpectationValidationResult(
            success=False,
            meta={},
            result={
                "element_count": 8,
                "unexpected_count": 1,
                "unexpected_percent": 12.5,
                "partial_unexpected_list": [332.0],
                "missing_count": 1,
                "missing_percent": 12.5,
                "unexpected_percent_nonmissing": 14.285714285714285,
                "partial_unexpected_index_list": None,
                "partial_unexpected_counts": [{"value": 332.0, "count": 1}],
                "unexpected_list": [332.0],
                "unexpected_index_list": None,
            },
            expectation_config=None,
            exception_info=None,
        )
    ]

def test_convert_batch_request_to_data_reference_string_using_regex():
    # All regex groups are captured: should return a fully resolved data reference
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    batch_request = BatchRequest(
        partition_request=PartitionRequest(
            **{"name": "alex", "timestamp": "20200809", "price": "1000"}
        )
    )
    assert (
        convert_batch_request_to_data_reference_string_using_regex(
            batch_request=batch_request, regex_pattern=pattern, group_names=group_names
        )
        == "alex_20200809_1000.csv"
    )

    # Test an example with an uncaptured regex group (should return a WildcardDataReference)
    pattern = r"^(.+)_(\d+)_\d+\.csv$"
    group_names = ["name", "timestamp"]
    batch_request = BatchRequest(
        partition_request=PartitionRequest(
            **{"name": "alex", "timestamp": "20200809", "price": "1000"}
        )
    )
    assert (
        convert_batch_request_to_data_reference_string_using_regex(
            batch_request=batch_request, regex_pattern=pattern, group_names=group_names
        )
        == "alex_20200809_*.csv"
    )

    # Test an example with an uncaptured regex group (should return a WildcardDataReference)
    pattern = r"^.+_(\d+)_(\d+)\.csv$"
    group_names = ["timestamp", "price"]
    batch_request = BatchRequest(
        partition_request=PartitionRequest(
            **{"name": "alex", "timestamp": "20200809", "price": "1000"}
        )
    )
    assert (
        convert_batch_request_to_data_reference_string_using_regex(
            batch_request=batch_request, regex_pattern=pattern, group_names=group_names
        )
        == "*_20200809_1000.csv"
    )

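# A minimal sketch of how such a conversion might work -- an illustration
# only, not the library's implementation; the helper name
# `_convert_to_data_reference_sketch` is hypothetical. The idea: fill each
# capturing group with its identifier value, then collapse any leftover,
# unresolved regex syntax into a "*" wildcard (the WildcardDataReference case
# exercised by the second and third assertions above).
def _convert_to_data_reference_sketch(
    identifiers: dict, regex_pattern: str, group_names: list
) -> str:
    import re

    result = regex_pattern
    # Substitute the capturing groups left to right with identifier values.
    for name in group_names:
        result = re.sub(r"\([^()]*\)", identifiers[name], result, count=1)
    # Drop anchors and unescape literal dots.
    result = result.strip("^$").replace("\\.", ".")
    # Remaining regex fragments (uncaptured "\d+", ".+", or groups) become "*".
    return re.sub(r"\\d\+|\(.*?\)|\.\+", "*", result)


# E.g. with the second pattern above:
# _convert_to_data_reference_sketch(
#     {"name": "alex", "timestamp": "20200809"},
#     r"^(.+)_(\d+)_\d+\.csv$",
#     ["name", "timestamp"],
# ) == "alex_20200809_*.csv"
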
def test_graph_validate(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "batch_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "b",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    result = Validator(
        execution_engine=PandasExecutionEngine(), batches=[batch]
    ).graph_validate(configurations=[expectation_configuration])
    assert result == [
        ExpectationValidationResult(
            success=True,
            expectation_config=None,
            meta={},
            result={
                "element_count": 6,
                "unexpected_count": 0,
                "unexpected_percent": 0.0,
                "partial_unexpected_list": [],
                "missing_count": 1,
                "missing_percent": 16.666666666666664,
                "unexpected_percent_nonmissing": 0.0,
            },
            exception_info=None,
        )
    ]

def test_graph_validate_with_bad_config(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "batch_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={"column": "not_in_table", "min_value": 1, "max_value": 29},
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError) as eee:
        # noinspection PyUnusedLocal
        result = Validator(
            execution_engine=PandasExecutionEngine(), batches=[batch]
        ).graph_validate(configurations=[expectation_configuration])
    assert (
        str(eee.value)
        == 'Error: The column "not_in_table" in BatchData does not exist.'
    )

def test_alpha(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_alpha"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_alpha/A.csv",
            "test_dir_alpha/B.csv",
            "test_dir_alpha/C.csv",
            "test_dir_alpha/D.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetFilesystemDataConnector
base_directory: {base_directory}/test_dir_alpha
assets:
    A:
        glob_directive: "*.csv"
default_regex:
    pattern: (.+)\\.csv
    group_names:
        - part_1
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="B",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        partition_request=PartitionRequest(**{"batch_identifiers": {"part_1": "B"}}),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1

def test_return_all_batch_definitions_sorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_sorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: test_environment
#execution_engine:
#    class_name: PandasExecutionEngine
base_directory: {base_directory}
glob_directive: "*.csv"
assets:
    TestFiles:
default_regex:
    pattern: (.+)_(.+)_(.+)\\.csv
    group_names:
        - name
        - timestamp
        - price
sorters:
    - orderby: asc
      class_name: LexicographicSorter
      name: name
    - datetime_format: "%Y%m%d"
      orderby: desc
      class_name: DateTimeSorter
      name: timestamp
    - orderby: desc
      class_name: NumericSorter
      name: price
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()
    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "batch_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition(
            **{
                "name": "james",
                "timestamp": "20200713",
                "price": "1567",
            }
        ),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without a partition request, should return all 10 batch definitions
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10

# moto's S3 mock is assumed here (from moto import mock_s3) so that the
# boto3 calls below hit a local fake rather than real AWS.
@mock_s3
def test_alpha():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "test_dir_alpha/A.csv",
        "test_dir_alpha/B.csv",
        "test_dir_alpha/C.csv",
        "test_dir_alpha/D.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetS3DataConnector
bucket: {bucket}
prefix: test_dir_alpha
assets:
    A:
default_regex:
    pattern: .*(.+)\\.csv
    group_names:
        - part_1
""",
    )

    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "datasource_name": "BASE",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetS3DataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_s3_data_connector",
        data_asset_name="B",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_s3_data_connector",
        data_asset_name="A",
        partition_request=PartitionRequest(
            **{"partition_identifiers": {"part_1": "B"}}
        ),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1

# As above, moto's S3 mock is assumed so the bucket is created locally.
@mock_s3
def test_return_all_batch_definitions_sorted():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetS3DataConnector
datasource_name: test_environment
#execution_engine:
#    class_name: PandasExecutionEngine
bucket: {bucket}
prefix: ""
assets:
    TestFiles:
default_regex:
    pattern: (.+)_(.+)_(.+)\\.csv
    group_names:
        - name
        - timestamp
        - price
sorters:
    - orderby: asc
      class_name: LexicographicSorter
      name: name
    - datetime_format: "%Y%m%d"
      orderby: desc
      class_name: DateTimeSorter
      name: timestamp
    - orderby: desc
      class_name: NumericSorter
      name: price
""",
    )

    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "datasource_name": "test_environment",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    self_check_report = my_data_connector.self_check()
    assert self_check_report["class_name"] == "ConfiguredAssetS3DataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_s3_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "partition_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_s3_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition(
            **{"name": "james", "timestamp": "20200713", "price": "1567"}
        ),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without a partition request, should return all 10 batch definitions
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_s3_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10

def get_batch_list(
    self,
    datasource_name: str = None,
    data_connector_name: str = None,
    data_asset_name: str = None,
    *,
    batch_request: BatchRequest = None,
    partition_request: Union[PartitionRequest, dict] = None,
    partition_identifiers: dict = None,
    limit: int = None,
    index=None,
    custom_filter_function: Callable = None,
    batch_spec_passthrough: Optional[dict] = None,
    sampling_method: str = None,
    sampling_kwargs: dict = None,
    splitter_method: str = None,
    splitter_kwargs: dict = None,
    **kwargs,
) -> List[Batch]:
    """Get a list of zero or more batches, based on a variety of flexible input types.

    Args:
        datasource_name
        data_connector_name
        data_asset_name
        batch_request
        partition_request
        partition_identifiers
        limit
        index
        custom_filter_function
        sampling_method
        sampling_kwargs
        splitter_method
        splitter_kwargs
        batch_spec_passthrough
        **kwargs

    Returns:
        (List[Batch]) The list of requested Batch instances

    `get_batch_list` is the main user-facing API for getting lists of batches.
    In contrast to virtually all other methods in the class, it does not require
    typed or nested inputs. Instead, this method is intended to help the user
    pick the right parameters.

    This method attempts to return any number of batches, including an empty list.
    """
    datasource_name: str
    if batch_request:
        if not isinstance(batch_request, BatchRequest):
            raise TypeError(
                f"batch_request must be an instance of BatchRequest object, not {type(batch_request)}"
            )
        datasource_name = batch_request.datasource_name

    datasource: Datasource = self.datasources[datasource_name]

    if batch_request:
        # TODO: Raise a warning if any parameters besides batch_request are specified
        return datasource.get_batch_list_from_batch_request(
            batch_request=batch_request
        )
    else:
        partition_request: PartitionRequest
        if partition_request is None:
            if partition_identifiers is None:
                partition_identifiers = kwargs
            else:
                # Raise a warning if kwargs exist
                pass

            # Currently, the implementation of splitting and sampling is inconsistent between the
            # Datasource and SimpleSqlalchemyDatasource classes. The former communicates these
            # directives to the underlying ExecutionEngine objects via "batch_spec_passthrough",
            # which ultimately gets merged with "batch_spec" and processed by the configured
            # ExecutionEngine object. However, SimpleSqlalchemyDatasource uses "PartitionRequest"
            # to relay the splitting and sampling directives to the SqlAlchemyExecutionEngine
            # object. The problem with this is that if the querying of partitions is implemented
            # using the PartitionQuery class, it will not recognize the keys representing the
            # splitting and sampling directives and will raise an exception. Additional work is
            # needed to decouple the directives that go into PartitionQuery from the other
            # PartitionRequest directives.
            partition_request_params: dict = {
                "partition_identifiers": partition_identifiers,
                "limit": limit,
                "index": index,
                "custom_filter_function": custom_filter_function,
            }
            if sampling_method is not None:
                sampling_params: dict = {
                    "sampling_method": sampling_method,
                }
                if sampling_kwargs is not None:
                    sampling_params["sampling_kwargs"] = sampling_kwargs
                partition_request_params.update(sampling_params)
            if splitter_method is not None:
                splitter_params: dict = {
                    "splitter_method": splitter_method,
                }
                if splitter_kwargs is not None:
                    splitter_params["splitter_kwargs"] = splitter_kwargs
                partition_request_params.update(splitter_params)
            partition_request = PartitionRequest(partition_request_params)
        else:
            # Raise a warning if partition_identifiers or kwargs exist
            partition_request = PartitionRequest(partition_request)

        batch_request: BatchRequest = BatchRequest(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            partition_request=partition_request,
            batch_spec_passthrough=batch_spec_passthrough,
        )
        return datasource.get_batch_list_from_batch_request(
            batch_request=batch_request
        )

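# A minimal usage sketch for `get_batch_list` -- hypothetical names throughout:
# "context" is assumed to be a configured DataContext, and the datasource,
# data connector, and data asset names are taken from the tests below as
# placeholders. It illustrates the three calling conventions the method accepts.

# 1. Fully typed: pass a BatchRequest; other parameters are ignored (see the
#    TODO in the method body above).
batch_list = context.get_batch_list(
    batch_request=BatchRequest(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        partition_request=PartitionRequest(
            partition_identifiers={"date": "2020-01-15"}
        ),
    )
)

# 2. Loose keyword parameters: assembled into a BatchRequest internally.
batch_list = context.get_batch_list(
    datasource_name="my_sqlite_db",
    data_connector_name="daily",
    data_asset_name="table_partitioned_by_date_column__A",
    partition_identifiers={"date": "2020-01-15"},
)

# 3. Bare positional names plus **kwargs: leftover kwargs are treated as
#    partition_identifiers.
batch_list = context.get_batch_list(
    "my_sqlite_db", "daily", "table_partitioned_by_date_column__A", date="2020-01-15"
)
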
def test_get_validator_expectation_suite_options(
    data_context_with_sql_datasource_for_testing_get_batch,
):
    context = data_context_with_sql_datasource_for_testing_get_batch
    context.create_expectation_suite("some_expectations")

    # Successful specification with an existing expectation_suite_name
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
        expectation_suite_name="some_expectations",
    )

    # Successful specification with a fetched ExpectationSuite object
    some_expectations = context.get_expectation_suite("some_expectations")
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
        expectation_suite=some_expectations,
    )

    # Successful specification with a fresh ExpectationSuite object
    some_more_expectations = context.create_expectation_suite(
        expectation_suite_name="some_more_expectations"
    )
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
        expectation_suite=some_more_expectations,
    )

    # Successful specification using create_expectation_suite_with_name
    context.get_validator(
        batch_request=BatchRequest(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            partition_request=PartitionRequest(
                partition_identifiers={"date": "2020-01-15"}
            ),
        ),
        create_expectation_suite_with_name="yet_more_expectations",
        # TODO: re-add
        # overwrite_existing_expectation_suite=True,
    )

    # Failed specification: incorrectly typed expectation suite
    with pytest.raises(TypeError):
        context.get_validator(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            date="2020-01-15",
            expectation_suite={
                "im": "a",
                "dictionary": "not a",
                "ExpectationSuite": False,
            },
        )

def test_get_batch(data_context_with_sql_datasource_for_testing_get_batch):
    context = data_context_with_sql_datasource_for_testing_get_batch
    print(
        json.dumps(
            context.datasources["my_sqlite_db"].get_available_data_asset_names(),
            indent=4,
        )
    )

    # Successful specification using a typed BatchRequest
    context.get_batch(
        batch_request=BatchRequest(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            partition_request=PartitionRequest(
                partition_identifiers={"date": "2020-01-15"}
            ),
        )
    )

    # Failed specification using an untyped BatchRequest
    with pytest.raises(TypeError):
        context.get_batch(
            batch_request={
                "datasource_name": "my_sqlite_db",
                "data_connector_name": "daily",
                "data_asset_name": "table_partitioned_by_date_column__A",
                "partition_request": {"partition_identifiers": {"date": "2020-01-15"}},
            }
        )

    # Failed specification using an incomplete BatchRequest
    with pytest.raises(ValueError):
        context.get_batch(
            batch_request=BatchRequest(
                datasource_name="my_sqlite_db",
                data_connector_name="daily",
                data_asset_name="table_partitioned_by_date_column__A",
                partition_request=PartitionRequest(partition_identifiers={}),
            )
        )

    # Failed specification using an incomplete BatchRequest
    with pytest.raises(ValueError):
        context.get_batch(
            batch_request=BatchRequest(
                datasource_name="my_sqlite_db",
                data_connector_name="daily",
                data_asset_name="table_partitioned_by_date_column__A",
            )
        )

    # Failed specification using an incomplete BatchRequest
    with pytest.raises(TypeError):
        context.get_batch(
            batch_request=BatchRequest(
                datasource_name="my_sqlite_db",
                data_connector_name="daily",
            )
        )

    # Failed specification using an incomplete BatchRequest
    # with pytest.raises(ValueError):
    with pytest.raises(TypeError):
        context.get_batch(
            batch_request=BatchRequest(
                # datasource_name=MISSING
                data_connector_name="daily",
                data_asset_name="table_partitioned_by_date_column__A",
                partition_request=PartitionRequest(partition_identifiers={}),
            )
        )

    # Successful specification using parameters
    context.get_batch(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
    )

    # Successful specification using parameters without parameter names for the
    # identifying triple. This is the thinnest this call can plausibly get.
    context.get_batch(
        "my_sqlite_db",
        "daily",
        "table_partitioned_by_date_column__A",
        date="2020-01-15",
    )

    # Successful specification using parameters without parameter names for the
    # identifying triple. In the case of a data asset containing a single Batch,
    # we don't even need parameters.
    context.get_batch(
        "my_sqlite_db",
        "whole_table",
        "table_partitioned_by_date_column__A",
    )

    # Successful specification using parameters and partition_request
    context.get_batch(
        "my_sqlite_db",
        "daily",
        "table_partitioned_by_date_column__A",
        partition_request=PartitionRequest(
            {"partition_identifiers": {"date": "2020-01-15"}}
        ),
    )

    # Successful specification using parameters and partition_identifiers
    context.get_batch(
        "my_sqlite_db",
        "daily",
        "table_partitioned_by_date_column__A",
        partition_identifiers={"date": "2020-01-15"},
    )