def test__get_full_file_path_for_asset_spark(basic_spark_df_execution_engine,
                                             fs):
    """
    What does this test and why?
    File paths in DBFS must use the `dbfs:/` protocol base instead of `/dbfs/` when they
    are read via the `spark.read` method in the ExecutionEngine. In the data connector
    config, however, the `/dbfs/` form must be used. This test verifies that a config
    using a `/dbfs/` path is translated to `dbfs:/` when preparing the PathBatchSpec for
    the SparkDFExecutionEngine.
    """

    base_directory: str = "/dbfs/great_expectations"
    base_directory_colon: str = "dbfs:/great_expectations"
    fs.create_dir(base_directory)

    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_0/A/B/C/logfile_0.csv",
            "test_dir_0/A/B/C/bigfile_1.csv",
            "test_dir_0/A/filename2.csv",
            "test_dir_0/A/filename3.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            module_name: great_expectations.datasource.data_connector
            class_name: ConfiguredAssetDBFSDataConnector
            datasource_name: BASE
            base_directory: {base_directory}/test_dir_0/A
            glob_directive: "*"
            default_regex:
              pattern: (.+)\\.csv
              group_names:
              - name

            assets:
              A:
                base_directory: B/C
                glob_directive: "log*.csv"
                pattern: (.+)_(\\d+)\\.csv
                group_names:
                - name
                - number
        """, )

    my_data_connector: ConfiguredAssetDBFSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_configured_asset_filesystem_data_connector",
            "execution_engine": basic_spark_df_execution_engine,
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )
    my_data_connector.data_context_root_directory = base_directory

    assert (my_data_connector._get_full_file_path_for_asset(
        path="bigfile_1.csv", asset=my_data_connector.assets["A"]) ==
            f"{base_directory_colon}/test_dir_0/A/B/C/bigfile_1.csv")
    self_check_report = my_data_connector.self_check()
    assert self_check_report == {
        "class_name": "ConfiguredAssetDBFSDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["A"],
        "data_assets": {
            "A": {
                "batch_definition_count": 1,
                "example_data_references": ["logfile_0.csv"],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition
    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="my_configured_asset_filesystem_data_connector",
        data_asset_name="A",
        data_connector_query=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request))
    assert len(my_batch_definition_list) == 1

    my_batch_definition = my_batch_definition_list[0]
    batch_spec: BatchSpec = my_data_connector.build_batch_spec(
        batch_definition=my_batch_definition)

    assert isinstance(batch_spec, PathBatchSpec)
    assert batch_spec.path == "dbfs:/great_expectations/test_dir_0/A/B/C/logfile_0.csv"
        "account_url"] = "superconductivetesting.blob.core.windows.net"
datasource_config["data_connectors"]["configured_data_connector_name"][
    "azure_options"]["credential"] = CREDENTIAL
datasource_config["data_connectors"]["configured_data_connector_name"][
    "container"] = "superconductive-public"
datasource_config["data_connectors"]["configured_data_connector_name"][
    "name_starts_with"] = "data/taxi_yellow_tripdata_samples/"

context.test_yaml_config(yaml.dump(datasource_config))

context.add_datasource(**datasource_config)

# Here is a BatchRequest naming a data_asset
batch_request = BatchRequest(
    datasource_name="my_azure_datasource",
    data_connector_name="configured_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "taxi_data"

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)
Example #3
def test_get_batch(data_context_with_sql_datasource_for_testing_get_batch):
    context = data_context_with_sql_datasource_for_testing_get_batch

    print(
        json.dumps(
            context.datasources["my_sqlite_db"].get_available_data_asset_names(),
            indent=4,
        )
    )

    # Successful specification using a typed BatchRequest
    context.get_batch(
        batch_request=BatchRequest(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            partition_request=PartitionRequest(
                partition_identifiers={"date": "2020-01-15"}
            ),
        )
    )

    # Failed specification using an untyped BatchRequest
    with pytest.raises(TypeError):
        context.get_batch(
            batch_request={
                "datasource_name": "my_sqlite_db",
                "data_connector_name": "daily",
                "data_asset_name": "table_partitioned_by_date_column__A",
                "partition_request": {"partition_identifiers": {"date": "2020-01-15"}},
            }
        )

    # Failed specification using an incomplete BatchRequest
    with pytest.raises(ValueError):
        context.get_batch(
            batch_request=BatchRequest(
                datasource_name="my_sqlite_db",
                data_connector_name="daily",
                data_asset_name="table_partitioned_by_date_column__A",
                partition_request=PartitionRequest(partition_identifiers={}),
            )
        )

    # Failed specification using an incomplete BatchRequest
    with pytest.raises(ValueError):
        context.get_batch(
            batch_request=BatchRequest(
                datasource_name="my_sqlite_db",
                data_connector_name="daily",
                data_asset_name="table_partitioned_by_date_column__A",
            )
        )

    # Failed specification using an incomplete BatchRequest
    with pytest.raises(TypeError):
        context.get_batch(
            batch_request=BatchRequest(
                datasource_name="my_sqlite_db",
                data_connector_name="daily",
            )
        )

    # Failed specification using an incomplete BatchRequest
    # with pytest.raises(ValueError):
    with pytest.raises(TypeError):
        context.get_batch(
            batch_request=BatchRequest(
                # datasource_name=MISSING
                data_connector_name="daily",
                data_asset_name="table_partitioned_by_date_column__A",
                partition_request=PartitionRequest(partition_identifiers={}),
            )
        )

    # Successful specification using parameters
    context.get_batch(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
    )

    # Successful specification using parameters without parameter names for the identifying triple
    # This is the thinnest this can plausibly get.
    context.get_batch(
        "my_sqlite_db",
        "daily",
        "table_partitioned_by_date_column__A",
        date="2020-01-15",
    )

    # Successful specification using parameters without parameter names for the identifying triple
    # In the case of a data_asset containing a single Batch, we don't even need parameters
    context.get_batch(
        "my_sqlite_db",
        "whole_table",
        "table_partitioned_by_date_column__A",
    )

    # Successful specification using parameters and partition_request
    context.get_batch(
        "my_sqlite_db",
        "daily",
        "table_partitioned_by_date_column__A",
        partition_request=PartitionRequest(
            {"partition_identifiers": {"date": "2020-01-15"}}
        ),
    )

    # Successful specification using parameters and partition_identifiers
    context.get_batch(
        "my_sqlite_db",
        "daily",
        "table_partitioned_by_date_column__A",
        partition_identifiers={"date": "2020-01-15"},
    )
Example #4
    "<CONTAINER_PATH_TO_DATA>", "data/taxi_yellow_tripdata_samples/")
datasource_yaml = datasource_yaml.replace(
    "<YOUR_ACCOUNT_URL>", "superconductivetesting.blob.core.windows.net")
datasource_yaml = datasource_yaml.replace("<YOUR_CREDENTIAL>", CREDENTIAL)

context.test_yaml_config(datasource_yaml)

context.add_datasource(**yaml.load(datasource_yaml))

# Here is a BatchRequest naming a data_asset
batch_request = BatchRequest(
    datasource_name="my_azure_datasource",
    data_connector_name="configured_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
    batch_spec_passthrough={
        "reader_method": "csv",
        "reader_options": {
            "header": True
        }
    },
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "taxi_data"

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())
def test_basic_datasource_runtime_data_connector_error_checking(
    basic_datasource_with_runtime_data_connector, ):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Test for an unknown datasource
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="non_existent_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset",
                ))

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name=basic_datasource_with_runtime_data_connector
                    .name,
                    data_connector_name="non_existent_data_connector",
                    data_asset_name="my_data_asset",
                ))

    # Test for illegal absence of partition_request when batch_data is specified
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name=basic_datasource_with_runtime_data_connector
                    .name,
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset",
                    batch_data=test_df,
                    partition_request=None,
                ))

    # Test for illegal nullity of partition_request["partition_identifiers"] when batch_data is specified
    partition_request: dict = {"partition_identifiers": None}
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name=basic_datasource_with_runtime_data_connector
                    .name,
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset",
                    batch_data=test_df,
                    partition_request=partition_request,
                ))

    # Test for illegal falsiness of partition_request["partition_identifiers"] when batch_data is specified
    partition_request: dict = {"partition_identifiers": {}}
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name=basic_datasource_with_runtime_data_connector
                    .name,
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset",
                    batch_data=test_df,
                    partition_request=partition_request,
                ))
def test_alpha(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_alpha"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_alpha/A.csv",
            "test_dir_alpha/B.csv",
            "test_dir_alpha/C.csv",
            "test_dir_alpha/D.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
                module_name: great_expectations.datasource.data_connector
                class_name: ConfiguredAssetFilesystemDataConnector
                base_directory: {base_directory}/test_dir_alpha
                assets:
                  A:
                    glob_directive: "*.csv"
                default_regex:
                    pattern: (.+)\\.csv
                    group_names:
                    - part_1
            """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )
    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="B",
        partition_request=None,
    )

    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        partition_request=PartitionRequest(
            **{"partition_identifiers": {"part_1": "B"}}
        ),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
def test_relative_default_and_relative_asset_base_directory_paths(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_relative_default_and_relative_asset_base_directory_paths"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_0/A/B/C/logfile_0.csv",
            "test_dir_0/A/B/C/bigfile_1.csv",
            "test_dir_0/A/filename2.csv",
            "test_dir_0/A/filename3.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            module_name: great_expectations.datasource.data_connector
            class_name: ConfiguredAssetFilesystemDataConnector
            base_directory: test_dir_0/A
            glob_directive: "*"
            default_regex:
              pattern: (.+)\\.csv
              group_names:
              - name

            assets:
              A:
                base_directory: B/C
                glob_directive: "log*.csv"
                pattern: (.+)_(\\d+)\\.csv
                group_names:
                - name
                - number
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "my_configured_asset_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )
    my_data_connector.data_context_root_directory = base_directory

    assert my_data_connector.base_directory == f"{base_directory}/test_dir_0/A"
    assert (
        my_data_connector._get_full_file_path_for_asset(
            path="bigfile_1.csv", asset=my_data_connector.assets["A"]
        )
        == f"{base_directory}/test_dir_0/A/B/C/bigfile_1.csv"
    )
    self_check_report = my_data_connector.self_check()
    assert self_check_report == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["A"],
        "data_assets": {
            "A": {
                "batch_definition_count": 1,
                "example_data_references": ["logfile_0.csv"],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition
    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="my_configured_asset_filesystem_data_connector",
        data_asset_name="A",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
Example #8
            - default_identifier_name
    default_inferred_data_connector_name:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: ../data/
        default_regex:
          group_names:
            - data_asset_name
          pattern: (.*)
"""
context.test_yaml_config(datasource_yaml)
context.add_datasource(**yaml.load(datasource_yaml))

# Get Validator by creating ExpectationSuite and passing in BatchRequest
batch_request = BatchRequest(
    datasource_name="data__dir",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="yellow_trip_data_sample_2019-01.csv",
    limit=1000,
)
context.create_expectation_suite(expectation_suite_name="taxi.demo")
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="taxi.demo",
)
# NOTE: The following assertion is only for testing and can be ignored by users.
assert isinstance(validator, Validator)

# Profile the data with the UserConfigurableProfiler and save resulting ExpectationSuite
ignored_columns = [
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
def test_more_complex_instantiation_of_InferredAssetSqlDataConnector(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    my_data_connector = instantiate_class_from_config(
        config={
            "class_name": "InferredAssetSqlDataConnector",
            "name": "whole_table",
            "data_asset_name_suffix": "__whole",
            "include_schema_name": True,
        },
        runtime_environment={
            "execution_engine": test_cases_for_sql_data_connector_sqlite_execution_engine,
            "datasource_name": "my_test_datasource",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    report_object = my_data_connector.self_check()

    assert report_object == {
        "class_name": "InferredAssetSqlDataConnector",
        "data_asset_count": 21,
        "data_assets": {
            "main.table_containing_id_spacers_for_D__whole": {
                "batch_definition_count": 1,
                "example_data_references": [{}],
            },
            "main.table_full__I__whole": {
                "batch_definition_count": 1,
                "example_data_references": [{}],
            },
            "main.table_partitioned_by_date_column__A__whole": {
                "batch_definition_count": 1,
                "example_data_references": [{}],
            },
        },
        "example_data_asset_names": [
            "main.table_containing_id_spacers_for_D__whole",
            "main.table_full__I__whole",
            "main.table_partitioned_by_date_column__A__whole",
        ],
        "example_data_reference": {
            "batch_spec": {
                "partition_definition": {},
                "table_name": "main.table_containing_id_spacers_for_D",
            },
            "n_rows": 30,
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
    }

    assert my_data_connector.get_available_data_asset_names() == [
        "main.table_containing_id_spacers_for_D__whole",
        "main.table_full__I__whole",
        "main.table_partitioned_by_date_column__A__whole",
        "main.table_partitioned_by_foreign_key__F__whole",
        "main.table_partitioned_by_incrementing_batch_id__E__whole",
        "main.table_partitioned_by_irregularly_spaced_incrementing_id_with_spacing_in_a_second_table__D__whole",
        "main.table_partitioned_by_multiple_columns__G__whole",
        "main.table_partitioned_by_regularly_spaced_incrementing_id_column__C__whole",
        "main.table_partitioned_by_timestamp_column__B__whole",
        "main.table_that_should_be_partitioned_by_random_hash__H__whole",
        "main.table_with_fk_reference_from_F__whole",
        "main.view_by_date_column__A__whole",
        "main.view_by_incrementing_batch_id__E__whole",
        "main.view_by_irregularly_spaced_incrementing_id_with_spacing_in_a_second_table__D__whole",
        "main.view_by_multiple_columns__G__whole",
        "main.view_by_regularly_spaced_incrementing_id_column__C__whole",
        "main.view_by_timestamp_column__B__whole",
        "main.view_containing_id_spacers_for_D__whole",
        "main.view_partitioned_by_foreign_key__F__whole",
        "main.view_that_should_be_partitioned_by_random_hash__H__whole",
        "main.view_with_fk_reference_from_F__whole",
    ]

    batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        BatchRequest(
            datasource_name="my_test_datasource",
            data_connector_name="whole_table",
            data_asset_name="main.table_that_should_be_partitioned_by_random_hash__H__whole",
        )
    )
    assert len(batch_definition_list) == 1
Example #10
context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)

# Here is a BatchRequest naming a data_asset
batch_request = BatchRequest(
    datasource_name="my_gcs_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
    batch_spec_passthrough={
        "reader_method": "csv",
        "reader_options": {
            "header": True
        }
    },
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = (
    "data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2019-01")

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
Example #11
    },
    batch_identifiers={"default_identifier_name": "something_something"},
)
context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)

# Here is a BatchRequest naming a table
batch_request = BatchRequest(
    datasource_name="my_sqlite_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name=
    "yellow_tripdata_sample_2019_01",  # this is the name of the table you want to retrieve
)
context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)
assert [ds["name"]
        for ds in context.list_datasources()] == ["my_sqlite_datasource"]
assert "yellow_tripdata_sample_2019_01" in set(
    context.get_available_data_asset_names()["my_sqlite_datasource"]
    ["default_inferred_data_connector_name"])
Example #12
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)

# Second test for BatchRequest naming a table
batch_request = BatchRequest(
    datasource_name="my_redshift_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="taxi_data",  # this is the name of the table you want to retrieve
)
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)
assert [ds["name"] for ds in context.list_datasources()] == ["my_redshift_datasource"]
assert "taxi_data" in set(
    context.get_available_data_asset_names()["my_redshift_datasource"][
Example #13
    batch_identifiers={"default_identifier_name": "default_identifier"},
)

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)

# Second test for BatchRequest naming a table
batch_request = BatchRequest(
    datasource_name="my_snowflake_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name=
    f"{sfSchema.lower()}.taxi_data",  # this is the name of the table you want to retrieve
)
context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)
assert [ds["name"]
        for ds in context.list_datasources()] == ["my_snowflake_datasource"]
assert f"{sfSchema.lower()}.taxi_data" in set(
    context.get_available_data_asset_names()["my_snowflake_datasource"]
    ["default_inferred_data_connector_name"])
def test__get_full_file_path_for_asset_pandas(fs):
    """
    What does this test and why?
    File paths in DBFS must use the `dbfs:/` protocol base instead of `/dbfs/` when they
    are read via the `spark.read` method in the ExecutionEngine. HOWEVER, when using a
    PandasExecutionEngine, the filesystem-style `/dbfs/` form must be used instead.
    This test verifies that a config using a `/dbfs/` path is NOT translated to `dbfs:/`
    when preparing the PathBatchSpec for the PandasExecutionEngine.
    """

    # Copy boto modules into fake filesystem (see https://github.com/spulec/moto/issues/1682#issuecomment-645016188)
    for module in [boto3, botocore]:
        module_dir = pathlib.Path(module.__file__).parent
        fs.add_real_directory(module_dir, lazy_read=False)

    # Copy google credentials into fake filesystem if they exist on your filesystem
    google_cred_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if google_cred_file:
        fs.add_real_file(google_cred_file)

    base_directory: str = "/dbfs/great_expectations"
    fs.create_dir(base_directory)

    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_0/A/B/C/logfile_0.csv",
            "test_dir_0/A/B/C/bigfile_1.csv",
            "test_dir_0/A/filename2.csv",
            "test_dir_0/A/filename3.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            module_name: great_expectations.datasource.data_connector
            class_name: ConfiguredAssetDBFSDataConnector
            datasource_name: BASE
            base_directory: {base_directory}/test_dir_0/A
            glob_directive: "*"
            default_regex:
              pattern: (.+)\\.csv
              group_names:
              - name

            assets:
              A:
                base_directory: B/C
                glob_directive: "log*.csv"
                pattern: (.+)_(\\d+)\\.csv
                group_names:
                - name
                - number
        """, )

    my_data_connector: ConfiguredAssetDBFSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_configured_asset_filesystem_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )
    my_data_connector.data_context_root_directory = base_directory

    assert (my_data_connector._get_full_file_path_for_asset(
        path="bigfile_1.csv", asset=my_data_connector.assets["A"]) ==
            f"{base_directory}/test_dir_0/A/B/C/bigfile_1.csv")
    self_check_report = my_data_connector.self_check()
    assert self_check_report == {
        "class_name": "ConfiguredAssetDBFSDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["A"],
        "data_assets": {
            "A": {
                "batch_definition_count": 1,
                "example_data_references": ["logfile_0.csv"],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition
    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="my_configured_asset_filesystem_data_connector",
        data_asset_name="A",
        data_connector_query=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request))
    assert len(my_batch_definition_list) == 1

    my_batch_definition = my_batch_definition_list[0]
    batch_spec: BatchSpec = my_data_connector.build_batch_spec(
        batch_definition=my_batch_definition)

    assert isinstance(batch_spec, PathBatchSpec)
    assert batch_spec.path == f"{base_directory}/test_dir_0/A/B/C/logfile_0.csv"
def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            class_name: ConfiguredAssetFilesystemDataConnector
            datasource_name: test_environment
            #execution_engine:
            #    class_name: PandasExecutionEngine
            base_directory: {base_directory}
            glob_directive: "*.csv"
            assets:
                TestFiles:
            default_regex:
                pattern: (.+)_(.+)_(.+)\\.csv
                group_names:
                    - name
                    - timestamp
                    - price
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )
    )
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list
Example #16
datasource_yaml = datasource_yaml.replace("<PATH_TO_YOUR_DATA_HERE>",
                                          data_dir_path)

context.test_yaml_config(datasource_yaml)

context.add_datasource(**yaml.load(datasource_yaml))
available_data_asset_names = context.datasources[
    "taxi_datasource"].get_available_data_asset_names(
        data_connector_names="default_inferred_data_connector_name"
    )["default_inferred_data_connector_name"]
assert len(available_data_asset_names) == 36

# Here is a BatchRequest naming an inferred data_asset.
batch_request = BatchRequest(
    datasource_name="taxi_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "yellow_tripdata_sample_2019-01.csv"

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head(n_rows=10))

batch_list = context.get_batch_list(batch_request=batch_request)
assert len(batch_list) == 1
def test_return_all_batch_definitions_sorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_sorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
        class_name: ConfiguredAssetFilesystemDataConnector
        datasource_name: test_environment
        #execution_engine:
        #    class_name: PandasExecutionEngine
        base_directory: {base_directory}
        glob_directive: "*.csv"
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_(.+)_(.+)\\.csv
            group_names:
                - name
                - timestamp
                - price
        sorters:
            - orderby: asc
              class_name: LexicographicSorter
              name: name
            - datetime_format: "%Y%m%d"
              orderby: desc
              class_name: DateTimeSorter
              name: timestamp
            - orderby: desc
              class_name: NumericSorter
              name: price

    """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "partition_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )

    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition(
            **{
                "name": "james",
                "timestamp": "20200713",
                "price": "1567",
            }
        ),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without partition request, should return all 10
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    # should return 10
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10
def test_get_available_data_asset_names_with_single_partition_file_data_connector(
    sample_datasource_v013_with_single_partition_file_data_connector,
):
    datasource: Datasource = (
        sample_datasource_v013_with_single_partition_file_data_connector
    )
    data_connector_names: Optional[Union[List, str]] = None

    # Call "get_batch_list_from_batch_request()" to fill up the caches
    data_connector_name: str = "test_runtime_data_connector"
    data_asset_name: str = "IN_MEMORY_DATA_ASSET"
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    batch_request: dict = {
        "datasource_name": datasource.name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
        "batch_data": test_df,
        "partition_request": {
            "partition_identifiers": {
                "airflow_run_id": 1234567890,
            },
            "limit": None,
        },
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)
    # noinspection PyUnusedLocal
    batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
        batch_request=batch_request
    )

    expected_data_asset_names: dict = {
        "test_runtime_data_connector": [data_asset_name],
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"],
    }

    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )

    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])

    data_connector_names = [
        "my_filesystem_data_connector",
        "test_runtime_data_connector",
    ]

    expected_data_asset_names: dict = {
        "test_runtime_data_connector": [data_asset_name],
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"],
    }

    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )

    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])

    data_connector_names = ["my_filesystem_data_connector"]

    expected_data_asset_names: dict = {
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"]
    }

    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )

    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])

    data_connector_names = "my_filesystem_data_connector"

    expected_data_asset_names: dict = {
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"]
    }

    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )

    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])

    data_connector_names = ["my_filesystem_data_connector"]

    expected_data_asset_names: dict = {
        "my_filesystem_data_connector": ["DEFAULT_ASSET_NAME"]
    }

    available_data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names=data_connector_names
    )

    assert set(available_data_asset_names.keys()) == set(
        expected_data_asset_names.keys()
    )
    for connector_name, asset_list in available_data_asset_names.items():
        assert set(asset_list) == set(expected_data_asset_names[connector_name])
def test_foxtrot(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_foxtrot"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_foxtrot/A/A-1.csv",
            "test_dir_foxtrot/A/A-2.csv",
            "test_dir_foxtrot/A/A-3.csv",
            "test_dir_foxtrot/B/B-1.txt",
            "test_dir_foxtrot/B/B-2.txt",
            "test_dir_foxtrot/B/B-3.txt",
            "test_dir_foxtrot/C/C-2017.csv",
            "test_dir_foxtrot/C/C-2018.csv",
            "test_dir_foxtrot/C/C-2019.csv",
            "test_dir_foxtrot/D/D-aaa.csv",
            "test_dir_foxtrot/D/D-bbb.csv",
            "test_dir_foxtrot/D/D-ccc.csv",
            "test_dir_foxtrot/D/D-ddd.csv",
            "test_dir_foxtrot/D/D-eee.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            module_name: great_expectations.datasource.data_connector
            class_name: ConfiguredAssetFilesystemDataConnector
            base_directory: {base_directory}/test_dir_foxtrot
            assets:
              A:
                base_directory: A/
              B:
                base_directory: B/
                pattern: (.*)-(.*)\\.txt
                group_names:
                - part_1
                - part_2
              C:
                glob_directive: "*"
                base_directory: C/
              D:
                glob_directive: "*"
                base_directory: D/
            default_regex:
                pattern: (.*)-(.*)\\.csv
                group_names:
                - part_1
                - part_2
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )
    self_check_report = my_data_connector.self_check()
    assert self_check_report == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 4,
        "example_data_asset_names": ["A", "B", "C"],
        "data_assets": {
            "A": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "A-1.csv",
                    "A-2.csv",
                    "A-3.csv",
                ],
            },
            "B": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "B-1.txt",
                    "B-2.txt",
                    "B-3.txt",
                ],
            },
            "C": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "C-2017.csv",
                    "C-2018.csv",
                    "C-2019.csv",
                ],
            },
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {},
    }
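    # Note: self_check() reports data_asset_count == 4 above, yet only assets A, B, and C
    # appear in example_data_asset_names and data_assets; asset D is counted but not listed
    # among the truncated examples.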
    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition
    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 3


def test_get_batch_definitions_and_get_batch_basics(
        basic_pandas_datasource_v013):
    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        basic_pandas_datasource_v013.
        data_connectors["my_filesystem_data_connector"])
    create_files_in_directory(
        my_data_connector.base_directory,
        ["A_1.csv", "A_2.csv", "A_3.csv", "B_1.csv", "B_2.csv", "B_3.csv"],
    )

    assert (len(
        basic_pandas_datasource_v013.get_available_batch_definitions(
            batch_request=BatchRequest(
                datasource_name="my_datasource",
                data_connector_name="my_filesystem_data_connector",
                data_asset_name="Titanic",
            ))) == 6)

    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            batch_identifiers=IDDict({
                "letter": "B",
                "number": "1",
            }),
        ))

    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.batch_definition == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="B1",
        batch_identifiers=IDDict({
            "letter": "B",
            "number": "1",
        }),
    )

    batch_list: List[
        Batch] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="my_datasource",
                data_connector_name="my_filesystem_data_connector",
                data_asset_name="B1",
                data_connector_query={
                    "batch_filter_parameters": {
                        "letter": "B",
                        "number": "1",
                    }
                },
            ))
    assert len(batch_list) == 0

    batch_list: List[
        Batch] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="my_datasource",
                data_connector_name="my_filesystem_data_connector",
                data_asset_name="Titanic",
                data_connector_query={
                    "batch_filter_parameters": {
                        "letter": "B",
                        "number": "1",
                    }
                },
            ))
    assert len(batch_list) == 1
    assert isinstance(batch_list[0].data.dataframe, pd.DataFrame)

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            batch_identifiers=IDDict({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}


def test_batches_are_accessible(
    multibatch_generic_csv_generator,
    multibatch_generic_csv_generator_context,
):
    """
    What does this test and why?
    Batches created in the multibatch_generic_csv_generator fixture should be available using the
    multibatch_generic_csv_generator_context.
    This test most likely duplicates tests elsewhere, but it is more of a test of the configurable fixture.
    """

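    # Assumed layout (inferred from the assertions below): the fixture writes `total_batches`
    # daily CSV files under the context's ../data directory, each with a `batch_num` column
    # equal to its batch number and a `string_cardinality_3` column limited to
    # category0/category1/category2.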
    context: DataContext = multibatch_generic_csv_generator_context
    data_relative_path = "../data"
    data_path = os.path.join(context.root_directory, data_relative_path)
    datasource_name = "generic_csv_generator"
    data_connector_name = "daily_data_connector"
    asset_name = "daily_data_asset"

    datasource = context.datasources[datasource_name]

    data_connector = datasource.data_connectors[data_connector_name]

    total_batches: int = 20
    file_list = multibatch_generic_csv_generator(
        data_path=data_path, num_event_batches=total_batches)

    assert (
        data_connector._get_data_reference_list_from_cache_by_data_asset_name(
            data_asset_name=asset_name) == file_list)

    batch_request_1 = BatchRequest(
        datasource_name="generic_csv_generator",
        data_connector_name="daily_data_connector",
        data_asset_name="daily_data_asset",
        data_connector_query={
            "index": -1,
        },
    )
    # A data_connector_query with index -1 should give the most recent batch
    validator_1 = context.get_validator(
        batch_request=batch_request_1,
        create_expectation_suite_with_name="my_expectation_suite_name_1",
    )
    metric_max = validator_1.get_metric(
        MetricConfiguration("column.max",
                            metric_domain_kwargs={"column": "batch_num"}))
    assert metric_max == total_batches
    metric_value_set = validator_1.get_metric(
        MetricConfiguration(
            "column.distinct_values",
            metric_domain_kwargs={"column": "string_cardinality_3"},
        ))
    assert metric_value_set == {"category0", "category1", "category2"}

    batch_request_2 = BatchRequest(
        datasource_name="generic_csv_generator",
        data_connector_name="daily_data_connector",
        data_asset_name="daily_data_asset",
        data_connector_query={
            "index": -2,
        },
    )
    validator_2 = context.get_validator(
        batch_request=batch_request_2,
        create_expectation_suite_with_name="my_expectation_suite_name_2",
    )
    metric_max = validator_2.get_metric(
        MetricConfiguration("column.max",
                            metric_domain_kwargs={"column": "batch_num"}))
    assert metric_max == total_batches - 1
    metric_value_set = validator_2.get_metric(
        MetricConfiguration(
            "column.distinct_values",
            metric_domain_kwargs={"column": "string_cardinality_3"},
        ))
    assert metric_value_set == {"category0", "category1", "category2"}

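    # Walk back through every batch: index -1 selects the most recent batch, so index
    # -batch_num is expected to land on batch number (total_batches + 1) - batch_num.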
    for batch_num in range(1, total_batches + 1):
        batch_request = BatchRequest(
            datasource_name="generic_csv_generator",
            data_connector_name="daily_data_connector",
            data_asset_name="daily_data_asset",
            data_connector_query={
                "index": -batch_num,
            },
        )
        validator = context.get_validator(
            batch_request=batch_request,
            create_expectation_suite_with_name=
            f"my_expectation_suite_name__{batch_num}",
        )
        metric_max = validator.get_metric(
            MetricConfiguration("column.max",
                                metric_domain_kwargs={"column": "batch_num"}))
        assert metric_max == (total_batches + 1) - batch_num
        metric_value_set = validator.get_metric(
            MetricConfiguration(
                "column.distinct_values",
                metric_domain_kwargs={"column": "string_cardinality_3"},
            ))
        assert metric_value_set == {"category0", "category1", "category2"}


def test_complex_regex_example_with_implicit_data_asset_names(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_complex_regex_example_with_implicit_data_asset_names"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "2020/01/alpha-1001.csv",
            "2020/01/beta-1002.csv",
            "2020/02/alpha-1003.csv",
            "2020/02/beta-1004.csv",
            "2020/03/alpha-1005.csv",
            "2020/03/beta-1006.csv",
            "2020/04/beta-1007.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = InferredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        glob_directive="*/*/*.csv",
        base_directory=base_directory,
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    # Test for an unknown datasource
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="my_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector", data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector", data_asset_name="beta",
                )
            )
        )
        == 4
    )

    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_request={
                "partition_identifiers": {"year_dir": "2020", "month_dir": "03",}
            },
        )
    ) == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_definition=PartitionDefinition(year_dir="2020", month_dir="03",),
        )
    ]


# In normal usage you'd set your path directly in the yaml above.
datasource_yaml = datasource_yaml.replace("<YOUR_GCS_BUCKET_HERE>",
                                          "test_docs_data")
datasource_yaml = datasource_yaml.replace(
    "<BUCKET_PATH_TO_DATA>", "data/taxi_yellow_tripdata_samples/")

context.test_yaml_config(datasource_yaml)
# <snippet>
context.add_datasource(**yaml.load(datasource_yaml))
# </snippet>

# batch_request with data_asset_name
# <snippet>
batch_request = BatchRequest(
    datasource_name="my_gcs_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)
# </snippet>

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = (
    "data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2019-01")

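# For illustration only (not part of the original snippet): with the placeholder filled in,
# the direct form would look roughly like this, using the asset name from the override above.
# batch_request = BatchRequest(
#     datasource_name="my_gcs_datasource",
#     data_connector_name="default_inferred_data_connector_name",
#     data_asset_name="data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2019-01",
# )
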
# <snippet>
context.create_expectation_suite(expectation_suite_name="test_gcs_suite",
                                 overwrite_existing=True)

validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_gcs_suite")


def test_redundant_information_in_naming_convention_bucket_sorted(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/2021/01/01/log_file-20210101.txt.gz",
            "some_bucket/2021/01/02/log_file-20210102.txt.gz",
            "some_bucket/2021/01/03/log_file-20210103.txt.gz",
            "some_bucket/2021/01/04/log_file-20210104.txt.gz",
            "some_bucket/2021/01/05/log_file-20210105.txt.gz",
            "some_bucket/2021/01/06/log_file-20210106.txt.gz",
            "some_bucket/2021/01/07/log_file-20210107.txt.gz",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
          module_name: great_expectations.datasource.data_connector
          class_name: InferredAssetFilesystemDataConnector
          datasource_name: test_environment
          name: my_inferred_asset_filesystem_data_connector
          base_directory: {base_directory}/
          glob_directive: "*/*/*/*/*.txt.gz"
          default_regex:
              group_names:
                  - data_asset_name
                  - year
                  - month
                  - day
                  - full_date
              pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
          sorters:
              - orderby: desc
                class_name: DateTimeSorter
                name: full_date

          """,
    )

    my_data_connector: InferredAssetFilesystemDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_inferred_asset_filesystem_data_connector",
            "datasource_name": "test_environment",
            "execution_engine": "BASE_ENGINE",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    sorted_batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        BatchRequest(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
        )
    )

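    # With the DateTimeSorter configured as `orderby: desc` on `full_date`, batch definitions
    # should come back newest-first (2021-01-07 down through 2021-01-01).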
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "07", "full_date": "20210107"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "06", "full_date": "20210106"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "05", "full_date": "20210105"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "04", "full_date": "20210104"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "03", "full_date": "20210103"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "02", "full_date": "20210102"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "01", "full_date": "20210101"}
            ),
        ),
    ]
    assert expected == sorted_batch_definition_list


data_dir_path = "data"
CONNECTION_STRING = f"sqlite:///{data_dir_path}/yellow_tripdata.db"

datasource_yaml = datasource_yaml.replace("<CONNECTION_STRING>", CONNECTION_STRING)

context.test_yaml_config(datasource_yaml)

context.add_datasource(**yaml.load(datasource_yaml))
available_data_asset_names = context.datasources[
    "taxi_datasource"
].get_available_data_asset_names(data_connector_names="whole_table")["whole_table"]
assert len(available_data_asset_names) == 2
# Here is a BatchRequest referring to an un-partitioned inferred data_asset.
batch_request = BatchRequest(
    datasource_name="taxi_datasource",
    data_connector_name="whole_table",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name: str = "main.yellow_tripdata_sample_2019_01"

context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head(n_rows=10))


batch_request.runtime_parameters["path"] = "data/yellow_tripdata_sample_2019-01.csv"

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)

# Here is a BatchRequest naming a data_asset
batch_request = BatchRequest(
    datasource_name="my_filesystem_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "yellow_tripdata_sample_2019-01"

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)


            - data_asset_name
          pattern: (.*)
"""

# Note: this override is for internal GE purposes, and is intended to help us better understand
# how the Getting Started Guide is being used. It can be ignored by users.
datasource_yaml = datasource_yaml.replace("getting_started_datasource",
                                          GETTING_STARTED_DATASOURCE_NAME)

context.test_yaml_config(datasource_yaml)
context.add_datasource(**yaml.load(datasource_yaml))

# Get Validator by creating ExpectationSuite and passing in BatchRequest
batch_request = BatchRequest(
    datasource_name="getting_started_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="yellow_tripdata_sample_2019-01.csv",
    limit=1000,
)

# Note: this override is for internal GE purposes, and is intended to help us better understand
# how the Getting Started Guide is being used. It can be ignored by users.
batch_request = BatchRequest(
    datasource_name=GETTING_STARTED_DATASOURCE_NAME,
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="yellow_tripdata_sample_2019-01.csv",
    limit=1000,
)

expectation_suite_name = "getting_started_expectation_suite_taxi.demo"

def test_example_with_explicit_data_asset_names(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_example_with_explicit_data_asset_names")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "my_base_directory/alpha/files/go/here/alpha-202001.csv",
            "my_base_directory/alpha/files/go/here/alpha-202002.csv",
            "my_base_directory/alpha/files/go/here/alpha-202003.csv",
            "my_base_directory/beta_here/beta-202001.txt",
            "my_base_directory/beta_here/beta-202002.txt",
            "my_base_directory/beta_here/beta-202003.txt",
            "my_base_directory/beta_here/beta-202004.txt",
            "my_base_directory/gamma-202001.csv",
            "my_base_directory/gamma-202002.csv",
            "my_base_directory/gamma-202003.csv",
            "my_base_directory/gamma-202004.csv",
            "my_base_directory/gamma-202005.csv",
        ],
    )
    yaml_string = f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE_NAME
base_directory: {base_directory}/my_base_directory/
default_regex:
    pattern: ^(.+)-(\\d{{4}})(\\d{{2}})\\.(csv|txt)$
    group_names:
        - data_asset_name
        - year_dir
        - month_dir
assets:
    alpha:
        base_directory: {base_directory}/my_base_directory/alpha/files/go/here/
        glob_directive: "*.csv"
    beta:
        base_directory: {base_directory}/my_base_directory/beta_here/
        glob_directive: "*.txt"
    gamma:
        glob_directive: "*.csv"

    """
    config = yaml.load(yaml_string)
    my_data_connector = instantiate_class_from_config(
        config,
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
        runtime_environment={"name": "my_data_connector"},
    )
    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert len(my_data_connector.get_unmatched_data_references()) == 0

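    # Per file_name_list above: alpha has 3 matching CSVs, beta has 4 TXTs, and gamma has
    # 5 CSVs, which the per-asset batch counts below are expected to reflect.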
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    data_connector_name="my_data_connector",
                    data_asset_name="gamma",
                )
            )
        )
        == 5
    )


def test_get_validator_expectation_suite_options(
    data_context_with_sql_datasource_for_testing_get_batch,
):
    context = data_context_with_sql_datasource_for_testing_get_batch
    context.create_expectation_suite("some_expectations")

    # Successful specification with an existing expectation_suite_name
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
        expectation_suite_name="some_expectations",
    )

    # Successful specification with a fetched ExpectationSuite object
    some_expectations = context.get_expectation_suite("some_expectations")
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
        expectation_suite=some_expectations,
    )

    # Successful specification with a fresh ExpectationSuite object
    some_more_expectations = context.create_expectation_suite(
        expectation_suite_name="some_more_expectations"
    )
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        date="2020-01-15",
        expectation_suite=some_more_expectations,
    )

    # Successful specification using overwrite_existing_expectation_suite
    context.get_validator(
        batch_request=BatchRequest(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            partition_request=PartitionRequest(
                partition_identifiers={"date": "2020-01-15"}
            ),
        ),
        create_expectation_suite_with_name="yet_more_expectations",
        # TODO: readd
        # overwrite_existing_expectation_suite=True,
    )

    # Failed specification: incorrectly typed expectation suite
    with pytest.raises(TypeError):
        context.get_validator(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            date="2020-01-15",
            expectation_suite={
                "im": "a",
                "dictionary": "not a",
                "ExpectationSuite": False,
            },
        )


def test_redundant_information_in_naming_convention_bucket_sorted(
        mock_gcs_conn, mock_list_keys, mock_emit):
    my_data_connector_yaml = yaml.load(
        """
          module_name: great_expectations.datasource.data_connector
          class_name: InferredAssetGCSDataConnector
          datasource_name: test_environment
          name: my_inferred_asset_filesystem_data_connector
          bucket_or_name: test_bucket
          prefix: ""
          default_regex:
              group_names:
                  - data_asset_name
                  - year
                  - month
                  - day
                  - full_date
              pattern: (\\w{11})/(\\d{4})/(\\d{2})/(\\d{2})/log_file-(.*)\\.txt\\.gz
          sorters:
              - orderby: desc
                class_name: DateTimeSorter
                name: full_date
          """, )

    my_data_connector: InferredAssetGCSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_inferred_asset_filesystem_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )

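    # Assumption: the mocked GCS listing (mock_list_keys) returns the same dated
    # log_file-*.txt.gz keys as the filesystem variant of this test above, which is why the
    # expected batch definitions below mirror it, sorted newest-first on full_date.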
    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name=
                "my_inferred_asset_filesystem_data_connector",
                data_asset_name="some_bucket",
            )))

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict({
                "year": "2021",
                "month": "01",
                "day": "07",
                "full_date": "20210107"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict({
                "year": "2021",
                "month": "01",
                "day": "06",
                "full_date": "20210106"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict({
                "year": "2021",
                "month": "01",
                "day": "05",
                "full_date": "20210105"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict({
                "year": "2021",
                "month": "01",
                "day": "04",
                "full_date": "20210104"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict({
                "year": "2021",
                "month": "01",
                "day": "03",
                "full_date": "20210103"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict({
                "year": "2021",
                "month": "01",
                "day": "02",
                "full_date": "20210102"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict({
                "year": "2021",
                "month": "01",
                "day": "01",
                "full_date": "20210101"
            }),
        ),
    ]
    assert expected == sorted_batch_definition_list