def test__file_object_caching_for_FileDataConnector(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "basic_data_connector__filesystem_data_connector"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "pretend/path/A-100.csv",
            "pretend/path/A-101.csv",
            "pretend/directory/B-1.csv",
            "pretend/directory/B-2.csv",
        ],
    )

    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=PandasExecutionEngine(),
        base_directory=base_directory,
        glob_directive="*/*/*.csv",
        default_regex={
            "pattern": "(.*).csv",
            "group_names": ["name"],
        },
        assets={"stuff": {}},
    )

    assert my_data_connector.get_data_reference_list_count() == 0
    assert len(my_data_connector.get_unmatched_data_references()) == 0

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert len(my_data_connector.get_unmatched_data_references()) == 0
    assert my_data_connector.get_data_reference_list_count() == 4
def test_reader_fn_parameters(spark_session, basic_spark_df_execution_engine,
                              tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_csv"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test-A.csv",
        ],
    )
    test_df_small_csv_path = base_directory + "/test-A.csv"
    engine = basic_spark_df_execution_engine
    fn = engine._get_reader_fn(reader=spark_session.read,
                               path=test_df_small_csv_path)
    assert "<bound method DataFrameReader.csv" in str(fn)

    test_sparkdf_with_header_param = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=test_df_small_csv_path,
            data_asset_name="DATA_ASSET",
            reader_options={"header": True},
        )).dataframe
    assert test_sparkdf_with_header_param.head() == Row(x="1", y="2")

    test_sparkdf_with_no_header_param = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(path=test_df_small_csv_path,
                      data_asset_name="DATA_ASSET")).dataframe
    assert test_sparkdf_with_no_header_param.head() == Row(_c0="x", _c1="y")
def test_simple_regex_example_with_implicit_data_asset_names_self_check(
    tmp_path_factory,
):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_simple_regex_example_with_implicit_data_asset_names"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "A-100.csv",
            "A-101.csv",
            "B-1.csv",
            "B-2.csv",
            "CCC.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex={
                "pattern": r"(.+)-(\d+)\.csv",
                "group_names": [
                    "data_asset_name",
                    "number",
                ],
            },
            glob_directive="*",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    self_check_report_object = my_data_connector.self_check()

    assert self_check_report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": ["CCC.csv"],
        "unmatched_data_reference_count": 1,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
def test_return_all_batch_definitions_too_many_sorters(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_return_all_batch_definitions_too_many_sorters"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )
    my_data_connector_yaml = yaml.load(
        f"""
        class_name: ConfiguredAssetFilesystemDataConnector
        datasource_name: test_environment
        #execution_engine:
        #    class_name: PandasExecutionEngine
        base_directory: {base_directory}
        glob_directive: "*.csv"
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_.+_.+\\.csv
            group_names:
                - name
        sorters:
            - orderby: asc
              class_name: LexicographicSorter
              name: name
            - datetime_format: "%Y%m%d"
              orderby: desc
              class_name: DateTimeSorter
              name: timestamp
            - orderby: desc
              class_name: NumericSorter
              name: price

    """, )
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        my_data_connector: ConfiguredAssetFilesystemDataConnector = (
            instantiate_class_from_config(
                config=my_data_connector_yaml,
                runtime_environment={
                    "name": "general_filesystem_data_connector",
                    "datasource_name": "test_environment",
                },
                config_defaults={
                    "module_name":
                    "great_expectations.datasource.data_connector"
                },
            ))
def test_redundant_information_in_naming_convention_bucket(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context

    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/2021/01/01/log_file-20210101.txt.gz",
            "some_bucket/2021/01/02/log_file-20210102.txt.gz",
            "some_bucket/2021/01/03/log_file-20210103.txt.gz",
            "some_bucket/2021/01/04/log_file-20210104.txt.gz",
            "some_bucket/2021/01/05/log_file-20210105.txt.gz",
            "some_bucket/2021/01/06/log_file-20210106.txt.gz",
            "some_bucket/2021/01/07/log_file-20210107.txt.gz",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
          module_name: great_expectations.datasource.data_connector
          class_name: InferredAssetFilesystemDataConnector
          datasource_name: FAKE_DATASOURCE
          name: TEST_DATA_CONNECTOR
          base_directory: {base_directory}/
          glob_directive: "*/*/*/*/*.txt.gz"
          default_regex:
              group_names:
                  - data_asset_name
                  - year
                  - month
                  - day
              pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-.*\\.txt\\.gz
              """,
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["some_bucket"],
        "data_assets": {
            "some_bucket": {
                "batch_definition_count": 7,
                "example_data_references": [
                    "some_bucket/2021/01/01/log_file-*.txt.gz",
                    "some_bucket/2021/01/02/log_file-*.txt.gz",
                    "some_bucket/2021/01/03/log_file-*.txt.gz",
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
def test_basic_instantiation(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_basic_instantiation"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
    )

    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        base_directory=base_directory,
        assets={"alpha": {}},
    )

    assert my_data_connector.self_check() == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "alpha-1.csv",
                    "alpha-2.csv",
                    "alpha-3.csv",
                ],
                "batch_definition_count":
                3,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()
    assert my_data_connector.get_data_reference_list_count() == 3
    assert my_data_connector.get_unmatched_data_references() == []

    # Illegal execution environment name
    with pytest.raises(ValueError):
        print(
            my_data_connector.get_batch_definition_list_from_batch_request(
                BatchRequest(
                    datasource_name="something",
                    data_connector_name="my_data_connector",
                    data_asset_name="something",
                )))
def test_for_self_check_using_InferredAssetFilesystemDataConnector_SparkDFExecutionEngine(
        spark_session, tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "basic_data_connector_inferred_asset_filesystem_data_connector"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20201010_1000.csv",
            "abe_202011111_2000.csv",
            "will_20201212_3000.csv",
        ],
    )
    my_data_connector = InferredAssetFilesystemDataConnector(
        name="my_data_connector",
        base_directory=base_directory,
        glob_directive="*.csv",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=SparkDFExecutionEngine(),
        default_regex={
            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
            "group_names": ["data_asset_name", "timestamp", "size"],
        },
    )
    self_check_results = my_data_connector.self_check()
    assert self_check_results["data_asset_count"] == 3
    assert self_check_results["example_data_reference"]["n_rows"] == 3
def test_redundant_information_in_naming_convention_bucket_too_many_sorters(
    tmp_path_factory,
):
    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/2021/01/01/log_file-20210101.txt.gz",
            "some_bucket/2021/01/02/log_file-20210102.txt.gz",
            "some_bucket/2021/01/03/log_file-20210103.txt.gz",
            "some_bucket/2021/01/04/log_file-20210104.txt.gz",
            "some_bucket/2021/01/05/log_file-20210105.txt.gz",
            "some_bucket/2021/01/06/log_file-20210106.txt.gz",
            "some_bucket/2021/01/07/log_file-20210107.txt.gz",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
        module_name: great_expectations.datasource.data_connector
        class_name: InferredAssetFilesystemDataConnector
        datasource_name: test_environment
        name: my_inferred_asset_filesystem_data_connector
        base_directory: {base_directory}/
        glob_directive: "*/*/*/*/*.txt.gz"
        default_regex:
            group_names:
                - data_asset_name
                - year
                - month
                - day
                - full_date
            pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
        sorters:
            - datetime_format: "%Y%m%d"
              orderby: desc
              class_name: DateTimeSorter
              name: timestamp
            - orderby: desc
              class_name: NumericSorter
              name: price
          """,
    )

    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        my_data_connector: InferredAssetFilesystemDataConnector = (
            instantiate_class_from_config(
                config=my_data_connector_yaml,
                runtime_environment={
                    "name": "my_inferred_asset_filesystem_data_connector",
                    "datasource_name": "test_environment",
                    "execution_engine": "BASE_ENGINE",
                },
                config_defaults={
                    "module_name": "great_expectations.datasource.data_connector"
                },
            )
        )
def test_redundant_information_in_naming_convention_timestamp(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context

    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "log_file-2021-01-01-035419.163324.txt.gz",
            "log_file-2021-01-02-035513.905752.txt.gz",
            "log_file-2021-01-03-035455.848839.txt.gz",
            "log_file-2021-01-04-035251.47582.txt.gz",
            "log_file-2021-01-05-033034.289789.txt.gz",
            "log_file-2021-01-06-034958.505688.txt.gz",
            "log_file-2021-01-07-033545.600898.txt.gz",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
          module_name: great_expectations.datasource.data_connector
          class_name: InferredAssetFilesystemDataConnector
          datasource_name: FAKE_DATASOURCE
          name: TEST_DATA_CONNECTOR
          base_directory: {base_directory}/
          glob_directive: "*.txt.gz"
          default_regex:
              group_names:
                - data_asset_name
                - year
                - month
                - day
              pattern: (log_file)-(\\d{{4}})-(\\d{{2}})-(\\d{{2}})-.*\\.*\\.txt\\.gz
      """,
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )
    assert report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["log_file"],
        "data_assets": {
            "log_file": {
                "batch_definition_count": 7,
                "example_data_references": [
                    "log_file-2021-01-01-*.txt.gz",
                    "log_file-2021-01-02-*.txt.gz",
                    "log_file-2021-01-03-*.txt.gz",
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
def sample_datasource_v013_with_single_partition_file_data_connector(
    tmp_path_factory, ):
    base_directory: str = str(
        tmp_path_factory.mktemp(
            "basic_pandas_datasource_v013_single_partition_filesystem_data_connector"
        ))

    sample_datasource: Datasource = instantiate_class_from_config(
        yaml.load(
            f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    test_runtime_data_connector:
        module_name: great_expectations.datasource.data_connector
        class_name: RuntimeDataConnector
        batch_identifiers:
            - pipeline_stage_name
            - airflow_run_id

    my_filesystem_data_connector:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: {base_directory}
        # TODO: <Alex>Investigate: this potentially breaks the data_reference centric design.</Alex>
        glob_directive: "*.csv"
        # glob_directive: "*"

        default_regex:
            # TODO: <Alex>Investigate: this potentially breaks the data_reference centric design.</Alex>
            pattern: (.+)_(\\d+)\\.csv
            # pattern: (.+)_(\\d+)\\.[a-z][a-z][a-z]
            group_names:
            - letter
            - number
    """, ),
        runtime_environment={"name": "my_datasource"},
        config_defaults={"module_name": "great_expectations.datasource"},
    )

    sample_file_names: List[str] = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]

    create_files_in_directory(directory=base_directory,
                              file_name_list=sample_file_names)

    return sample_datasource
def test_instantiation_from_a_config_regex_does_not_match_paths(
        empty_data_context, tmp_path_factory):
    context = empty_data_context

    base_directory = str(tmp_path_factory.mktemp("test_test_yaml_config"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE
name: TEST_DATA_CONNECTOR

base_directory: {base_directory}
# glob_directive: "*.csv"

default_regex:
    pattern: beta-(.*)\\.csv
    group_names:
        - index

assets:
    alpha:

    """,
        return_mode="report_object",
    )

    assert report_object == {
        "class_name":
        "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count":
        1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [],
                "batch_definition_count": 0
            },
        },
        "example_unmatched_data_references": [
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
        "unmatched_data_reference_count":
        3,
        "example_data_reference": {},
    }
def test_redundant_information_in_naming_convention_random_hash(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context

    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "2021/01/01/log_file-2f1e94b40f310274b485e72050daf591.txt.gz",
            "2021/01/02/log_file-7f5d35d4f90bce5bf1fad680daac48a2.txt.gz",
            "2021/01/03/log_file-99d5ed1123f877c714bbe9a2cfdffc4b.txt.gz",
            "2021/01/04/log_file-885d40a5661bbbea053b2405face042f.txt.gz",
            "2021/01/05/log_file-d8e478f817b608729cfc8fb750ebfc84.txt.gz",
            "2021/01/06/log_file-b1ca8d1079c00fd4e210f7ef31549162.txt.gz",
            "2021/01/07/log_file-d34b4818c52e74b7827504920af19a5c.txt.gz",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
          module_name: great_expectations.datasource.data_connector
          class_name: InferredAssetFilesystemDataConnector
          datasource_name: FAKE_DATASOURCE
          name: TEST_DATA_CONNECTOR
          base_directory: {base_directory}/
          glob_directive: "*/*/*/*.txt.gz"
          default_regex:
              group_names:
                - year
                - month
                - day
                - data_asset_name
              pattern: (\\d{{4}})/(\\d{{2}})/(\\d{{2}})/(log_file)-.*\\.txt\\.gz

              """,
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["log_file"],
        "data_assets": {
            "log_file": {
                "batch_definition_count": 7,
                "example_data_references": [
                    "2021/01/01/log_file-*.txt.gz",
                    "2021/01/02/log_file-*.txt.gz",
                    "2021/01/03/log_file-*.txt.gz",
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
def test__get_full_file_path_spark(basic_spark_df_execution_engine, fs):
    """
    What does this test and why?
    File paths in DBFS need to use the `dbfs:/` protocol base instead of `/dbfs/` when
    being read using the `spark.read` method in the ExecutionEngine. In the data connector
    config however, the `/dbfs` version must be used. This test verifies that a config
    using a `/dbfs/` path is translated to `dbfs:/` when preparing the PathBatchSpec for the
    SparkDFExecutionEngine.
    """

    base_directory: str = "/dbfs/great_expectations"
    base_directory_colon: str = "dbfs:/great_expectations"
    fs.create_dir(base_directory)

    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "path/A-100.csv",
            "path/A-101.csv",
            "directory/B-1.csv",
            "directory/B-2.csv",
        ],
    )

    my_data_connector: InferredAssetDBFSDataConnector = InferredAssetDBFSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=basic_spark_df_execution_engine,
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        glob_directive="*/*.csv",
        base_directory=base_directory,
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []

    my_batch_definition_list: List[BatchDefinition] = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="my_data_connector",
                data_asset_name="path",
            )
        )
    )
    assert len(my_batch_definition_list) == 2

    my_batch_definition: BatchDefinition = my_batch_definition_list[0]
    batch_spec: BatchSpec = my_data_connector.build_batch_spec(
        batch_definition=my_batch_definition)

    assert isinstance(batch_spec, PathBatchSpec)
    assert batch_spec.path == f"{base_directory_colon}/path/A-100.csv"
def test_spark_with_batch_spec_passthrough(tmp_path_factory, spark_session):
    base_directory: str = str(
        tmp_path_factory.mktemp("basic_spark_datasource_v013_filesystem_data_connector")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test-A.csv",
        ],
    )
    basic_datasource: Datasource = instantiate_class_from_config(
        yaml.load(
            f"""
        class_name: Datasource

        execution_engine:
            class_name: SparkDFExecutionEngine
            spark_config:
                spark.master: local[*]
                spark.executor.memory: 6g
                spark.driver.memory: 6g
                spark.ui.showConsoleProgress: false
                spark.sql.shuffle.partitions: 2
                spark.default.parallelism: 4
        data_connectors:
            simple_filesystem_data_connector:
                class_name: InferredAssetFilesystemDataConnector
                base_directory: {base_directory}
                batch_spec_passthrough:
                    reader_options:
                        header: True
                glob_directive: '*'
                default_regex:
                    pattern: (.+)\\.csv
                    group_names:
                    - data_asset_name
            """,
        ),
        runtime_environment={"name": "my_datasource"},
        config_defaults={"module_name": "great_expectations.datasource"},
    )

    data_connector_name: str = "simple_filesystem_data_connector"
    data_asset_name: str = "test-A"

    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
    }

    batch = basic_datasource.get_batch_list_from_batch_request(
        BatchRequest(**batch_request)
    )
    # check that the batch_spec_passthrough has worked
    assert batch[0].data.dataframe.head() == Row(x="1", y="2")
def test_one_year_as_1_data_asset_12_batches(empty_data_context,
                                             tmp_path_factory):
    context: DataContext = empty_data_context
    base_directory: str = str(tmp_path_factory.mktemp("log_data"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/report_2018-01.csv",
            "some_bucket/report_2018-02.csv",
            "some_bucket/report_2018-03.csv",
            "some_bucket/report_2018-04.csv",
            "some_bucket/report_2018-05.csv",
            "some_bucket/report_2018-06.csv",
            "some_bucket/report_2018-07.csv",
            "some_bucket/report_2018-08.csv",
            "some_bucket/report_2018-09.csv",
            "some_bucket/report_2018-10.csv",
            "some_bucket/report_2018-11.csv",
            "some_bucket/report_2018-12.csv",
        ],
    )
    datasource_yaml: str = f"""
    name: taxi_datasource
    class_name: Datasource
    module_name: great_expectations.datasource
    execution_engine:
      module_name: great_expectations.execution_engine
      class_name: PandasExecutionEngine
    data_connectors:
        default_configured_data_connector_name:
            class_name: ConfiguredAssetFilesystemDataConnector
            base_directory: {base_directory}/some_bucket
            glob_directive: "*.csv"
            assets:
                report_2018:
                    pattern: (.+)_(.+)-(.+)\\.csv
                    group_names:
                        - name
                        - year
                        - month
        """
    context.test_yaml_config(datasource_yaml)
    context.add_datasource(**yaml.load(datasource_yaml))
    datasource: Datasource = context.get_datasource(
        datasource_name="taxi_datasource")
    data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names="default_configured_data_connector_name")
    # making the result deterministic
    data_asset_names["default_configured_data_connector_name"].sort()
    assert data_asset_names == {
        "default_configured_data_connector_name": ["report_2018"]
    }
    assert len(data_asset_names["default_configured_data_connector_name"]) == 1
def test_self_check(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_self_check"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "A-100.csv",
            "A-101.csv",
            "B-1.csv",
            "B-2.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = InferredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": r"(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "number"],
        },
        glob_directive="*",
        base_directory=base_directory,
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    self_check_report_object = my_data_connector.self_check()

    assert self_check_report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        "example_data_reference": {},
    }
def test_datasource_config(empty_data_context):
    temp_dir = str(tempfile.mkdtemp())
    create_files_in_directory(
        directory=temp_dir,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )
    print(temp_dir)

    return_obj = empty_data_context.test_yaml_config(
        yaml_config=f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    my_filesystem_data_connector:
        # class_name: ConfiguredAssetFilesystemDataConnector
        class_name: InferredAssetFilesystemDataConnector
        base_directory: {temp_dir}
        glob_directive: '*.csv'
        default_regex:
            pattern: (.+)_(\\d+)\\.csv
            group_names:
            - letter
            - number
""",
        return_mode="report_object",
    )

    print(json.dumps(return_obj, indent=2))

    assert set(return_obj.keys()) == {"execution_engine", "data_connectors"}
    sub_obj = return_obj["data_connectors"]["my_filesystem_data_connector"]
def test_basic_instantiation(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_basic_instantiation"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "path/A-100.csv",
            "path/A-101.csv",
            "directory/B-1.csv",
            "directory/B-2.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex={
                "pattern": r"(.+)/(.+)-(\d+)\.csv",
                "group_names": ["data_asset_name", "letter", "number"],
            },
            glob_directive="*/*.csv",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []

    # Illegal execution environment name
    with pytest.raises(ValueError):
        print(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="something",
                    data_connector_name="my_data_connector",
                    data_asset_name="something",
                )
            )
        )
def basic_files_dataconnector_yaml(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("get_previous_partition"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "my_asset/AAA.csv",
            "my_asset/BBB.csv",
            "my_asset/CCC.csv",
        ],
    )

    # The files above all belong to a single data asset, `my_asset`,
    # which has 3 partitions: AAA, BBB, CCC.

    # <WILL> this is configured in an unusual way: no assets are declared
    # explicitly, so data asset names are inferred from the regex below.

    return (
        base_directory,
        f"""
        class_name: InferredAssetFilesystemDataConnector
        base_directory: {base_directory}
        glob_directive: "*/*.csv"
        datasource_name: general_data_source
        default_regex:
          pattern: .*\\/(my_asset)\\/(.*).csv
          group_names:
            - data_asset_name
            - name
        sorters:
          - name: name
            class_name: LexicographicSorter
            orderby: desc
       """,
    )
def test_relative_default_and_relative_asset_base_directory_paths(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_relative_default_and_relative_asset_base_directory_paths"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_0/A/B/C/logfile_0.csv",
            "test_dir_0/A/B/C/bigfile_1.csv",
            "test_dir_0/A/filename2.csv",
            "test_dir_0/A/filename3.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            module_name: great_expectations.datasource.data_connector
            class_name: ConfiguredAssetFilesystemDataConnector
            base_directory: test_dir_0/A
            glob_directive: "*"
            default_regex:
              pattern: (.+)\\.csv
              group_names:
              - name

            assets:
              A:
                base_directory: B/C
                glob_directive: "log*.csv"
                pattern: (.+)_(\\d+)\\.csv
                group_names:
                - name
                - number
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "my_configured_asset_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )
    my_data_connector.data_context_root_directory = base_directory

    assert my_data_connector.base_directory == f"{base_directory}/test_dir_0/A"
    assert (
        my_data_connector._get_full_file_path_for_asset(
            path="bigfile_1.csv", asset=my_data_connector.assets["A"]
        )
        == f"{base_directory}/test_dir_0/A/B/C/bigfile_1.csv"
    )
    self_check_report = my_data_connector.self_check()
    assert self_check_report == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["A"],
        "data_assets": {
            "A": {
                "batch_definition_count": 1,
                "example_data_references": ["logfile_0.csv"],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition
    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="my_configured_asset_filesystem_data_connector",
        data_asset_name="A",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
def test_foxtrot(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_foxtrot"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_foxtrot/A/A-1.csv",
            "test_dir_foxtrot/A/A-2.csv",
            "test_dir_foxtrot/A/A-3.csv",
            "test_dir_foxtrot/B/B-1.txt",
            "test_dir_foxtrot/B/B-2.txt",
            "test_dir_foxtrot/B/B-3.txt",
            "test_dir_foxtrot/C/C-2017.csv",
            "test_dir_foxtrot/C/C-2018.csv",
            "test_dir_foxtrot/C/C-2019.csv",
            "test_dir_foxtrot/D/D-aaa.csv",
            "test_dir_foxtrot/D/D-bbb.csv",
            "test_dir_foxtrot/D/D-ccc.csv",
            "test_dir_foxtrot/D/D-ddd.csv",
            "test_dir_foxtrot/D/D-eee.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            module_name: great_expectations.datasource.data_connector
            class_name: ConfiguredAssetFilesystemDataConnector
            base_directory: {base_directory}/test_dir_foxtrot
            assets:
              A:
                base_directory: A/
              B:
                base_directory: B/
                pattern: (.*)-(.*)\\.txt
                group_names:
                - part_1
                - part_2
              C:
                glob_directive: "*"
                base_directory: C/
              D:
                glob_directive: "*"
                base_directory: D/
            default_regex:
                pattern: (.*)-(.*)\\.csv
                group_names:
                - part_1
                - part_2
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )
    self_check_report = my_data_connector.self_check()
    assert self_check_report == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 4,
        "example_data_asset_names": ["A", "B", "C"],
        "data_assets": {
            "A": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "A-1.csv",
                    "A-2.csv",
                    "A-3.csv",
                ],
            },
            "B": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "B-1.txt",
                    "B-2.txt",
                    "B-3.txt",
                ],
            },
            "C": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "C-2017.csv",
                    "C-2018.csv",
                    "C-2019.csv",
                ],
            },
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {},
    }
    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition
    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 3
def test_alpha(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_alpha"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_alpha/A.csv",
            "test_dir_alpha/B.csv",
            "test_dir_alpha/C.csv",
            "test_dir_alpha/D.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
                module_name: great_expectations.datasource.data_connector
                class_name: ConfiguredAssetFilesystemDataConnector
                base_directory: {base_directory}/test_dir_alpha
                assets:
                  A:
                    glob_directive: "*.csv"
                default_regex:
                    pattern: (.+)\\.csv
                    group_names:
                    - part_1
            """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )
    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="B",
        partition_request=None,
    )

    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        partition_request=PartitionRequest(**{"batch_identifiers": {"part_1": "B"}}),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
def test_return_all_batch_definitions_sorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_sorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
        class_name: ConfiguredAssetFilesystemDataConnector
        datasource_name: test_environment
        #execution_engine:
        #    class_name: PandasExecutionEngine
        base_directory: {base_directory}
        glob_directive: "*.csv"
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_(.+)_(.+)\\.csv
            group_names:
                - name
                - timestamp
                - price
        sorters:
            - orderby: asc
              class_name: LexicographicSorter
              name: name
            - datetime_format: "%Y%m%d"
              orderby: desc
              class_name: DateTimeSorter
              name: timestamp
            - orderby: desc
              class_name: NumericSorter
              name: price

    """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "batch_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )

    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition(
            **{
                "name": "james",
                "timestamp": "20200713",
                "price": "1567",
            }
        ),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without partition request, should return all 10
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    # should return 10
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10
def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            class_name: ConfiguredAssetFilesystemDataConnector
            datasource_name: test_environment
            #execution_engine:
            #    class_name: PandasExecutionEngine
            base_directory: {base_directory}
            glob_directive: "*.csv"
            assets:
                TestFiles:
            default_regex:
                pattern: (.+)_(.+)_(.+)\\.csv
                group_names:
                    - name
                    - timestamp
                    - price
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )

    # with unnamed data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )
    )
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list
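# A minimal sketch (not part of the fixture above) of how the partition values in
# "expected" are derived from file names via regex groups. The file name
# "abe_20200809_1040.csv" and the pattern below are illustrative assumptions about
# the fixture's data, not its verified configuration.
import re

_example_pattern = r"(?P<name>.+)_(?P<timestamp>\d+)_(?P<price>\d+)\.csv"
_example_match = re.match(_example_pattern, "abe_20200809_1040.csv")
assert _example_match is not None
assert _example_match.groupdict() == {
    "name": "abe",
    "timestamp": "20200809",
    "price": "1040",
}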
def test_basic_instantiation_with_nested_directories(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_basic_instantiation_with_nested_directories")
    )
    os.makedirs(os.path.join(base_directory, "foo"))
    create_files_in_directory(
        directory=os.path.join(base_directory, "foo"),
        file_name_list=[
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
    )

    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        base_directory=os.path.join(base_directory, "foo"),
        assets={"alpha": {}},
    )

    assert my_data_connector.self_check() == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "alpha-1.csv",
                    "alpha-2.csv",
                    "alpha-3.csv",
                ],
                "batch_definition_count": 3,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        "example_data_reference": {},
    }

    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        base_directory=base_directory,
        assets={"alpha": {"base_directory": "foo"}},
    )

    assert my_data_connector.self_check() == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "alpha-1.csv",
                    "alpha-2.csv",
                    "alpha-3.csv",
                ],
                "batch_definition_count": 3,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        "example_data_reference": {},
    }

    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "foo/alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        base_directory=base_directory,
        assets={"alpha": {}},
    )

    assert my_data_connector.self_check() == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "foo/alpha-1.csv",
                    "foo/alpha-2.csv",
                    "foo/alpha-3.csv",
                ],
                "batch_definition_count": 3,
            },
        },
        "example_unmatched_data_references": ["foo"],
        "unmatched_data_reference_count": 1,
        "example_data_reference": {},
    }
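# A hedged sketch of the create_files_in_directory helper used throughout these
# tests, assuming it only needs to create small placeholder files at the given
# relative paths; the real helper may also accept custom file contents.
import os
from typing import List

def create_files_in_directory_sketch(directory: str, file_name_list: List[str]) -> None:
    for file_name in file_name_list:
        file_path = os.path.join(directory, file_name)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "w") as file_handle:
            file_handle.write("x,y\n1,2\n")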
def test_instantiation_from_a_config(mock_emit,
                                     empty_data_context_stats_enabled,
                                     tmp_path_factory):
    context: DataContext = empty_data_context_stats_enabled

    base_directory = str(
        tmp_path_factory.mktemp("test_instantiation_from_a_config"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE
name: TEST_DATA_CONNECTOR

base_directory: {base_directory}
# glob_directive: "*.csv"

default_regex:
    pattern: alpha-(.*)\\.csv
    group_names:
        - index

assets:
    alpha:
    """,
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "alpha-1.csv",
                    "alpha-2.csv",
                    "alpha-3.csv",
                ],
                "batch_definition_count":
                3,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
    assert mock_emit.call_count == 1
    # Substitute current anonymized name since it changes for each run
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"]
    expected_call_args_list = [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "anonymized_name": anonymized_name,
                "parent_class": "ConfiguredAssetFilesystemDataConnector",
            },
            "success": True,
        }),
    ]
    assert mock_emit.call_args_list == expected_call_args_list
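# A hedged sketch of how the mock_emit fixture asserted on above is typically
# wired: the usage-statistics emit method is patched with unittest.mock so the
# test can inspect call_count and call_args_list. The patch target string is an
# assumption about the package layout, not a verified import path.
from unittest import mock

with mock.patch(
    "great_expectations.core.usage_statistics.usage_statistics.UsageStatisticsHandler.emit"
) as mock_emit_sketch:
    mock_emit_sketch({"event": "data_context.test_yaml_config", "success": True})
    assert mock_emit_sketch.call_count == 1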
def test_redundant_information_in_naming_convention_bucket_sorted(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/2021/01/01/log_file-20210101.txt.gz",
            "some_bucket/2021/01/02/log_file-20210102.txt.gz",
            "some_bucket/2021/01/03/log_file-20210103.txt.gz",
            "some_bucket/2021/01/04/log_file-20210104.txt.gz",
            "some_bucket/2021/01/05/log_file-20210105.txt.gz",
            "some_bucket/2021/01/06/log_file-20210106.txt.gz",
            "some_bucket/2021/01/07/log_file-20210107.txt.gz",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
          module_name: great_expectations.datasource.data_connector
          class_name: InferredAssetFilesystemDataConnector
          datasource_name: test_environment
          name: my_inferred_asset_filesystem_data_connector
          base_directory: {base_directory}/
          glob_directive: "*/*/*/*/*.txt.gz"
          default_regex:
              group_names:
                  - data_asset_name
                  - year
                  - month
                  - day
                  - full_date
              pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
          sorters:
              - orderby: desc
                class_name: DateTimeSorter
                name: full_date

          """,
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "my_inferred_asset_filesystem_data_connector",
                "datasource_name": "test_environment",
                "execution_engine": "BASE_ENGINE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="my_inferred_asset_filesystem_data_connector",
                data_asset_name="some_bucket",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "07", "full_date": "20210107"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "06", "full_date": "20210106"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "05", "full_date": "20210105"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "04", "full_date": "20210104"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "03", "full_date": "20210103"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "02", "full_date": "20210102"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "01", "full_date": "20210101"}
            ),
        ),
    ]
    assert expected == sorted_batch_definition_list
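# A plain-Python illustration of what the DateTimeSorter configuration above
# accomplishes: batch definitions are ordered by the parsed "full_date"
# identifier in descending order. This is a sketch of the ordering only, not the
# sorter's actual implementation.
from datetime import datetime

_identifiers = [{"full_date": "20210103"}, {"full_date": "20210107"}, {"full_date": "20210101"}]
_sorted_desc = sorted(
    _identifiers,
    key=lambda ids: datetime.strptime(ids["full_date"], "%Y%m%d"),
    reverse=True,
)
assert [ids["full_date"] for ids in _sorted_desc] == ["20210107", "20210103", "20210101"]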
def test_nested_directory_data_asset_name_in_folder(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context

    base_directory = str(
        tmp_path_factory.mktemp("test_nested_directory_data_asset_name_in_folder")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "A/A-1.csv",
            "A/A-2.csv",
            "A/A-3.csv",
            "B/B-1.csv",
            "B/B-2.csv",
            "B/B-3.csv",
            "C/C-1.csv",
            "C/C-2.csv",
            "C/C-3.csv",
            "D/D-1.csv",
            "D/D-2.csv",
            "D/D-3.csv",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
    module_name: great_expectations.datasource.data_connector
    class_name: InferredAssetFilesystemDataConnector
    datasource_name: FAKE_DATASOURCE
    name: TEST_DATA_CONNECTOR
    base_directory: {base_directory}/
    glob_directive: "*/*.csv"
    default_regex:
        group_names:
            - data_asset_name
            - letter
            - number
        pattern: (\\w{{1}})\\/(\\w{{1}})-(\\d{{1}})\\.csv
        """,
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 4,
        "example_data_asset_names": ["A", "B", "C"],
        "data_assets": {
            "A": {
                "batch_definition_count": 3,
                "example_data_references": ["A/A-1.csv", "A/A-2.csv", "A/A-3.csv"],
            },
            "B": {
                "batch_definition_count": 3,
                "example_data_references": ["B/B-1.csv", "B/B-2.csv", "B/B-3.csv"],
            },
            "C": {
                "batch_definition_count": 3,
                "example_data_references": ["C/C-1.csv", "C/C-2.csv", "C/C-3.csv"],
            },
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
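# A small illustration (separate from the test above) of how the default_regex in
# the YAML maps a data reference such as "A/A-1.csv" onto the configured group
# names, with the first group becoming the inferred data_asset_name.
import re

_inferred_pattern = r"(\w{1})\/(\w{1})-(\d{1})\.csv"
_inferred_match = re.match(_inferred_pattern, "A/A-1.csv")
assert _inferred_match is not None
assert dict(zip(["data_asset_name", "letter", "number"], _inferred_match.groups())) == {
    "data_asset_name": "A",
    "letter": "A",
    "number": "1",
}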
def test_get_batch_definitions_and_get_batch_basics(basic_pandas_datasource_v013):
    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        basic_pandas_datasource_v013.data_connectors["my_filesystem_data_connector"]
    )
    create_files_in_directory(
        my_data_connector.base_directory,
        ["A_1.csv", "A_2.csv", "A_3.csv", "B_1.csv", "B_2.csv", "B_3.csv"],
    )

    assert (
        len(
            basic_pandas_datasource_v013.get_available_batch_definitions(
                batch_request=BatchRequest(
                    datasource_name="my_datasource",
                    data_connector_name="my_filesystem_data_connector",
                    data_asset_name="Titanic",
                )
            )
        )
        == 6
    )

    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            partition_definition=PartitionDefinition(
                {
                    "letter": "B",
                    "number": "1",
                }
            ),
        )
    )

    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.batch_definition == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="B1",
        partition_definition=PartitionDefinition(
            {
                "letter": "B",
                "number": "1",
            }
        ),
    )

    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            partition_request={
                "batch_identifiers": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 0

    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="Titanic",
            partition_request={
                "batch_identifiers": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0].data.dataframe, pd.DataFrame)

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            partition_definition=PartitionDefinition({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
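# A hedged, plain-Python sketch of what the batch_identifiers filter in the
# partition_request above effectively does: only batch definitions whose
# identifiers contain every requested key/value pair are returned, which is why
# requesting letter "B", number "1" under the "Titanic" asset yields one batch.
_candidate_identifiers = [
    {"letter": "A", "number": "1"},
    {"letter": "B", "number": "1"},
    {"letter": "B", "number": "3"},
]
_requested_identifiers = {"letter": "B", "number": "1"}
_matching = [
    candidate
    for candidate in _candidate_identifiers
    if all(candidate.get(key) == value for key, value in _requested_identifiers.items())
]
assert _matching == [{"letter": "B", "number": "1"}]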
def test_example_with_explicit_data_asset_names(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_example_with_explicit_data_asset_names")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "my_base_directory/alpha/files/go/here/alpha-202001.csv",
            "my_base_directory/alpha/files/go/here/alpha-202002.csv",
            "my_base_directory/alpha/files/go/here/alpha-202003.csv",
            "my_base_directory/beta_here/beta-202001.txt",
            "my_base_directory/beta_here/beta-202002.txt",
            "my_base_directory/beta_here/beta-202003.txt",
            "my_base_directory/beta_here/beta-202004.txt",
            "my_base_directory/gamma-202001.csv",
            "my_base_directory/gamma-202002.csv",
            "my_base_directory/gamma-202003.csv",
            "my_base_directory/gamma-202004.csv",
            "my_base_directory/gamma-202005.csv",
        ],
    )
    yaml_string = f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE_NAME
base_directory: {base_directory}/my_base_directory/
default_regex:
    pattern: ^(.+)-(\\d{{4}})(\\d{{2}})\\.(csv|txt)$
    group_names:
        - data_asset_name
        - year_dir
        - month_dir
assets:
    alpha:
        base_directory: {base_directory}/my_base_directory/alpha/files/go/here/
        glob_directive: "*.csv"
    beta:
        base_directory: {base_directory}/my_base_directory/beta_here/
        glob_directive: "*.txt"
    gamma:
        glob_directive: "*.csv"

    """
    config = yaml.load(yaml_string)
    my_data_connector = instantiate_class_from_config(
        config,
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
        runtime_environment={"name": "my_data_connector"},
    )
    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert len(my_data_connector.get_unmatched_data_references()) == 0

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="gamma",
                )
            )
        )
        == 5
    )
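# A minimal sketch, under assumed semantics, of how the per-asset overrides in
# the YAML above resolve against connector-level defaults: an asset uses its own
# base_directory and glob_directive when provided, and otherwise falls back to
# the connector's values (as the "gamma" asset does for base_directory).
from typing import Dict, Optional

def resolve_asset_paths_sketch(
    connector_base_directory: str,
    connector_glob_directive: str,
    asset_config: Optional[Dict[str, str]],
) -> Dict[str, str]:
    asset_config = asset_config or {}
    return {
        "base_directory": asset_config.get("base_directory", connector_base_directory),
        "glob_directive": asset_config.get("glob_directive", connector_glob_directive),
    }

assert resolve_asset_paths_sketch("/data/my_base_directory", "*", {"glob_directive": "*.csv"}) == {
    "base_directory": "/data/my_base_directory",
    "glob_directive": "*.csv",
}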