def test__file_object_caching_for_FileDataConnector(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("basic_data_connector__filesystem_data_connector")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "pretend/path/A-100.csv",
            "pretend/path/A-101.csv",
            "pretend/directory/B-1.csv",
            "pretend/directory/B-2.csv",
        ],
    )

    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=PandasExecutionEngine(),
        base_directory=base_directory,
        glob_directive="*/*/*.csv",
        default_regex={
            "pattern": "(.*).csv",
            "group_names": ["name"],
        },
        assets={"stuff": {}},
    )

    assert my_data_connector.get_data_reference_list_count() == 0
    assert len(my_data_connector.get_unmatched_data_references()) == 0

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert len(my_data_connector.get_unmatched_data_references()) == 0
    assert my_data_connector.get_data_reference_list_count() == 4

def test_reader_fn_parameters(
    spark_session, basic_spark_df_execution_engine, tmp_path_factory
):
    base_directory = str(tmp_path_factory.mktemp("test_csv"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test-A.csv",
        ],
    )
    test_df_small_csv_path = base_directory + "/test-A.csv"

    engine = basic_spark_df_execution_engine
    fn = engine._get_reader_fn(reader=spark_session.read, path=test_df_small_csv_path)
    assert "<bound method DataFrameReader.csv" in str(fn)

    test_sparkdf_with_header_param = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(
            path=test_df_small_csv_path,
            data_asset_name="DATA_ASSET",
            reader_options={"header": True},
        )
    ).dataframe
    assert test_sparkdf_with_header_param.head() == Row(x="1", y="2")

    test_sparkdf_with_no_header_param = basic_spark_df_execution_engine.get_batch_data(
        PathBatchSpec(path=test_df_small_csv_path, data_asset_name="DATA_ASSET")
    ).dataframe
    assert test_sparkdf_with_no_header_param.head() == Row(_c0="x", _c1="y")

def test_simple_regex_example_with_implicit_data_asset_names_self_check(
    tmp_path_factory,
):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_simple_regex_example_with_implicit_data_asset_names"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "A-100.csv",
            "A-101.csv",
            "B-1.csv",
            "B-2.csv",
            "CCC.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex={
                "pattern": r"(.+)-(\d+)\.csv",
                "group_names": [
                    "data_asset_name",
                    "number",
                ],
            },
            glob_directive="*",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    self_check_report_object = my_data_connector.self_check()

    assert self_check_report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": ["CCC.csv"],
        "unmatched_data_reference_count": 1,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

def test_return_all_batch_definitions_too_many_sorters(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_too_many_sorters")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: test_environment
#execution_engine:
#    class_name: PandasExecutionEngine
base_directory: {base_directory}
glob_directive: "*.csv"
assets:
    TestFiles:
default_regex:
    pattern: (.+)_.+_.+\\.csv
    group_names:
        - name
sorters:
    - orderby: asc
      class_name: LexicographicSorter
      name: name
    - datetime_format: "%Y%m%d"
      orderby: desc
      class_name: DateTimeSorter
      name: timestamp
    - orderby: desc
      class_name: NumericSorter
      name: price
""",
    )

    # Three sorters are configured, but the regex captures only one group
    # ("name"), so instantiation must fail.
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        my_data_connector: ConfiguredAssetFilesystemDataConnector = (
            instantiate_class_from_config(
                config=my_data_connector_yaml,
                runtime_environment={
                    "name": "general_filesystem_data_connector",
                    "datasource_name": "test_environment",
                },
                config_defaults={
                    "module_name": "great_expectations.datasource.data_connector"
                },
            )
        )

def test_redundant_information_in_naming_convention_bucket(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context

    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/2021/01/01/log_file-20210101.txt.gz",
            "some_bucket/2021/01/02/log_file-20210102.txt.gz",
            "some_bucket/2021/01/03/log_file-20210103.txt.gz",
            "some_bucket/2021/01/04/log_file-20210104.txt.gz",
            "some_bucket/2021/01/05/log_file-20210105.txt.gz",
            "some_bucket/2021/01/06/log_file-20210106.txt.gz",
            "some_bucket/2021/01/07/log_file-20210107.txt.gz",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: InferredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE
name: TEST_DATA_CONNECTOR
base_directory: {base_directory}/
glob_directive: "*/*/*/*/*.txt.gz"
default_regex:
    group_names:
        - data_asset_name
        - year
        - month
        - day
    pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-.*\\.txt\\.gz
""",
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["some_bucket"],
        "data_assets": {
            "some_bucket": {
                "batch_definition_count": 7,
                "example_data_references": [
                    "some_bucket/2021/01/01/log_file-*.txt.gz",
                    "some_bucket/2021/01/02/log_file-*.txt.gz",
                    "some_bucket/2021/01/03/log_file-*.txt.gz",
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

def test_basic_instantiation(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_basic_instantiation"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
    )

    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        base_directory=base_directory,
        assets={"alpha": {}},
    )

    assert my_data_connector.self_check() == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "alpha-1.csv",
                    "alpha-2.csv",
                    "alpha-3.csv",
                ],
                "batch_definition_count": 3,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()
    assert my_data_connector.get_data_reference_list_count() == 3
    assert my_data_connector.get_unmatched_data_references() == []

    # Illegal execution environment name
    with pytest.raises(ValueError):
        print(
            my_data_connector.get_batch_definition_list_from_batch_request(
                BatchRequest(
                    datasource_name="something",
                    data_connector_name="my_data_connector",
                    data_asset_name="something",
                )
            )
        )

def test_for_self_check_using_InferredAssetFilesystemDataConnector_SparkDFExecutionEngine(
    spark_session, tmp_path_factory
):
    base_directory = str(
        tmp_path_factory.mktemp(
            "basic_data_connector_inferred_asset_filesystem_data_connector"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20201010_1000.csv",
            "abe_202011111_2000.csv",
            "will_20201212_3000.csv",
        ],
    )

    my_data_connector = InferredAssetFilesystemDataConnector(
        name="my_data_connector",
        base_directory=base_directory,
        glob_directive="*.csv",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=SparkDFExecutionEngine(),
        default_regex={
            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
            "group_names": ["data_asset_name", "timestamp", "size"],
        },
    )

    self_check_results = my_data_connector.self_check()
    assert self_check_results["data_asset_count"] == 3
    assert self_check_results["example_data_reference"]["n_rows"] == 3

def test_redundant_information_in_naming_convention_bucket_too_many_sorters(
    tmp_path_factory,
):
    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/2021/01/01/log_file-20210101.txt.gz",
            "some_bucket/2021/01/02/log_file-20210102.txt.gz",
            "some_bucket/2021/01/03/log_file-20210103.txt.gz",
            "some_bucket/2021/01/04/log_file-20210104.txt.gz",
            "some_bucket/2021/01/05/log_file-20210105.txt.gz",
            "some_bucket/2021/01/06/log_file-20210106.txt.gz",
            "some_bucket/2021/01/07/log_file-20210107.txt.gz",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: InferredAssetFilesystemDataConnector
datasource_name: test_environment
name: my_inferred_asset_filesystem_data_connector
base_directory: {base_directory}/
glob_directive: "*/*/*/*/*.txt.gz"
default_regex:
    group_names:
        - data_asset_name
        - year
        - month
        - day
        - full_date
    pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
sorters:
    - datetime_format: "%Y%m%d"
      orderby: desc
      class_name: DateTimeSorter
      name: timestamp
    - orderby: desc
      class_name: NumericSorter
      name: price
""",
    )

    # The sorters reference names ("timestamp", "price") that do not appear
    # among the regex group names, so instantiation must fail.
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        my_data_connector: InferredAssetFilesystemDataConnector = (
            instantiate_class_from_config(
                config=my_data_connector_yaml,
                runtime_environment={
                    "name": "my_inferred_asset_filesystem_data_connector",
                    "datasource_name": "test_environment",
                    "execution_engine": "BASE_ENGINE",
                },
                config_defaults={
                    "module_name": "great_expectations.datasource.data_connector"
                },
            )
        )

def test_redundant_information_in_naming_convention_timestamp(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context

    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "log_file-2021-01-01-035419.163324.txt.gz",
            "log_file-2021-01-02-035513.905752.txt.gz",
            "log_file-2021-01-03-035455.848839.txt.gz",
            "log_file-2021-01-04-035251.47582.txt.gz",
            "log_file-2021-01-05-033034.289789.txt.gz",
            "log_file-2021-01-06-034958.505688.txt.gz",
            "log_file-2021-01-07-033545.600898.txt.gz",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: InferredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE
name: TEST_DATA_CONNECTOR
base_directory: {base_directory}/
glob_directive: "*.txt.gz"
default_regex:
    group_names:
        - data_asset_name
        - year
        - month
        - day
    pattern: (log_file)-(\\d{{4}})-(\\d{{2}})-(\\d{{2}})-.*\\.*\\.txt\\.gz
""",
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["log_file"],
        "data_assets": {
            "log_file": {
                "batch_definition_count": 7,
                "example_data_references": [
                    "log_file-2021-01-01-*.txt.gz",
                    "log_file-2021-01-02-*.txt.gz",
                    "log_file-2021-01-03-*.txt.gz",
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

def sample_datasource_v013_with_single_partition_file_data_connector(
    tmp_path_factory,
):
    base_directory: str = str(
        tmp_path_factory.mktemp(
            "basic_pandas_datasource_v013_single_partition_filesystem_data_connector"
        )
    )

    sample_datasource: Datasource = instantiate_class_from_config(
        yaml.load(
            f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    test_runtime_data_connector:
        module_name: great_expectations.datasource.data_connector
        class_name: RuntimeDataConnector
        batch_identifiers:
            - pipeline_stage_name
            - airflow_run_id

    my_filesystem_data_connector:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: {base_directory}
        # TODO: <Alex>Investigate: this potentially breaks the data_reference centric design.</Alex>
        glob_directive: "*.csv"
        # glob_directive: "*"

        default_regex:
            # TODO: <Alex>Investigate: this potentially breaks the data_reference centric design.</Alex>
            pattern: (.+)_(\\d+)\\.csv
            # pattern: (.+)_(\\d+)\\.[a-z][a-z][a-z]
            group_names:
                - letter
                - number
""",
        ),
        runtime_environment={"name": "my_datasource"},
        config_defaults={"module_name": "great_expectations.datasource"},
    )

    sample_file_names: List[str] = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]
    create_files_in_directory(
        directory=base_directory, file_name_list=sample_file_names
    )
    return sample_datasource

def test_instantiation_from_a_config_regex_does_not_match_paths(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context

    base_directory = str(tmp_path_factory.mktemp("test_test_yaml_config"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE
name: TEST_DATA_CONNECTOR

base_directory: {base_directory}
# glob_directive: "*.csv"

default_regex:
    pattern: beta-(.*)\\.csv
    group_names:
        - index

assets:
    alpha:
""",
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {"example_data_references": [], "batch_definition_count": 0},
        },
        "example_unmatched_data_references": [
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
        "unmatched_data_reference_count": 3,
        "example_data_reference": {},
    }

def test_redundant_information_in_naming_convention_random_hash(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context

    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "2021/01/01/log_file-2f1e94b40f310274b485e72050daf591.txt.gz",
            "2021/01/02/log_file-7f5d35d4f90bce5bf1fad680daac48a2.txt.gz",
            "2021/01/03/log_file-99d5ed1123f877c714bbe9a2cfdffc4b.txt.gz",
            "2021/01/04/log_file-885d40a5661bbbea053b2405face042f.txt.gz",
            "2021/01/05/log_file-d8e478f817b608729cfc8fb750ebfc84.txt.gz",
            "2021/01/06/log_file-b1ca8d1079c00fd4e210f7ef31549162.txt.gz",
            "2021/01/07/log_file-d34b4818c52e74b7827504920af19a5c.txt.gz",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: InferredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE
name: TEST_DATA_CONNECTOR
base_directory: {base_directory}/
glob_directive: "*/*/*/*.txt.gz"
default_regex:
    group_names:
        - year
        - month
        - day
        - data_asset_name
    pattern: (\\d{{4}})/(\\d{{2}})/(\\d{{2}})/(log_file)-.*\\.txt\\.gz
""",
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["log_file"],
        "data_assets": {
            "log_file": {
                "batch_definition_count": 7,
                "example_data_references": [
                    "2021/01/01/log_file-*.txt.gz",
                    "2021/01/02/log_file-*.txt.gz",
                    "2021/01/03/log_file-*.txt.gz",
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

def test__get_full_file_path_spark(basic_spark_df_execution_engine, fs):
    """
    What does this test and why?

    File paths in DBFS need to use the `dbfs:/` protocol base instead of
    `/dbfs/` when being read using the `spark.read` method in the
    ExecutionEngine. In the data connector config however, the `/dbfs` version
    must be used. This test verifies that a config using a `/dbfs/` path is
    translated to `dbfs:/` when preparing the PathBatchSpec for the
    SparkDFExecutionEngine.
    """

    base_directory: str = "/dbfs/great_expectations"
    base_directory_colon: str = "dbfs:/great_expectations"
    fs.create_dir(base_directory)

    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "path/A-100.csv",
            "path/A-101.csv",
            "directory/B-1.csv",
            "directory/B-2.csv",
        ],
    )

    my_data_connector: InferredAssetDBFSDataConnector = InferredAssetDBFSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=basic_spark_df_execution_engine,
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        glob_directive="*/*.csv",
        base_directory=base_directory,
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()
    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []

    my_batch_definition_list: List[
        BatchDefinition
    ] = my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="path",
        )
    )
    assert len(my_batch_definition_list) == 2

    my_batch_definition: BatchDefinition = my_batch_definition_list[0]
    batch_spec: BatchSpec = my_data_connector.build_batch_spec(
        batch_definition=my_batch_definition
    )
    assert isinstance(batch_spec, PathBatchSpec)
    assert batch_spec.path == f"{base_directory_colon}/path/A-100.csv"

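# Illustrative sketch (not the library's implementation): the docstring above
# describes mapping a DBFS mount path ("/dbfs/...") to the URI form
# ("dbfs:/...") that `spark.read` expects. A hypothetical stand-alone version
# of that mapping, assuming only the leading "/dbfs/" prefix needs rewriting:
def _example_dbfs_path_to_uri(path: str) -> str:
    # Swap the "/dbfs/" filesystem mount prefix for the "dbfs:/" protocol base.
    if path.startswith("/dbfs/"):
        return "dbfs:/" + path[len("/dbfs/") :]
    return path


def test__example_dbfs_path_to_uri_sketch():
    # Mirrors the expectation asserted against build_batch_spec above.
    assert (
        _example_dbfs_path_to_uri("/dbfs/great_expectations/path/A-100.csv")
        == "dbfs:/great_expectations/path/A-100.csv"
    )
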
def test_spark_with_batch_spec_passthrough(tmp_path_factory, spark_session):
    base_directory: str = str(
        tmp_path_factory.mktemp("basic_spark_datasource_v013_filesystem_data_connector")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test-A.csv",
        ],
    )

    basic_datasource: Datasource = instantiate_class_from_config(
        yaml.load(
            f"""
class_name: Datasource

execution_engine:
    class_name: SparkDFExecutionEngine
    spark_config:
        spark.master: local[*]
        spark.executor.memory: 6g
        spark.driver.memory: 6g
        spark.ui.showConsoleProgress: false
        spark.sql.shuffle.partitions: 2
        spark.default.parallelism: 4

data_connectors:
    simple_filesystem_data_connector:
        class_name: InferredAssetFilesystemDataConnector
        base_directory: {base_directory}
        batch_spec_passthrough:
            reader_options:
                header: True
        glob_directive: '*'
        default_regex:
            pattern: (.+)\\.csv
            group_names:
                - data_asset_name
""",
        ),
        runtime_environment={"name": "my_datasource"},
        config_defaults={"module_name": "great_expectations.datasource"},
    )

    data_connector_name: str = "simple_filesystem_data_connector"
    data_asset_name: str = "test-A"

    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
    }
    batch = basic_datasource.get_batch_list_from_batch_request(
        BatchRequest(**batch_request)
    )

    # check that the batch_spec_passthrough has worked
    assert batch[0].data.dataframe.head() == Row(x="1", y="2")

def test_one_year_as_1_data_asset_12_batches(empty_data_context, tmp_path_factory):
    context: DataContext = empty_data_context

    base_directory: str = str(tmp_path_factory.mktemp("log_data"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/report_2018-01.csv",
            "some_bucket/report_2018-02.csv",
            "some_bucket/report_2018-03.csv",
            "some_bucket/report_2018-04.csv",
            "some_bucket/report_2018-05.csv",
            "some_bucket/report_2018-06.csv",
            "some_bucket/report_2018-07.csv",
            "some_bucket/report_2018-08.csv",
            "some_bucket/report_2018-09.csv",
            "some_bucket/report_2018-10.csv",
            "some_bucket/report_2018-11.csv",
            "some_bucket/report_2018-12.csv",
        ],
    )

    datasource_yaml: str = f"""
name: taxi_datasource
class_name: Datasource
module_name: great_expectations.datasource
execution_engine:
    module_name: great_expectations.execution_engine
    class_name: PandasExecutionEngine
data_connectors:
    default_configured_data_connector_name:
        class_name: ConfiguredAssetFilesystemDataConnector
        base_directory: {base_directory}/some_bucket
        glob_directive: "*.csv"
        assets:
            report_2018:
                pattern: (.+)_(.+)-(.+)\\.csv
                group_names:
                    - name
                    - year
                    - month
"""

    context.test_yaml_config(datasource_yaml)
    context.add_datasource(**yaml.load(datasource_yaml))
    datasource: Datasource = context.get_datasource(datasource_name="taxi_datasource")

    data_asset_names: dict = datasource.get_available_data_asset_names(
        data_connector_names="default_configured_data_connector_name"
    )
    # making the result deterministic
    data_asset_names["default_configured_data_connector_name"].sort()
    assert data_asset_names == {
        "default_configured_data_connector_name": ["report_2018"]
    }
    assert len(data_asset_names["default_configured_data_connector_name"]) == 1

def test_self_check(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_self_check"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "A-100.csv",
            "A-101.csv",
            "B-1.csv",
            "B-2.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            default_regex={
                "pattern": r"(.+)-(\d+)\.csv",
                "group_names": ["data_asset_name", "number"],
            },
            glob_directive="*",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    self_check_report_object = my_data_connector.self_check()

    assert self_check_report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        "example_data_reference": {},
    }

def test_datasource_config(empty_data_context):
    temp_dir = str(tempfile.mkdtemp())
    create_files_in_directory(
        directory=temp_dir,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )
    print(temp_dir)

    return_obj = empty_data_context.test_yaml_config(
        yaml_config=f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    my_filesystem_data_connector:
        # class_name: ConfiguredAssetFilesystemDataConnector
        class_name: InferredAssetFilesystemDataConnector
        base_directory: {temp_dir}
        glob_directive: '*.csv'

        default_regex:
            pattern: (.+)_(\\d+)\\.csv
            group_names:
                - letter
                - number
""",
        return_mode="report_object",
    )

    print(json.dumps(return_obj, indent=2))

    assert set(return_obj.keys()) == {"execution_engine", "data_connectors"}
    sub_obj = return_obj["data_connectors"]["my_filesystem_data_connector"]

def test_basic_instantiation(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_basic_instantiation"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "path/A-100.csv",
            "path/A-101.csv",
            "directory/B-1.csv",
            "directory/B-2.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex={
                "pattern": r"(.+)/(.+)-(\d+)\.csv",
                "group_names": ["data_asset_name", "letter", "number"],
            },
            glob_directive="*/*.csv",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()
    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []

    # Illegal execution environment name
    with pytest.raises(ValueError):
        print(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="something",
                    data_connector_name="my_data_connector",
                    data_asset_name="something",
                )
            )
        )

def basic_files_dataconnector_yaml(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("get_previous_partition"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "my_asset/AAA.csv",
            "my_asset/BBB.csv",
            "my_asset/CCC.csv",
        ],
    )
    # These files are all part of `my_asset`, which has 3 partitions:
    # AAA, BBB, and CCC.
    #
    # <WILL> this is going to be configured in a weird way:
    # we will ignore data_assets??
    return (
        base_directory,
        f"""
class_name: InferredAssetFilesystemDataConnector
base_directory: {base_directory}
glob_directive: "*/*.csv"
datasource_name: general_data_source
default_regex:
    pattern: .*\\/(my_asset)\\/(.*).csv
    group_names:
        - data_asset_name
        - name
sorters:
    - name: name
      class_name: LexicographicSorter
      orderby: desc
""",
    )

def test_relative_default_and_relative_asset_base_directory_paths(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_relative_default_and_relative_asset_base_directory_paths"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_0/A/B/C/logfile_0.csv",
            "test_dir_0/A/B/C/bigfile_1.csv",
            "test_dir_0/A/filename2.csv",
            "test_dir_0/A/filename3.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetFilesystemDataConnector
base_directory: test_dir_0/A
glob_directive: "*"
default_regex:
    pattern: (.+)\\.csv
    group_names:
        - name
assets:
    A:
        base_directory: B/C
        glob_directive: "log*.csv"
        pattern: (.+)_(\\d+)\\.csv
        group_names:
            - name
            - number
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "my_configured_asset_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )
    my_data_connector.data_context_root_directory = base_directory

    assert my_data_connector.base_directory == f"{base_directory}/test_dir_0/A"
    assert (
        my_data_connector._get_full_file_path_for_asset(
            path="bigfile_1.csv", asset=my_data_connector.assets["A"]
        )
        == f"{base_directory}/test_dir_0/A/B/C/bigfile_1.csv"
    )

    self_check_report = my_data_connector.self_check()

    assert self_check_report == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["A"],
        "data_assets": {
            "A": {
                "batch_definition_count": 1,
                "example_data_references": ["logfile_0.csv"],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition
    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="my_configured_asset_filesystem_data_connector",
        data_asset_name="A",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1

def test_foxtrot(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_foxtrot"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_foxtrot/A/A-1.csv",
            "test_dir_foxtrot/A/A-2.csv",
            "test_dir_foxtrot/A/A-3.csv",
            "test_dir_foxtrot/B/B-1.txt",
            "test_dir_foxtrot/B/B-2.txt",
            "test_dir_foxtrot/B/B-3.txt",
            "test_dir_foxtrot/C/C-2017.csv",
            "test_dir_foxtrot/C/C-2018.csv",
            "test_dir_foxtrot/C/C-2019.csv",
            "test_dir_foxtrot/D/D-aaa.csv",
            "test_dir_foxtrot/D/D-bbb.csv",
            "test_dir_foxtrot/D/D-ccc.csv",
            "test_dir_foxtrot/D/D-ddd.csv",
            "test_dir_foxtrot/D/D-eee.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetFilesystemDataConnector
base_directory: {base_directory}/test_dir_foxtrot
assets:
    A:
        base_directory: A/
    B:
        base_directory: B/
        pattern: (.*)-(.*)\\.txt
        group_names:
            - part_1
            - part_2
    C:
        glob_directive: "*"
        base_directory: C/
    D:
        glob_directive: "*"
        base_directory: D/
default_regex:
    pattern: (.*)-(.*)\\.csv
    group_names:
        - part_1
        - part_2
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()

    assert self_check_report == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 4,
        "example_data_asset_names": ["A", "B", "C"],
        "data_assets": {
            "A": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "A-1.csv",
                    "A-2.csv",
                    "A-3.csv",
                ],
            },
            "B": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "B-1.txt",
                    "B-2.txt",
                    "B-3.txt",
                ],
            },
            "C": {
                "batch_definition_count": 3,
                "example_data_references": [
                    "C-2017.csv",
                    "C-2018.csv",
                    "C-2019.csv",
                ],
            },
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {},
    }

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition
    my_batch_request = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 3

def test_alpha(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_alpha"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_alpha/A.csv",
            "test_dir_alpha/B.csv",
            "test_dir_alpha/C.csv",
            "test_dir_alpha/D.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetFilesystemDataConnector
base_directory: {base_directory}/test_dir_alpha
assets:
    A:
        glob_directive: "*.csv"
default_regex:
    pattern: (.+)\\.csv
    group_names:
        - part_1
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "BASE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="B",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        partition_request=PartitionRequest(**{"batch_identifiers": {"part_1": "B"}}),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1

def test_return_all_batch_definitions_sorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_sorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: test_environment
#execution_engine:
#    class_name: PandasExecutionEngine
base_directory: {base_directory}
glob_directive: "*.csv"
assets:
    TestFiles:
default_regex:
    pattern: (.+)_(.+)_(.+)\\.csv
    group_names:
        - name
        - timestamp
        - price
sorters:
    - orderby: asc
      class_name: LexicographicSorter
      name: name
    - datetime_format: "%Y%m%d"
      orderby: desc
      class_name: DateTimeSorter
      name: timestamp
    - orderby: desc
      class_name: NumericSorter
      name: price
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "batch_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition(
            **{
                "name": "james",
                "timestamp": "20200713",
                "price": "1567",
            }
        ),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without partition request, should return all 10
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    # should return 10
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10

def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: test_environment
#execution_engine:
#    class_name: PandasExecutionEngine
base_directory: {base_directory}
glob_directive: "*.csv"
assets:
    TestFiles:
default_regex:
    pattern: (.+)_(.+)_(.+)\\.csv
    group_names:
        - name
        - timestamp
        - price
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )

    # with unnamed data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )
    )
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list

def test_basic_instantiation_with_nested_directories(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_basic_instantiation_with_nested_directories")
    )
    os.makedirs(os.path.join(base_directory, "foo"))
    create_files_in_directory(
        directory=os.path.join(base_directory, "foo"),
        file_name_list=[
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
    )

    # Configuration 1: point the connector's base_directory directly at the
    # nested "foo" directory.
    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        base_directory=os.path.join(base_directory, "foo"),
        assets={"alpha": {}},
    )

    assert my_data_connector.self_check() == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "alpha-1.csv",
                    "alpha-2.csv",
                    "alpha-3.csv",
                ],
                "batch_definition_count": 3,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        "example_data_reference": {},
    }

    # Configuration 2: keep the connector at base_directory and give the asset
    # its own relative base_directory ("foo").
    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        base_directory=base_directory,
        assets={"alpha": {"base_directory": "foo"}},
    )

    assert my_data_connector.self_check() == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "alpha-1.csv",
                    "alpha-2.csv",
                    "alpha-3.csv",
                ],
                "batch_definition_count": 3,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        "example_data_reference": {},
    }

    # Configuration 3: encode the subdirectory in the regex pattern itself;
    # the bare "foo" directory entry then shows up as an unmatched reference.
    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "foo/alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        base_directory=base_directory,
        assets={"alpha": {}},
    )

    assert my_data_connector.self_check() == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "foo/alpha-1.csv",
                    "foo/alpha-2.csv",
                    "foo/alpha-3.csv",
                ],
                "batch_definition_count": 3,
            },
        },
        "example_unmatched_data_references": ["foo"],
        "unmatched_data_reference_count": 1,
        "example_data_reference": {},
    }

def test_instantiation_from_a_config(
    mock_emit, empty_data_context_stats_enabled, tmp_path_factory
):
    context: DataContext = empty_data_context_stats_enabled

    base_directory = str(tmp_path_factory.mktemp("test_instantiation_from_a_config"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE
name: TEST_DATA_CONNECTOR

base_directory: {base_directory}
# glob_directive: "*.csv"

default_regex:
    pattern: alpha-(.*)\\.csv
    group_names:
        - index

assets:
    alpha:
""",
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "ConfiguredAssetFilesystemDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "alpha-1.csv",
                    "alpha-2.csv",
                    "alpha-3.csv",
                ],
                "batch_definition_count": 3,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

    assert mock_emit.call_count == 1
    # Substitute current anonymized name since it changes for each run
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"
    ]
    expected_call_args_list = [
        mock.call(
            {
                "event": "data_context.test_yaml_config",
                "event_payload": {
                    "anonymized_name": anonymized_name,
                    "parent_class": "ConfiguredAssetFilesystemDataConnector",
                },
                "success": True,
            }
        ),
    ]
    assert mock_emit.call_args_list == expected_call_args_list

def test_redundant_information_in_naming_convention_bucket_sorted(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/2021/01/01/log_file-20210101.txt.gz",
            "some_bucket/2021/01/02/log_file-20210102.txt.gz",
            "some_bucket/2021/01/03/log_file-20210103.txt.gz",
            "some_bucket/2021/01/04/log_file-20210104.txt.gz",
            "some_bucket/2021/01/05/log_file-20210105.txt.gz",
            "some_bucket/2021/01/06/log_file-20210106.txt.gz",
            "some_bucket/2021/01/07/log_file-20210107.txt.gz",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: InferredAssetFilesystemDataConnector
datasource_name: test_environment
name: my_inferred_asset_filesystem_data_connector
base_directory: {base_directory}/
glob_directive: "*/*/*/*/*.txt.gz"
default_regex:
    group_names:
        - data_asset_name
        - year
        - month
        - day
        - full_date
    pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
sorters:
    - orderby: desc
      class_name: DateTimeSorter
      name: full_date
""",
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "my_inferred_asset_filesystem_data_connector",
                "datasource_name": "test_environment",
                "execution_engine": "BASE_ENGINE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="my_inferred_asset_filesystem_data_connector",
                data_asset_name="some_bucket",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "07", "full_date": "20210107"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "06", "full_date": "20210106"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "05", "full_date": "20210105"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "04", "full_date": "20210104"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "03", "full_date": "20210103"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "02", "full_date": "20210102"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            batch_identifiers=IDDict(
                {"year": "2021", "month": "01", "day": "01", "full_date": "20210101"}
            ),
        ),
    ]
    assert expected == sorted_batch_definition_list

def test_nested_directory_data_asset_name_in_folder(
    empty_data_context, tmp_path_factory
):
    context = empty_data_context

    base_directory = str(
        tmp_path_factory.mktemp("test_nested_directory_data_asset_name_in_folder")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "A/A-1.csv",
            "A/A-2.csv",
            "A/A-3.csv",
            "B/B-1.csv",
            "B/B-2.csv",
            "B/B-3.csv",
            "C/C-1.csv",
            "C/C-2.csv",
            "C/C-3.csv",
            "D/D-1.csv",
            "D/D-2.csv",
            "D/D-3.csv",
        ],
    )

    report_object = context.test_yaml_config(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: InferredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE
name: TEST_DATA_CONNECTOR
base_directory: {base_directory}/
glob_directive: "*/*.csv"
default_regex:
    group_names:
        - data_asset_name
        - letter
        - number
    pattern: (\\w{{1}})\\/(\\w{{1}})-(\\d{{1}})\\.csv
""",
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 4,
        "example_data_asset_names": ["A", "B", "C"],
        "data_assets": {
            "A": {
                "batch_definition_count": 3,
                "example_data_references": ["A/A-1.csv", "A/A-2.csv", "A/A-3.csv"],
            },
            "B": {
                "batch_definition_count": 3,
                "example_data_references": ["B/B-1.csv", "B/B-2.csv", "B/B-3.csv"],
            },
            "C": {
                "batch_definition_count": 3,
                "example_data_references": ["C/C-1.csv", "C/C-2.csv", "C/C-3.csv"],
            },
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

def test_get_batch_definitions_and_get_batch_basics(basic_pandas_datasource_v013):
    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        basic_pandas_datasource_v013.data_connectors["my_filesystem_data_connector"]
    )
    create_files_in_directory(
        my_data_connector.base_directory,
        ["A_1.csv", "A_2.csv", "A_3.csv", "B_1.csv", "B_2.csv", "B_3.csv"],
    )

    assert (
        len(
            basic_pandas_datasource_v013.get_available_batch_definitions(
                batch_request=BatchRequest(
                    datasource_name="my_datasource",
                    data_connector_name="my_filesystem_data_connector",
                    data_asset_name="Titanic",
                )
            )
        )
        == 6
    )

    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            partition_definition=PartitionDefinition(
                {
                    "letter": "B",
                    "number": "1",
                }
            ),
        )
    )

    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.batch_definition == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="B1",
        partition_definition=PartitionDefinition(
            {
                "letter": "B",
                "number": "1",
            }
        ),
    )

    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            partition_request={
                "batch_identifiers": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 0

    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="Titanic",
            partition_request={
                "batch_identifiers": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0].data.dataframe, pd.DataFrame)

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            partition_definition=PartitionDefinition({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}

def test_example_with_explicit_data_asset_names(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_example_with_explicit_data_asset_names")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "my_base_directory/alpha/files/go/here/alpha-202001.csv",
            "my_base_directory/alpha/files/go/here/alpha-202002.csv",
            "my_base_directory/alpha/files/go/here/alpha-202003.csv",
            "my_base_directory/beta_here/beta-202001.txt",
            "my_base_directory/beta_here/beta-202002.txt",
            "my_base_directory/beta_here/beta-202003.txt",
            "my_base_directory/beta_here/beta-202004.txt",
            "my_base_directory/gamma-202001.csv",
            "my_base_directory/gamma-202002.csv",
            "my_base_directory/gamma-202003.csv",
            "my_base_directory/gamma-202004.csv",
            "my_base_directory/gamma-202005.csv",
        ],
    )

    yaml_string = f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: FAKE_DATASOURCE_NAME
base_directory: {base_directory}/my_base_directory/
default_regex:
    pattern: ^(.+)-(\\d{{4}})(\\d{{2}})\\.(csv|txt)$
    group_names:
        - data_asset_name
        - year_dir
        - month_dir
assets:
    alpha:
        base_directory: {base_directory}/my_base_directory/alpha/files/go/here/
        glob_directive: "*.csv"
    beta:
        base_directory: {base_directory}/my_base_directory/beta_here/
        glob_directive: "*.txt"
    gamma:
        glob_directive: "*.csv"
"""
    config = yaml.load(yaml_string)
    my_data_connector = instantiate_class_from_config(
        config,
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
        runtime_environment={"name": "my_data_connector"},
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert len(my_data_connector.get_unmatched_data_references()) == 0
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="gamma",
                )
            )
        )
        == 5
    )