def test_self_check(mock_gcs_conn, mock_list_keys, mock_emit):
    my_data_connector: InferredAssetGCSDataConnector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "number"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )

    my_data_connector._refresh_data_references_cache()

    self_check_report_object = my_data_connector.self_check()

    assert self_check_report_object == {
        "class_name": "InferredAssetGCSDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
    }
def test_complex_regex_example_with_implicit_data_asset_names(
    mock_gcs_conn, mock_list_keys, mock_emit
):
    my_data_connector: InferredAssetGCSDataConnector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )

    my_data_connector._refresh_data_references_cache()

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )

    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            data_connector_query={
                "batch_filter_parameters": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )
    ) == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            batch_identifiers=IDDict(
                year_dir="2020",
                month_dir="03",
            ),
        )
    ]
def test_get_batch_definition_list_from_batch_request_with_nonexistent_datasource_name_raises_error(
    mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled
):
    my_data_connector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )

    # Raises error in `DataConnector._validate_batch_request()` due to `datasource_name`
    # in BatchRequest not matching DataConnector `datasource_name`
    with pytest.raises(ValueError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="something",
                data_connector_name="my_data_connector",
                data_asset_name="something",
            )
        )
def test_instantiation_without_args(
    mock_gcs_conn, mock_list_keys, expected_config_dict
):
    my_data_connector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )
    assert my_data_connector.self_check() == expected_config_dict

    my_data_connector._refresh_data_references_cache()
    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []
def test_get_batch_definition_list_from_batch_request_with_unknown_data_connector_raises_error(
    mock_gcs_conn, mock_list_keys, mock_emit
):
    my_data_connector: InferredAssetGCSDataConnector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )

    my_data_connector._refresh_data_references_cache()

    # Raises error in `DataConnector._validate_batch_request()` due to `data_connector_name`
    # in BatchRequest not matching the DataConnector's name
    with pytest.raises(ValueError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )
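

# A minimal, hypothetical sketch of the patching that supplies `mock_gcs_conn`,
# `mock_list_keys`, and `mock_emit` to the tests above. In the test module these
# arrive via `mock.patch` decorators attached to each test (not shown in this
# section); the patch targets and the example key list below are assumptions for
# illustration only, and each real test sets a `return_value` matching its own
# expectations (e.g. ["A-100.csv", "A-101.csv", "B-1.csv", "B-2.csv"] for
# `test_self_check`). The function is deliberately not named `test_*` so pytest
# does not collect it.
from unittest import mock


@mock.patch(
    # Assumed target: the usage-statistics emitter, patched so no telemetry is sent.
    "great_expectations.core.usage_statistics.usage_statistics.UsageStatisticsHandler.emit"
)
@mock.patch(
    # Assumed target: the key-listing helper imported by the GCS data connector module.
    "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.list_gcs_keys",
    return_value=["A-100.csv", "A-101.csv", "B-1.csv", "B-2.csv"],
)
@mock.patch("google.cloud.storage.Client")
def _sketch_mock_wiring(mock_gcs_conn, mock_list_keys, mock_emit):
    # `mock.patch` decorators are applied bottom-up, so the bottom-most patch
    # (the GCS client) maps to the first parameter, matching the signatures above.
    assert mock_list_keys.return_value == [
        "A-100.csv",
        "A-101.csv",
        "B-1.csv",
        "B-2.csv",
    ]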