def _get_data_reference_list_from_cache_by_data_asset_name(
    self, data_asset_name: str
) -> List[str]:
    """
    Fetch data_references corresponding to data_asset_name from the cache.

    The cached batch_definitions for the named asset are mapped back to their
    data_reference strings via the asset's configured regex, then returned in
    lexicographic order.
    """
    regex_config: dict = self._get_regex_config(data_asset_name=data_asset_name)
    regex_pattern: str = regex_config["pattern"]
    regex_group_names: List[str] = regex_config["group_names"]

    batch_request = BatchRequestBase(
        datasource_name=self.datasource_name,
        data_connector_name=self.name,
        data_asset_name=data_asset_name,
    )
    batch_definitions = self._get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    data_references: List[str] = []
    for batch_definition in batch_definitions:
        data_references.append(
            map_batch_definition_to_data_reference_string_using_regex(
                batch_definition=batch_definition,
                regex_pattern=regex_pattern,
                group_names=regex_group_names,
            )
        )

    # TODO: Sort with a real sorter here
    data_references.sort()
    return data_references
def get_available_data_asset_names(self) -> List[str]:
    """
    Return the list of asset names known by this DataConnector.

    Returns:
        A list of available names
    """
    # Populate the data-reference cache lazily on first use.
    if not self._data_references_cache:
        self._refresh_data_references_cache()

    # This will fetch ALL batch_definitions in the cache.
    batch_request = BatchRequestBase(
        datasource_name=self.datasource_name, data_connector_name=self.name
    )
    batch_definitions: List[
        BatchDefinition
    ] = self._get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    # De-duplicate the asset names via a set before returning.
    return list(
        {batch_definition.data_asset_name for batch_definition in batch_definitions}
    )
def get_available_data_asset_names(self) -> List[str]:
    """
    Return the list of asset names known by this DataConnector.

    Returns:
        A list of available data_asset_name strings (de-duplicated; order not
        guaranteed).
    """
    # This will fetch ALL batch_definitions in the cache
    batch_definition_list: List[
        BatchDefinition
    ] = self._get_batch_definition_list_from_batch_request(
        batch_request=BatchRequestBase(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
        )
    )
    # A set comprehension de-duplicates in a single pass, replacing the
    # manual add-in-a-loop construction (ruff PERF403 idiom).
    return list(
        {
            batch_definition.data_asset_name
            for batch_definition in batch_definition_list
        }
    )
def _get_data_reference_list_from_cache_by_data_asset_name(
    self, data_asset_name: str
) -> List[str]:
    """Fetch data_references corresponding to data_asset_name from the cache.

    Args:
        data_asset_name: name of the data_asset whose cached data_references
            should be returned.

    Returns:
        A list of data_reference strings; empty when no cached
        batch_definitions match the asset name.
    """
    batch_definition_list: List[
        BatchDefinition
    ] = self._get_batch_definition_list_from_batch_request(
        batch_request=BatchRequestBase(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=data_asset_name,
        )
    )
    # No explicit empty-list guard is needed: a comprehension over an empty
    # list already yields [].
    return [
        self._map_batch_definition_to_data_reference(
            batch_definition=batch_definition,
        )
        for batch_definition in batch_definition_list
    ]
def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    """Filesystem connector returns batch_definitions for all matching files.

    Files are created in a deliberately non-alphabetical order; the test then
    asserts the connector's "unsorted" output order (which, per the expected
    list below, is alphabetical by data reference) for both the private
    BatchRequestBase path and the public BatchRequest path, and that invalid
    calls raise TypeError.
    """
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted")
    )
    # Fixture files follow the (name)_(timestamp)_(price).csv pattern below.
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )
    # NOTE(review): internal indentation of this YAML block was reconstructed
    # from a whitespace-flattened source — confirm against the original file.
    my_data_connector_yaml = yaml.load(
        f"""
        class_name: ConfiguredAssetFilesystemDataConnector
        datasource_name: test_environment
        #execution_engine:
        #    class_name: PandasExecutionEngine
        base_directory: {base_directory}
        glob_directive: "*.csv"
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_(.+)_(.+)\\.csv
            group_names:
                - name
                - timestamp
                - price
        """,
    )
    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )
    # Calling without any batch_request at all must raise TypeError.
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()
    # with unnamed data_asset_name
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )
    # with unnamed data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )
    )
    # Expected output covers every fixture file, in alphabetical reference order.
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list
    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    # Naming the asset explicitly must produce the identical definition list.
    assert expected == unsorted_batch_definition_list
def test_return_all_batch_definitions_unsorted():
    """S3 connector returns batch_definitions for all matching bucket keys.

    Mirrors the filesystem variant: keys are uploaded in non-alphabetical
    order, and the expected "unsorted" list below is alphabetical by data
    reference. Also checks that invalid batch_request calls raise TypeError.
    Assumes a mocked S3 backend (e.g. moto) is active — TODO confirm fixture.
    """
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    # Keys follow the (name)_(timestamp)_(price).csv regex configured below.
    keys: List[str] = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )
    # NOTE(review): internal indentation of this YAML block was reconstructed
    # from a whitespace-flattened source — confirm against the original file.
    my_data_connector_yaml = yaml.load(
        f"""
        class_name: ConfiguredAssetS3DataConnector
        datasource_name: test_environment
        bucket: {bucket}
        prefix: ""
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_(.+)_(.+)\\.csv
            group_names:
                - name
                - timestamp
                - price
        """,
    )
    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )
    # Calling without any batch_request at all must raise TypeError.
    with pytest.raises(TypeError):
        # noinspection PyArgumentList
        my_data_connector.get_batch_definition_list_from_batch_request()
    # with unnamed data_asset_name
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name="",
            )
        )
    # with unnamed data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name="",
            )
        )
    )
    # Expected output covers every uploaded key, in alphabetical reference order.
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list
    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    # Naming the asset explicitly must produce the identical definition list.
    assert expected == unsorted_batch_definition_list
def test__batch_definition_matches_batch_request():
    """Exercise batch_definition_matches_batch_request against one fixed
    BatchDefinition, varying each batch_request field in turn: matching and
    non-matching datasource_name, data_connector_name, data_asset_name, and
    partition_request batch_identifiers.
    """
    # TODO: <Alex>We need to cleanup PyCharm warnings.</Alex>
    A = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        partition_definition=PartitionDefinition(
            {
                "id": "A",
            }
        ),
    )
    # datasource_name alone: match on "A", mismatch on "B".
    assert batch_definition_matches_batch_request(
        batch_definition=A, batch_request=BatchRequestBase(datasource_name="A")
    )
    assert not batch_definition_matches_batch_request(
        batch_definition=A, batch_request=BatchRequestBase(datasource_name="B")
    )
    # datasource_name + data_connector_name both matching.
    assert batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(
            datasource_name="A",
            data_connector_name="a",
        ),
    )
    # All three name fields matching.
    assert batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="aaa",
        ),
    )
    # Mismatching data_asset_name fails the match.
    assert not batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="bbb",
        ),
    )
    # Names match but batch_identifiers disagree ("B" vs definition's "A").
    assert not batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="aaa",
            partition_request={
                "batch_identifiers": {"id": "B"},
            },
        ),
    )
    # batch_identifiers alone (no name fields) can still match.
    assert batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(
            partition_request={
                "batch_identifiers": {"id": "A"},
            }
        ),
    )
    # A partition_request of None matches any partition_definition.
    assert batch_definition_matches_batch_request(
        batch_definition=BatchDefinition(
            **{
                "datasource_name": "FAKE_DATASOURCE",
                "data_connector_name": "TEST_DATA_CONNECTOR",
                "data_asset_name": "DEFAULT_ASSET_NAME",
                "partition_definition": PartitionDefinition({"index": "3"}),
            }
        ),
        batch_request=BatchRequest(
            **{
                "datasource_name": "FAKE_DATASOURCE",
                "data_connector_name": "TEST_DATA_CONNECTOR",
                "data_asset_name": "DEFAULT_ASSET_NAME",
                "partition_request": None,
            }
        ),
    )
def test_return_all_batch_definitions_unsorted_without_named_data_asset_name(
    mock_gcs_conn,
    mock_list_keys,
    mock_emit,
    empty_data_context_stats_enabled,
    expected_batch_definitions_unsorted,
):
    """GCS connector returns all batch_definitions when data_asset_name is "".

    Uses mocked GCS listing (mock_list_keys) and compares against the
    expected_batch_definitions_unsorted fixture.
    """
    # NOTE(review): internal indentation of this YAML block was reconstructed
    # from a whitespace-flattened source — confirm against the original file.
    my_data_connector_yaml = yaml.load(
        f"""
        class_name: ConfiguredAssetGCSDataConnector
        datasource_name: test_environment
        bucket_or_name: my_bucket
        prefix: ""
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_(.+)_(.+)\\.csv
            group_names:
                - name
                - timestamp
                - price
        """,
    )
    # Mocked blob listing, deliberately NOT in alphabetical order (see note below).
    mock_list_keys.return_value = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]
    my_data_connector: ConfiguredAssetGCSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_gcs_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )
    # In an actual production environment, GCS will automatically sort these blobs by path (alphabetic order).
    # Source: https://cloud.google.com/storage/docs/listing-objects
    #
    # The expected behavior is that our `unsorted_batch_definition_list` will maintain the same order it parses through `list_gcs_keys()` (hence "unsorted").
    # When using an actual `Client` (and not a mock), the output of `list_gcs_keys` would be pre-sorted by nature of how the system orders blobs.
    # It is important to note that although this is a minor deviation, it is deemed to be immaterial as we still end up testing our desired behavior.
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_gcs_data_connector",
                data_asset_name="",
            )
        )
    )
    assert unsorted_batch_definition_list == expected_batch_definitions_unsorted