    def _get_data_reference_list_from_cache_by_data_asset_name(
            self, data_asset_name: str) -> List[str]:
        """
        Fetch data_references corresponding to data_asset_name from the cache.
        """
        regex_config: dict = self._get_regex_config(
            data_asset_name=data_asset_name)
        pattern: str = regex_config["pattern"]
        group_names: List[str] = regex_config["group_names"]

        batch_definition_list = self._get_batch_definition_list_from_batch_request(
            batch_request=BatchRequestBase(
                datasource_name=self.datasource_name,
                data_connector_name=self.name,
                data_asset_name=data_asset_name,
            ))

        path_list: List[str] = [
            map_batch_definition_to_data_reference_string_using_regex(
                batch_definition=batch_definition,
                regex_pattern=pattern,
                group_names=group_names,
            ) for batch_definition in batch_definition_list
        ]

        # TODO: Sort with a real sorter here
        path_list.sort()

        return path_list
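
    # A minimal illustrative sketch (an assumption, not the library's own
    # helper): for this example's specific pattern (.+)_(.+)_(.+)\.csv, the
    # inverse mapping substitutes the captured group values back into the
    # file-name template.
    @staticmethod
    def _sketch_map_identifiers_to_data_reference(
            identifiers: dict, group_names: List[str]) -> str:
        # e.g. {"name": "alex", "timestamp": "20200809", "price": "1000"} with
        # group_names ["name", "timestamp", "price"] -> "alex_20200809_1000.csv"
        return "_".join(identifiers[group_name] for group_name in group_names) + ".csv"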
Example #2
    def get_available_data_asset_names(self) -> List[str]:
        """
        Return the list of asset names known by this DataConnector

        Returns:
            A list of available names
        """
        if len(self._data_references_cache) == 0:
            self._refresh_data_references_cache()

        # This will fetch ALL batch_definitions in the cache
        batch_definition_list: List[
            BatchDefinition
        ] = self._get_batch_definition_list_from_batch_request(
            batch_request=BatchRequestBase(
                datasource_name=self.datasource_name, data_connector_name=self.name
            )
        )

        data_asset_names: List[str] = [
            batch_definition.data_asset_name
            for batch_definition in batch_definition_list
        ]

        return list(set(data_asset_names))
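
    # A second variant of get_available_data_asset_names: it deduplicates
    # incrementally with a set and, unlike the version above, assumes the
    # data-references cache has already been populated (there is no
    # _refresh_data_references_cache() guard).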
    def get_available_data_asset_names(self) -> List[str]:
        # This will fetch ALL batch_definitions in the cache
        batch_definition_list: List[
            BatchDefinition] = self._get_batch_definition_list_from_batch_request(
                batch_request=BatchRequestBase(
                    datasource_name=self.datasource_name,
                    data_connector_name=self.name,
                ))

        data_asset_names: set = set()
        for batch_definition in batch_definition_list:
            data_asset_names.add(batch_definition.data_asset_name)

        return list(data_asset_names)

    def _get_data_reference_list_from_cache_by_data_asset_name(
            self, data_asset_name: str) -> List[str]:
        """Fetch data_references corresponding to data_asset_name from the cache."""
        batch_definition_list: List[
            BatchDefinition] = self._get_batch_definition_list_from_batch_request(
                batch_request=BatchRequestBase(
                    datasource_name=self.datasource_name,
                    data_connector_name=self.name,
                    data_asset_name=data_asset_name,
                ))

        if len(batch_definition_list) == 0:
            return []
        return [
            self._map_batch_definition_to_data_reference(
                batch_definition=batch_definition,
            )
            for batch_definition in batch_definition_list
        ]
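
    # Hedged usage sketch for the methods above (connector construction is
    # elided; the tests below show full instantiation via
    # instantiate_class_from_config, and "TestFiles" is the asset name they
    # configure):
    #
    #     my_data_connector._refresh_data_references_cache()
    #     assert "TestFiles" in my_data_connector.get_available_data_asset_names()
    #     data_references = (
    #         my_data_connector._get_data_reference_list_from_cache_by_data_asset_name(
    #             data_asset_name="TestFiles"))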
Example #5
def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            class_name: ConfiguredAssetFilesystemDataConnector
            datasource_name: test_environment
            #execution_engine:
            #    class_name: PandasExecutionEngine
            base_directory: {base_directory}
            glob_directive: "*.csv"
            assets:
                TestFiles:
            default_regex:
                pattern: (.+)_(.+)_(.+)\\.csv
                group_names:
                    - name
                    - timestamp
                    - price
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name (BatchRequest rejects None with a TypeError)
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )

    # with unnamed data_asset_name (BatchRequestBase permits it)
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )
    )
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
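    # "Unsorted" means no configured sorter is applied; the expected order
    # above is simply alphabetical by file name (presumably the order in which
    # the data references were cached), not the creation order of the files.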
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list

# moto's mock_s3 keeps the boto3 calls below in-memory instead of hitting AWS.
@mock_s3
def test_return_all_batch_definitions_unsorted():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector_yaml = yaml.load(
        f"""
            class_name: ConfiguredAssetS3DataConnector
            datasource_name: test_environment
            bucket: {bucket}
            prefix: ""
            assets:
                TestFiles:
            default_regex:
                pattern: (.+)_(.+)_(.+)\\.csv
                group_names:
                    - name
                    - timestamp
                    - price
        """,
    )

    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    with pytest.raises(TypeError):
        # noinspection PyArgumentList
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name (BatchRequest rejects the empty string with a TypeError)
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name="",
            )
        )

    # with unnamed data_asset_name (BatchRequestBase permits it)
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name="",
            )
        )
    )
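    # Note: this test spells the identifiers as batch_identifiers=IDDict(...),
    # whereas the filesystem test above uses the older
    # partition_definition=PartitionDefinition(...) form; both carry the same
    # regex-group key/value pairs.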
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_s3_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list

def test__batch_definition_matches_batch_request():
    # TODO: <Alex>We need to cleanup PyCharm warnings.</Alex>
    A = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        partition_definition=PartitionDefinition({
            "id": "A",
        }),
    )

    assert batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(datasource_name="A"))

    assert not batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(datasource_name="B"))

    assert batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(
            datasource_name="A",
            data_connector_name="a",
        ),
    )

    assert batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="aaa",
        ),
    )

    assert not batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="bbb",
        ),
    )

    assert not batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(
            datasource_name="A",
            data_connector_name="a",
            data_asset_name="aaa",
            partition_request={
                "batch_identifiers": {
                    "id": "B"
                },
            },
        ),
    )

    assert batch_definition_matches_batch_request(
        batch_definition=A,
        batch_request=BatchRequestBase(partition_request={
            "batch_identifiers": {
                "id": "A"
            },
        }),
    )

    assert batch_definition_matches_batch_request(
        batch_definition=BatchDefinition(
            **{
                "datasource_name": "FAKE_DATASOURCE",
                "data_connector_name": "TEST_DATA_CONNECTOR",
                "data_asset_name": "DEFAULT_ASSET_NAME",
                "partition_definition": PartitionDefinition({"index": "3"}),
            }),
        batch_request=BatchRequest(
            **{
                "datasource_name": "FAKE_DATASOURCE",
                "data_connector_name": "TEST_DATA_CONNECTOR",
                "data_asset_name": "DEFAULT_ASSET_NAME",
                "partition_request": None,
            }),
    )
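
# A hedged sketch of the matching rule the assertions above exercise (an
# illustration, not the library's implementation): every attribute that is
# set on the batch_request must equal the batch_definition's value, and unset
# attributes match anything.
def sketch_batch_definition_matches_batch_request(batch_definition, batch_request) -> bool:
    for attr in ("datasource_name", "data_connector_name", "data_asset_name"):
        requested = getattr(batch_request, attr, None)
        if requested and requested != getattr(batch_definition, attr, None):
            return False
    partition_request = getattr(batch_request, "partition_request", None)
    if partition_request:
        for key, value in partition_request.get("batch_identifiers", {}).items():
            if batch_definition.partition_definition.get(key) != value:
                return False
    return True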
Example #8
def test_return_all_batch_definitions_unsorted_without_named_data_asset_name(
    mock_gcs_conn,
    mock_list_keys,
    mock_emit,
    empty_data_context_stats_enabled,
    expected_batch_definitions_unsorted,
):
    my_data_connector_yaml = yaml.load(
        f"""
           class_name: ConfiguredAssetGCSDataConnector
           datasource_name: test_environment
           bucket_or_name: my_bucket
           prefix: ""
           assets:
               TestFiles:
           default_regex:
               pattern: (.+)_(.+)_(.+)\\.csv
               group_names:
                   - name
                   - timestamp
                   - price
       """, )

    mock_list_keys.return_value = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]

    my_data_connector: ConfiguredAssetGCSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_gcs_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )

    # In an actual production environment, GCS automatically sorts these blobs by path (alphabetical order).
    # Source: https://cloud.google.com/storage/docs/listing-objects
    #
    # The expected behavior is that `unsorted_batch_definition_list` keeps the order in which `list_gcs_keys()` yields the keys (hence "unsorted").
    # With an actual `Client` (rather than a mock), the output of `list_gcs_keys` would be pre-sorted because of how GCS orders blobs.
    # This minor deviation is immaterial: the test still exercises the desired behavior.

    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_gcs_data_connector",
                data_asset_name="",
            )
        )
    )
    assert unsorted_batch_definition_list == expected_batch_definitions_unsorted