Exemplo n.º 1
0
def test_ge_pandas_sampling():
    """Sampling a PandasDataset must keep both its type and its expectations.

    The sampled data frame should:

      1. Be a ge.dataset.PandasDataset
      2. Inherit ALL the expectations of the parent data frame
    """
    # Ordered (column, values) pairs; the order drives the order in which
    # expectations are registered and later compared.
    column_spec = (
        ("A", [1, 2, 3, 4]),
        ("B", [5, 6, 7, 8]),
        ("C", ["a", "b", "c", "d"]),
        ("D", ["e", "f", "g", "h"]),
    )
    dataset = ge.dataset.PandasDataset(
        {name: list(values) for name, values in column_spec}
    )

    # Register some simple expectations on the data frame.
    dataset.profile(profiler=ColumnsExistProfiler)
    for name, values in column_spec:
        dataset.expect_column_values_to_be_in_set(name, list(values))

    baseline = dataset.find_expectations()

    # Both a fixed-size sample and a fractional sample with replacement
    # must carry over the parent's expectations unchanged.
    for sample_kwargs in ({"n": 2}, {"frac": 0.25, "replace": True}):
        sampled = dataset.sample(**sample_kwargs)
        assert isinstance(sampled, ge.dataset.PandasDataset)
        assert sampled.find_expectations() == baseline

    # Change the expectation on column "D", sample, and check expectations.
    # The failing expectation on column "D" is NOT automatically dropped
    # in the sample.
    changed_d_values = ["e", "f", "g", "x"]
    dataset.expect_column_values_to_be_in_set("D", changed_d_values)
    sampled = dataset.sample(n=2)

    expected_value_sets = [
        (name, list(values)) for name, values in column_spec if name != "D"
    ]
    expected_value_sets.append(("D", list(changed_d_values)))

    column_exists = [
        {
            "expectation_type": "expect_column_to_exist",
            "kwargs": {"column": name},
        }
        for name, _ in column_spec
    ]
    values_in_set = [
        {
            "expectation_type": "expect_column_values_to_be_in_set",
            "kwargs": {"column": name, "value_set": values},
        }
        for name, values in expected_value_sets
    ]
    expected_suite = expectationSuiteSchema.load(
        {
            "expectation_suite_name": "test",
            "expectations": column_exists + values_in_set,
        }
    )
    assert sampled.find_expectations() == expected_suite.expectations
def test_ge_pandas_sampling():
    """Sampling a PandasDataset must keep both its type and its expectations.

    The sampled data frame should:

      1. Be a ge.dataset.PandasDataset
      2. Inherit ALL the expectations of the parent data frame
    """
    # Ordered (column, values) pairs; the order drives the order in which
    # expectations are registered and later compared.
    column_spec = (
        ('A', [1, 2, 3, 4]),
        ('B', [5, 6, 7, 8]),
        ('C', ['a', 'b', 'c', 'd']),
        ('D', ['e', 'f', 'g', 'h']),
    )
    dataset = ge.dataset.PandasDataset(
        {name: list(values) for name, values in column_spec}
    )

    # Register some simple expectations on the data frame.
    dataset.profile(profiler=ColumnsExistProfiler)
    for name, values in column_spec:
        dataset.expect_column_values_to_be_in_set(name, list(values))

    baseline = dataset.find_expectations()

    # Both a fixed-size sample and a fractional sample with replacement
    # must carry over the parent's expectations unchanged.
    for sample_kwargs in ({'n': 2}, {'frac': 0.25, 'replace': True}):
        sampled = dataset.sample(**sample_kwargs)
        assert isinstance(sampled, ge.dataset.PandasDataset)
        assert sampled.find_expectations() == baseline

    # Change the expectation on column "D", sample, and check expectations.
    # The failing expectation on column "D" is NOT automatically dropped
    # in the sample.
    changed_d_values = ['e', 'f', 'g', 'x']
    dataset.expect_column_values_to_be_in_set("D", changed_d_values)
    sampled = dataset.sample(n=2)

    expected_value_sets = [
        (name, list(values)) for name, values in column_spec if name != 'D'
    ]
    expected_value_sets.append(('D', list(changed_d_values)))

    column_exists = [
        {
            'expectation_type': 'expect_column_to_exist',
            'kwargs': {'column': name},
        }
        for name, _ in column_spec
    ]
    values_in_set = [
        {
            'expectation_type': 'expect_column_values_to_be_in_set',
            'kwargs': {'column': name, 'value_set': values},
        }
        for name, values in expected_value_sets
    ]
    suite_payload = {
        'data_asset_name': 'test',
        'expectation_suite_name': 'test',
        "expectations": column_exists + values_in_set,
    }
    # This variant unwraps the schema load result through its ``.data`` attribute.
    expected_suite = expectationSuiteSchema.load(suite_payload).data
    assert sampled.find_expectations() == expected_suite.expectations
Exemplo n.º 3
0
def validate(
        data_asset,
        expectation_suite=None,
        data_asset_name=None,
        expectation_suite_name=None,
        data_context=None,
        data_asset_class_name=None,
        data_asset_module_name="great_expectations.dataset",
        data_asset_class=None,
        *args, **kwargs):
    """Validate the provided data asset against an expectation suite.

    The suite is either passed in directly (as an object or a dict) or fetched
    from a DataContext by name. If no DataAsset class is requested, validation
    is delegated to ``data_asset.validate``; otherwise the asset is first
    converted to the requested dataset class.

    Args:
        data_asset: the asset to validate
        expectation_suite: the suite to use (a dict is loaded through
            expectationSuiteSchema), or None to fetch one using a DataContext
        data_asset_name: the name of the data asset to use; must be None when
            an expectation_suite is provided
        expectation_suite_name: the name of the expectation_suite to fetch from
            the DataContext; must be None when an expectation_suite is provided
        data_context: data context to use to fetch an expectation suite, or the
            path (str) from which to construct one
        data_asset_class_name: the name of a class to dynamically load a DataAsset class
        data_asset_module_name: the name of the module to dynamically load a DataAsset class
        data_asset_class: a class to use. overrides data_asset_class_name/ data_asset_module_name if provided
        *args: forwarded to the underlying ``validate`` call
        **kwargs: forwarded to the underlying ``validate`` call

    Returns:
        Whatever the ``validate`` method of the (possibly converted) data asset returns.

    Raises:
        ValueError: if neither an expectation suite nor a DataContext is given;
            if data_asset_name or expectation_suite_name is combined with an
            explicit expectation suite; if a conversion is requested for an
            asset that is neither a Dataset nor a pandas DataFrame; or if the
            asset is not an instance of the requested class (a DataFrame being
            coerced to a PandasDataset subclass is the one allowed exception).
    """
    # Get an expectation suite if not provided
    if expectation_suite is None and data_context is None:
        raise ValueError(
            "Either an expectation suite or a DataContext is required for validation.")

    if expectation_suite is None:
        logger.info("Using expectation suite from DataContext.")
        # Allow data_context to be a string, and try loading it from path in that case
        if isinstance(data_context, str):
            from great_expectations.data_context import DataContext
            data_context = DataContext(data_context)
        expectation_suite = data_context.get_expectation_suite(
            expectation_suite_name=expectation_suite_name
        )
    else:
        if isinstance(expectation_suite, dict):
            expectation_suite = expectationSuiteSchema.load(expectation_suite)
        if data_asset_name is not None:
            raise ValueError("When providing an expectation suite, data_asset_name cannot also be provided.")
        if expectation_suite_name is not None:
            raise ValueError("When providing an expectation suite, expectation_suite_name cannot also be provided.")
        # NOTE(review): data_asset_name is necessarily None at this point (see
        # the check above), so the first placeholder always renders as "None".
        # The message text is kept as-is; arguments are passed lazily so the
        # string is only formatted when the record is actually emitted.
        logger.info(
            "Validating data_asset_name %s with expectation_suite_name %s",
            data_asset_name,
            expectation_suite.expectation_suite_name,
        )

    # If the object is already a DataAsset type, then this is purely a convenience method
    # and no conversion is needed; try to run validate on the given object
    if data_asset_class_name is None and data_asset_class is None:
        return data_asset.validate(*args, expectation_suite=expectation_suite, data_context=data_context, **kwargs)

    # Otherwise, try to convert and validate the dataset
    if data_asset_class is None:
        verify_dynamic_loading_support(module_name=data_asset_module_name)
        data_asset_class = load_class(data_asset_class_name, data_asset_module_name)

    import pandas as pd
    from great_expectations.dataset import Dataset, PandasDataset
    if data_asset_class is None:
        # NOTE(review): this fallback is only reachable if load_class returned
        # None above -- confirm whether that can happen.
        # Guess the GE data_asset_type based on the type of the data_asset
        if isinstance(data_asset, pd.DataFrame):
            data_asset_class = PandasDataset
        # Add other data_asset_type conditions here as needed

    # Otherwise, we will convert for the user to a subclass of the
    # existing class to enable new expectations, but only for datasets
    if not isinstance(data_asset, (Dataset, pd.DataFrame)):
        raise ValueError(
            "The validate util method only supports dataset validations, including custom subclasses. For other data "
            "asset types, use the object's own validate method."
        )

    if not isinstance(data_asset, data_asset_class):
        if isinstance(data_asset, pd.DataFrame) and issubclass(data_asset_class, PandasDataset):
            pass  # This is a special type of allowed coercion
        else:
            raise ValueError(
                "The validate util method only supports validation for subtypes of the provided data_asset_type.")

    data_asset_ = _convert_to_dataset_class(data_asset, dataset_class=data_asset_class,
                                            expectation_suite=expectation_suite)
    return data_asset_.validate(*args, data_context=data_context, **kwargs)
Exemplo n.º 4
0
def test_data_context_updates_expectation_suite_names(data_context):
    """A data context should persist the suite name a suite is saved under.

    The same suite is saved three ways and re-fetched after each save:

      1. Directly using the new name,
      2. Using a different name that should overwrite the one on the suite,
      3. Using a new name that the context draws from the suite itself.
    """
    # The context under test is expected to start with a single suite.
    available_suites = data_context.list_expectation_suites()
    assert len(available_suites) == 1

    # Fetch that suite by name and confirm the name round-trips.
    original_name = available_suites[0].expectation_suite_name
    suite = data_context.get_expectation_suite(original_name)
    assert suite.expectation_suite_name == original_name

    #   1. Directly using the new name
    first_name = 'a_new_suite_name'
    suite.expectation_suite_name = first_name
    data_context.save_expectation_suite(
        expectation_suite=suite,
        expectation_suite_name=first_name,
    )
    refetched = data_context.get_expectation_suite(first_name)
    assert refetched.expectation_suite_name == first_name

    #   2. Using a different name that should be overwritten
    second_name = 'a_new_new_suite_name'
    data_context.save_expectation_suite(
        expectation_suite=suite,
        expectation_suite_name=second_name,
    )
    refetched = data_context.get_expectation_suite(second_name)
    assert refetched.expectation_suite_name == second_name

    # Check that the saved name difference is actually persisted on disk
    saved_suite_path = os.path.join(
        data_context.root_directory,
        "expectations",
        "a_new_new_suite_name.json",
    )
    with open(saved_suite_path, 'r') as suite_file:
        suite_on_disk = expectationSuiteSchema.load(json.load(suite_file)).data
        assert suite_on_disk.expectation_suite_name == second_name

    #   3. Using the new name but having the context draw that from the suite
    third_name = "a_third_suite_name"
    suite.expectation_suite_name = third_name
    data_context.save_expectation_suite(expectation_suite=suite)

    refetched = data_context.get_expectation_suite(third_name)
    assert refetched.expectation_suite_name == third_name