Exemplo n.º 1
0
def test__find_next_string_column(non_numeric_high_card_dataset,
                                  non_numeric_low_card_dataset):
    """Exercise ``_find_next_string_column`` against two dataset fixtures.

    For ``non_numeric_high_card_dataset`` the first call must return either
    "highcardnonnum" or "medcardnonnum"; once that name is recorded under the
    "string" bucket, the next call must return the remaining one.  For
    ``non_numeric_low_card_dataset`` the call must return ``None``.
    """
    find_next = SampleExpectationsDatasetProfiler._find_next_string_column
    bucket_names = ("numeric", "low_card", "string", "datetime")

    # --- first fixture: two candidate names are acceptable, in any order ---
    table_columns = non_numeric_high_card_dataset.get_table_columns()
    cache = {}
    profiled = {name: [] for name in bucket_names}
    remaining = ["highcardnonnum", "medcardnonnum"]

    first_pick = find_next(non_numeric_high_card_dataset, table_columns,
                           profiled, cache)
    assert first_pick in remaining

    # Record the pick as already profiled so it is not offered again.
    profiled["string"].append(first_pick)
    remaining.remove(first_pick)
    second_pick = find_next(non_numeric_high_card_dataset, table_columns,
                            profiled, cache)
    assert second_pick in remaining

    # --- second fixture: start from fresh state, expect no result ---
    table_columns = non_numeric_low_card_dataset.get_table_columns()
    cache = {}
    profiled = {name: [] for name in bucket_names}
    no_pick = find_next(non_numeric_low_card_dataset, table_columns, profiled,
                        cache)
    assert no_pick is None
Exemplo n.º 2
0
def test__find_next_numeric_column(numeric_high_card_dataset,
                                   non_numeric_low_card_dataset):
    """Exercise ``_find_next_numeric_column`` against two dataset fixtures.

    For ``numeric_high_card_dataset`` the first call must return "norm_0_1",
    and after that name is recorded under the "numeric" bucket the next call
    must return ``None``.  For ``non_numeric_low_card_dataset`` the call must
    return ``None`` straight away.
    """
    find_next = SampleExpectationsDatasetProfiler._find_next_numeric_column
    bucket_names = ("numeric", "low_card", "string", "datetime")

    # --- first fixture ---
    table_columns = numeric_high_card_dataset.get_table_columns()
    cache = {}
    profiled = {name: [] for name in bucket_names}

    first_pick = find_next(numeric_high_card_dataset, table_columns, profiled,
                           cache)
    assert first_pick == "norm_0_1"

    # Mark the column as profiled; nothing further should be offered.
    profiled["numeric"].append(first_pick)
    second_pick = find_next(numeric_high_card_dataset, table_columns, profiled,
                            cache)
    assert second_pick is None

    # --- second fixture: fresh state, expect no result ---
    table_columns = non_numeric_low_card_dataset.get_table_columns()
    cache = {}
    profiled = {name: [] for name in bucket_names}
    no_pick = find_next(non_numeric_low_card_dataset, table_columns, profiled,
                        cache)
    assert no_pick is None
Exemplo n.º 3
0
def test__create_expectations_for_datetime_column(datetime_dataset):
    """Check the expectation types added for the "datetime" column.

    The suite must hold exactly one expectation before the call; afterwards
    the expectations whose ``column`` kwarg is "datetime" must have exactly
    the expected set of types.
    """
    column = "datetime"

    suite_before = datetime_dataset.get_expectation_suite(
        suppress_warnings=True)
    assert len(suite_before.expectations) == 1

    SampleExpectationsDatasetProfiler._create_expectations_for_datetime_column(
        datetime_dataset, column)

    suite_after = datetime_dataset.get_expectation_suite(
        suppress_warnings=True)
    # Only look at expectations attached to the column under test.
    types_for_column = {
        expectation.expectation_type
        for expectation in suite_after.expectations
        if expectation.kwargs.get("column") == column
    }
    assert types_for_column == {
        "expect_column_to_exist",
        "expect_column_values_to_be_between",
        "expect_column_values_to_not_be_null",
    }
Exemplo n.º 4
0
def test__create_expectations_for_string_column(non_numeric_high_card_dataset):
    """Check the expectation types added for the "highcardnonnum" column.

    The suite must hold exactly two expectations before the call; afterwards
    the expectations whose ``column`` kwarg is "highcardnonnum" must have
    exactly the expected set of types.
    """
    column = "highcardnonnum"

    suite_before = non_numeric_high_card_dataset.get_expectation_suite(
        suppress_warnings=True)
    assert len(suite_before.expectations) == 2

    SampleExpectationsDatasetProfiler._create_expectations_for_string_column(
        non_numeric_high_card_dataset, column)

    suite_after = non_numeric_high_card_dataset.get_expectation_suite(
        suppress_warnings=True)
    # Only look at expectations attached to the column under test.
    types_for_column = {
        expectation.expectation_type
        for expectation in suite_after.expectations
        if expectation.kwargs.get("column") == column
    }
    assert types_for_column == {
        "expect_column_to_exist",
        "expect_column_values_to_not_be_null",
        "expect_column_value_lengths_to_be_between",
    }
Exemplo n.º 5
0
def test__create_expectations_for_low_card_column(
        non_numeric_low_card_dataset):
    """Check the expectation types added for the "lowcardnonnum" column.

    The suite must hold exactly one expectation before the call; afterwards
    the expectations whose ``column`` kwarg is "lowcardnonnum" must have
    exactly the expected set of types.
    """
    column = "lowcardnonnum"
    cache = {}

    suite_before = non_numeric_low_card_dataset.get_expectation_suite(
        suppress_warnings=True)
    assert len(suite_before.expectations) == 1

    SampleExpectationsDatasetProfiler._create_expectations_for_low_card_column(
        non_numeric_low_card_dataset, column, cache)

    suite_after = non_numeric_low_card_dataset.get_expectation_suite(
        suppress_warnings=True)
    # Only look at expectations attached to the column under test.
    types_for_column = {
        expectation.expectation_type
        for expectation in suite_after.expectations
        if expectation.kwargs.get("column") == column
    }
    assert types_for_column == {
        "expect_column_to_exist",
        "expect_column_distinct_values_to_be_in_set",
        "expect_column_kl_divergence_to_be_less_than",
        "expect_column_values_to_not_be_null",
    }
Exemplo n.º 6
0
def test__create_expectations_for_numeric_column(numeric_high_card_dataset,
                                                 test_backend):
    """Check the expectation types added for the "norm_0_1" column.

    The suite must hold exactly one expectation before the call.  Afterwards
    the expectations whose ``column`` kwarg is "norm_0_1" must match a fixed
    set of types; the quantile expectation is additionally required when
    ``test_backend`` is "PandasDataset", "SparkDFDataset" or "postgresql".
    """
    column = "norm_0_1"

    suite_before = numeric_high_card_dataset.get_expectation_suite(
        suppress_warnings=True)
    assert len(suite_before.expectations) == 1

    SampleExpectationsDatasetProfiler._create_expectations_for_numeric_column(
        numeric_high_card_dataset, column)

    suite_after = numeric_high_card_dataset.get_expectation_suite(
        suppress_warnings=True)

    # Types required regardless of backend.
    expected_types = {
        "expect_column_to_exist",
        "expect_column_min_to_be_between",
        "expect_column_max_to_be_between",
        "expect_column_mean_to_be_between",
        "expect_column_median_to_be_between",
        "expect_column_values_to_not_be_null",
    }
    # The quantile expectation is only required on the listed backends.
    if test_backend in ["PandasDataset", "SparkDFDataset", "postgresql"]:
        expected_types.add("expect_column_quantile_values_to_be_between")

    types_for_column = {
        expectation.expectation_type
        for expectation in suite_after.expectations
        if expectation.kwargs.get("column") == column
    }
    assert types_for_column == expected_types
Exemplo n.º 7
0
def test_SampleExpectationsDatasetProfiler_with_context(not_empty_datacontext):
    """Profile a batch obtained through a data context and inspect the output.

    Creates a "default" suite, loads ``f1.csv`` from the base directory of the
    "subdir_reader" generator of "rad_datasource", runs
    ``SampleExpectationsDatasetProfiler.profile`` on it and then checks the
    suite name, the profiler metadata on the suite and on each expectation,
    the validation-result metadata keys, the suite notes and the set of
    expectation types produced.
    """
    profiler_key = "SampleExpectationsDatasetProfiler"
    context = not_empty_datacontext
    context.create_expectation_suite("default")

    # Build batch kwargs pointing at f1.csv inside the generator's directory.
    generator_config = context.datasources["rad_datasource"].config[
        "generators"]["subdir_reader"]
    base_dir = generator_config["base_directory"]
    batch_kwargs = {
        "datasource": "rad_datasource",
        "path": os.path.join(base_dir, "f1.csv"),
    }
    batch = context.get_batch(batch_kwargs, "default")

    expectation_suite, validation_results = (
        SampleExpectationsDatasetProfiler.profile(batch))

    # Suite-level metadata written under the profiler's key.
    assert expectation_suite.expectation_suite_name == "default"
    assert profiler_key in expectation_suite.meta
    suite_profiler_meta = expectation_suite.meta[profiler_key]
    assert set(suite_profiler_meta.keys()) == {
        "created_by",
        "created_at",
        "batch_kwargs",
    }
    assert suite_profiler_meta["batch_kwargs"] == batch_kwargs

    # Every expectation must carry a "confidence" entry under the same key.
    for exp in expectation_suite.expectations:
        assert profiler_key in exp.meta
        assert "confidence" in exp.meta[profiler_key]

    expected_result_meta_keys = {
        "batch_kwargs",
        "batch_markers",
        "batch_parameters",
        "expectation_suite_name",
        "great_expectations.__version__",
        "run_id",
    }
    assert set(validation_results.meta.keys()) == expected_result_meta_keys

    # The notes text is compared verbatim, including the trailing newline.
    expected_notes = {
        "format": "markdown",
        "content": [
            """#### This is an _example_ suite

- This suite was made by quickly glancing at 1000 rows of your data.
- This is **not a production suite**. It is meant to show examples of expectations.
- Because this suite was auto-generated using a very basic profiler that does not know your data like you do, many of the expectations may not be meaningful.
"""
        ],
    }
    assert expectation_suite.meta["notes"] == expected_notes

    produced_types = {
        expectation["expectation_type"]
        for expectation in expectation_suite.expectations
    }
    expected_expectation_types = {
        "expect_table_row_count_to_be_between",
        "expect_table_column_count_to_equal",
        "expect_table_columns_to_match_ordered_list",
        "expect_column_values_to_not_be_null",
        "expect_column_min_to_be_between",
        "expect_column_max_to_be_between",
        "expect_column_mean_to_be_between",
        "expect_column_median_to_be_between",
        "expect_column_quantile_values_to_be_between",
    }
    assert produced_types == expected_expectation_types