Example #1
def test_build_suite_when_suite_already_exists(mock_emit, cardinality_dataset):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )

    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = [
        "expect_table_columns_to_match_ordered_list"
    ]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented for usage_statistics
    # is ExpectationSuite's add_expectation(). It will not send a usage_stats event when called from a Profiler.
    # This number can change in the future if our instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
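The mock_emit argument in the example above is assumed to come from patching the usage-statistics emitter before the test runs, so the test can assert on call_count and call_args_list. A minimal sketch of that wiring, assuming the common mock.patch decorator pattern (the patch target string below is an assumption, not taken from this snippet):

from unittest import mock

# Hypothetical patch target; the real tests patch whichever method actually
# sends usage-statistics events.
@mock.patch(
    "great_expectations.core.usage_statistics.usage_statistics.UsageStatisticsHandler.emit"
)
def test_example(mock_emit, cardinality_dataset):
    # The patched emit records every call, so the test can assert how many
    # usage-statistics events were (or were not) sent.
    assert mock_emit.call_count == 0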
Example #2
def test_config_with_not_null_only(titanic_data_context_modular_api,
                                   nulls_validator, possible_expectations_set):
    """
    What does this test do and why?
    Confirms that the not_null_only key in config works as expected.
    """

    excluded_expectations = [
        i for i in possible_expectations_set if "null" not in i
    ]

    validator = nulls_validator

    profiler_without_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=False)
    suite_without_not_null_only = profiler_without_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        suite_without_not_null_only)
    assert expectations == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    profiler_with_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=True)
    not_null_only_suite = profiler_with_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        not_null_only_suite)
    assert expectations == {"expect_column_values_to_not_be_null"}

    no_config_profiler = UserConfigurableProfiler(validator)
    no_config_suite = no_config_profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        no_config_suite)
    assert "expect_column_values_to_be_null" in expectations
Example #3
def test_build_suite_with_config_and_no_semantic_types_dict(
        titanic_validator, possible_expectations_set):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a config and without a semantic_types dict
    """
    profiler = UserConfigurableProfiler(
        titanic_validator,
        ignored_columns=["Survived", "Unnamed: 0"],
        excluded_expectations=["expect_column_mean_to_be_between"],
        primary_or_compound_key=["Name"],
        table_expectations_only=False,
        value_set_threshold="very_few",
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    columns_expected_in_suite = {"Name", "PClass", "Age", "Sex", "SexCode"}
    assert columns_with_expectations == columns_expected_in_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert "expect_column_mean_to_be_between" not in expectations_from_suite
    assert len(suite.expectations) == 29
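The helper get_set_of_columns_and_expectations_from_suite used throughout these examples is not reproduced here. A minimal sketch of what it plausibly does, assuming each expectation exposes expectation_type and kwargs (an illustrative reconstruction, not the actual test helper):

from typing import Set, Tuple

def get_set_of_columns_and_expectations_from_suite(suite) -> Tuple[Set[str], Set[str]]:
    # Collect the column names referenced by column-level expectations; table-level
    # expectations carry no "column" kwarg and so contribute nothing to this set.
    columns = {
        expectation.kwargs.get("column")
        for expectation in suite.expectations
        if expectation.kwargs.get("column") is not None
    }
    # Collect the distinct expectation types present in the suite.
    expectation_types = {expectation.expectation_type for expectation in suite.expectations}
    return columns, expectation_types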
Example #4
def test_profiler_all_expectation_types_spark(
    titanic_data_context_modular_api,
    taxi_validator_spark,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for Spark
    """
    context = titanic_data_context_modular_api

    profiler = UserConfigurableProfiler(
        taxi_validator_spark,
        semantic_types_dict=taxi_data_semantic_types,
        ignored_columns=taxi_data_ignored_columns,
        # TODO: Add primary_or_compound_key test
        #  primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        #  ],
    )

    assert profiler.column_info.get("rate_code_id")
    with pytest.deprecated_call():  # parse_strings_as_datetimes is deprecated in V3
        suite = profiler.build_suite()

    assert len(suite.expectations) == 45
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert expectations_from_suite == {
        i
        for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in taxi_data_ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    with pytest.deprecated_call():  # parse_strings_as_datetimes is deprecated in V3
        results = context.run_validation_operator(
            "action_list_operator", assets_to_validate=[taxi_validator_spark])

    assert results["success"]
def test_build_suite_with_config_and_no_semantic_types_dict(
    mock_emit, titanic_validator, possible_expectations_set
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a config and without a semantic_types dict
    """
    profiler = UserConfigurableProfiler(
        titanic_validator,
        ignored_columns=["Survived", "Unnamed: 0"],
        excluded_expectations=["expect_column_mean_to_be_between"],
        primary_or_compound_key=["Name"],
        table_expectations_only=False,
        value_set_threshold="very_few",
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    columns_expected_in_suite = {"Name", "PClass", "Age", "Sex", "SexCode"}
    assert columns_with_expectations == columns_expected_in_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert "expect_column_mean_to_be_between" not in expectations_from_suite
    assert len(suite.expectations) == 29

    assert mock_emit.call_count == 1
    assert "expectation_suite.add_expectation" not in [
        mock_emit.call_args_list[0][0][0]["event"]
    ]

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": True,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events
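The expected_events/actual_events comparison above relies on unittest.mock call objects comparing by their arguments. A tiny self-contained illustration of that behavior:

from unittest import mock

emit = mock.Mock()
emit({"event": "legacy_profiler.build_suite", "success": True})

# call_args_list records mock.call entries, and mock.call objects compare equal
# when their positional and keyword arguments match, so a literal list of
# mock.call(...) values can be asserted against directly.
assert emit.call_args_list == [
    mock.call({"event": "legacy_profiler.build_suite", "success": True})
]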
def test_config_with_not_null_only(
    titanic_data_context_modular_api, possible_expectations_set
):
    """
    What does this test do and why?
    Confirms that the not_null_only key in config works as expected.
    """

    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    df = pd.DataFrame(
        {
            "mostly_null": [i if i % 3 == 0 else None for i in range(0, 1000)],
            "mostly_not_null": [None if i % 3 == 0 else i for i in range(0, 1000)],
        }
    )

    validator = get_pandas_runtime_validator(titanic_data_context_modular_api, df)

    profiler_without_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=False
    )
    suite_without_not_null_only = profiler_without_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        suite_without_not_null_only
    )
    assert expectations == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    profiler_with_not_null_only = UserConfigurableProfiler(
        validator, excluded_expectations, not_null_only=True
    )
    not_null_only_suite = profiler_with_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        not_null_only_suite
    )
    assert expectations == {"expect_column_values_to_not_be_null"}

    no_config_profiler = UserConfigurableProfiler(validator)
    no_config_suite = no_config_profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(no_config_suite)
    assert "expect_column_values_to_be_null" in expectations
def test_profiler_all_expectation_types_pandas(
    titanic_data_context_modular_api,
    taxi_validator_pandas,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for pandas
    """
    context = titanic_data_context_modular_api

    profiler = UserConfigurableProfiler(
        taxi_validator_pandas,
        semantic_types_dict=taxi_data_semantic_types,
        ignored_columns=taxi_data_ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )

    assert profiler.column_info.get("rate_code_id")

    suite = profiler.build_suite()

    assert len(suite.expectations) == 41
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_column_values_to_be_between",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in taxi_data_ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0
    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_pandas]
    )

    assert results["success"]
Example #8
def test_build_suite_when_suite_already_exists(cardinality_dataset):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )

    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations
Example #9
def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_dataset,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_dataset,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 33

    value_set_expectations = [
        i for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {
        i.kwargs.get("column")
        for i in value_set_expectations
    }

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Currently the only method called by the Profiler that is instrumented for usage_statistics
    # is ExpectationSuite's add_expectation(). It will not send a usage_stats event when called from a Profiler.
    # This number can change in the future if our instrumentation changes.
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
Example #10
def test_build_suite_with_semantic_types_dict(
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_expectations = [
        i for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {
        i.kwargs.get("column")
        for i in value_set_expectations
    }

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}
Example #11
def test_error_handling_for_expect_compound_columns_to_be_unique(
        taxi_validator_pandas, taxi_data_ignored_columns, caplog):
    # TODO: When this expectation is implemented for V3, remove this test and test for this expectation
    ignored_columns = taxi_data_ignored_columns + [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]

    profiler = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )
    with caplog.at_level(logging.WARNING):
        suite = profiler.build_suite()

    log_warnings = caplog.messages
    assert len(log_warnings) == 1

    assert (
        log_warnings[0] ==
        "expect_compound_columns_to_be_unique is not currently available in the V3 (Batch Request) API. Specifying a compound key will not add any expectations. This will be updated when that expectation becomes available."
    )

    assert len(suite.expectations) == 2

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }

    assert expected_expectations == expectations_from_suite

    profiler_with_single_column_key = UserConfigurableProfiler(
        taxi_validator_pandas,
        ignored_columns=ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    )

    suite = profiler_with_single_column_key.build_suite()

    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_unique",
    }

    assert expected_expectations == expectations_from_suite
def test_profiler_all_expectation_types_sqlalchemy(
    titanic_data_context_modular_api,
    taxi_validator_sqlalchemy,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for SQLAlchemy
    """
    if taxi_validator_sqlalchemy is None:
        pytest.skip("taxi_validator_sqlalchemy fixture is not available")

    context = titanic_data_context_modular_api

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        taxi_validator_sqlalchemy,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        # TODO: Add primary_or_compound_key test
        #  primary_or_compound_key=[
        #     "vendor_id",
        #     "pickup_datetime",
        #     "dropoff_datetime",
        #     "trip_distance",
        #     "pickup_location_id",
        #     "dropoff_location_id",
        #  ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 45
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
        "expect_compound_columns_to_be_unique",
    }
    assert expectations_from_suite == {
        i for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_sqlalchemy]
    )

    assert results["success"]
Example #13
def test_profiler_all_expectation_types(titanic_data_context,
                                        possible_expectations_set):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected
    """
    context = titanic_data_context
    df = ge.read_csv(
        file_relative_path(
            __file__,
            "../test_sets/taxi_yellow_trip_data_samples/yellow_trip_data_sample_2019-01.csv",
        ))
    batch_df = ge.dataset.PandasDataset(df)

    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        batch_df,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 46
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
    }
    assert expectations_from_suite == {
        i
        for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator("action_list_operator",
                                              assets_to_validate=[batch_df])

    assert results["success"]
def test_notebook_execution_onboarding_data_assistant_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """
    To set this test up we:

    - create a suite using User-Configurable Profiler
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (Note this will raise various errors like
    CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern":
                            "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name":
                            "Asset",
                            "base_directory":
                            f"{root_dir}/../data/titanic",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name":
                            "Asset",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "batch_identifiers":
                    ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(
        expectation_suite_name="warning") == {}

    # Create notebook
    # do not want to actually send usage_message, since the function call is not the result of actual usage
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=None,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.
        UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir,
                                           "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced=
        "context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning"))
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    expected_expectation_configurations: List[ExpectationConfiguration] = [
        ExpectationConfiguration(
            **{
                "kwargs": {
                    "max_value": 1313,
                    "min_value": 1313
                },
                "expectation_type": "expect_table_row_count_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {},
                            "metric_dependencies": None,
                            "metric_name": "table.row_count",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
            }),
        ExpectationConfiguration(
            **{
                "kwargs": {
                    "column_set": [
                        "Age",
                        "Name",
                        "PClass",
                        "Sex",
                        "SexCode",
                        "Survived",
                        "Unnamed: 0",
                    ],
                    "exact_match":
                    None,
                },
                "expectation_type": "expect_table_columns_to_match_set",
                "meta": {
                    "profiler_details": {
                        "success_ratio": 1.0
                    }
                },
            }),
    ]

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)

    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in suite.expectations:
        kwargs: dict = expectation_configuration.kwargs
        key: str
        value: Any
        kwargs = {
            key: sorted(value) if isinstance(value, (list, set, tuple)) else value
            for key, value in kwargs.items()
        }
        expectation_configuration.kwargs = kwargs
        expectation_configurations.append(expectation_configuration)

    assert expectation_configurations == expected_expectation_configurations

    columns_with_expectations: Set[str]
    expectations_from_suite: Set[str]
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite=suite)

    expected_expectations: Set[str] = {
        "expect_table_row_count_to_be_between",
        "expect_table_columns_to_match_set",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations
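The run_notebook helper called in the example above is not included in these snippets; Example #15 below performs the same steps inline with nbformat and ExecutePreprocessor. A sketch along those lines, treated as an assumption about what the helper does rather than its actual source:

import os
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

def run_notebook(notebook_path, notebook_dir, string_to_be_replaced, replacement_string):
    # Load the rendered notebook and strip the call we do not want executed in CI
    # (e.g. opening Data Docs in a browser), then execute every cell.
    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)
    for cell in nb.cells:
        if cell.cell_type == "code":
            cell.source = cell.source.replace(string_to_be_replaced, replacement_string)
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": notebook_dir}})
    # Keep the executed notebook on disk, which is useful when debugging failures.
    with open(os.path.join(notebook_dir, "output.ipynb"), "w") as f:
        nbformat.write(nb, f)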
Example #15
def test_notebook_execution_with_pandas_backend(
        titanic_data_context_no_data_docs):
    """
    This tests that the notebook is written to disk and executes without error.

    To set this test up we:
    - create a scaffold notebook
    - verify that no validations have happened

    We then:
    - execute that notebook (Note this will raise various errors like
    CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    # Since we'll run the notebook, we use a context with no data docs to avoid
    # the renderer's default behavior of building and opening docs, which is not
    # part of this test.
    context = titanic_data_context_no_data_docs
    root_dir = context.root_directory
    uncommitted_dir = os.path.join(root_dir, "uncommitted")
    suite_name = "my_suite"
    suite = context.create_expectation_suite(suite_name)

    csv_path = os.path.join(root_dir, "..", "data", "Titanic.csv")
    batch_kwargs = {"datasource": "mydatasource", "path": csv_path}

    # Sanity check test setup
    assert context.list_expectation_suite_names() == [suite_name]
    assert context.list_datasources() == [{
        "module_name": "great_expectations.datasource",
        "class_name": "PandasDatasource",
        "data_asset_type": {
            "module_name": "great_expectations.dataset",
            "class_name": "PandasDataset",
        },
        "batch_kwargs_generators": {
            "mygenerator": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": "../data",
            }
        },
        "name": "mydatasource",
    }]
    assert context.get_validation_result(suite_name) == {}
    notebook_path = os.path.join(uncommitted_dir, f"{suite_name}.ipynb")
    assert not os.path.isfile(notebook_path)

    # Create notebook
    renderer = SuiteScaffoldNotebookRenderer(titanic_data_context_no_data_docs,
                                             suite, batch_kwargs)
    renderer.render_to_disk(notebook_path)
    assert os.path.isfile(notebook_path)

    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    # Run notebook
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    # Useful to inspect executed notebook
    output_notebook = os.path.join(uncommitted_dir, "output.ipynb")
    with open(output_notebook, "w") as f:
        nbformat.write(nb, f)

    # Assertions about output
    context = DataContext(root_dir)
    obs_validation_result = context.get_validation_result(suite_name)
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100,
    }
    suite = context.get_expectation_suite(suite_name)

    assert suite.expectations
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations
def test_expect_compound_columns_to_be_unique(
    taxi_validator_spark, taxi_data_ignored_columns, caplog
):
    """
    Until all ExecutionEngine implementations for V3 are completed for this expectation:
    1) Use the "taxi_validator_" argument for this test method, corresponding to one of the ExecutionEngine subclasses,
       for which this expectation has not yet been implemented (and update the :param annotation below accordingly);
    2) With every additional ExecutionEngine implementation for this expectation, update the corresponding
       "test_profiler_all_expectation_types_" test method to include this expectation in the appropriate assertion.
    3) Once this expectation has been implemented for all ExecutionEngine subclasses, delete this test method entirely.

    :param taxi_validator_spark:
    :param taxi_data_ignored_columns:
    :param caplog:
    :return:
    """

    taxi_validator = taxi_validator_spark

    ignored_columns = taxi_data_ignored_columns + [
        "pickup_datetime",
        "dropoff_datetime",
        "total_amount",
        "passenger_count",
        "payment_type",
        "rate_code_id",
        "store_and_fwd_flag",
        "passenger_count",
        "store_and_fwd_flag",
        "vendor_id",
        "trip_distance",
    ]

    profiler = UserConfigurableProfiler(
        taxi_validator,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )
    with caplog.at_level(logging.WARNING):
        suite = profiler.build_suite()

    log_warning_records = list(
        filter(lambda record: record.levelname == "WARNING", caplog.records)
    )
    assert len(log_warning_records) == 0
    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_compound_columns_to_be_unique",
    }

    assert expected_expectations == expectations_from_suite

    profiler_with_single_column_key = UserConfigurableProfiler(
        taxi_validator,
        ignored_columns=ignored_columns,
        primary_or_compound_key=["pickup_datetime"],
    )

    suite = profiler_with_single_column_key.build_suite()

    assert len(suite.expectations) == 3

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
        "expect_column_values_to_be_unique",
    }

    assert expected_expectations == expectations_from_suite
def test_build_suite_when_suite_already_exists(
    mock_emit,
    cardinality_validator,
):
    """
    What does this test do and why?
    Confirms that creating a new suite on an existing profiler wipes the previous suite
    """
    profiler = UserConfigurableProfiler(
        cardinality_validator,
        table_expectations_only=True,
        excluded_expectations=["expect_table_row_count_to_be_between"],
    )

    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_columns_to_match_ordered_list" in expectations

    profiler.excluded_expectations = ["expect_table_columns_to_match_ordered_list"]
    suite = profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(suite)
    assert len(suite.expectations) == 1
    assert "expect_table_row_count_to_be_between" in expectations

    assert mock_emit.call_count == 2

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": True,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": False,
                    "semantic_types_dict_specified": False,
                    "table_expectations_only": True,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events
def test_build_suite_with_semantic_types_dict(
    mock_emit,
    cardinality_validator,
    possible_expectations_set,
):
    """
    What does this test do and why?
    Tests that the build_suite function works as expected with a semantic_types dict
    """

    semantic_types = {
        "numeric": ["col_few", "col_many", "col_very_many"],
        "value_set": ["col_two", "col_very_few"],
    }

    profiler = UserConfigurableProfiler(
        cardinality_validator,
        semantic_types_dict=semantic_types,
        primary_or_compound_key=["col_unique"],
        ignored_columns=["col_one"],
        value_set_threshold="unique",
        table_expectations_only=False,
        excluded_expectations=["expect_column_values_to_not_be_null"],
    )
    suite = profiler.build_suite()
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    assert "column_one" not in columns_with_expectations
    assert "expect_column_values_to_not_be_null" not in expectations_from_suite
    assert expectations_from_suite.issubset(possible_expectations_set)
    assert len(suite.expectations) == 32

    value_set_expectations = [
        i
        for i in suite.expectations
        if i.expectation_type == "expect_column_values_to_be_in_set"
    ]
    value_set_columns = {i.kwargs.get("column") for i in value_set_expectations}

    assert len(value_set_columns) == 2
    assert value_set_columns == {"col_two", "col_very_few"}

    # Note 20211209 - Profiler will also call ExpectationSuite's add_expectation(), but it will not
    # send a usage_stats event when called from a Profiler.
    assert mock_emit.call_count == 1

    # noinspection PyUnresolvedReferences
    expected_events: List[unittest.mock._Call]
    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call]

    expected_events = [
        mock.call(
            {
                "event": "legacy_profiler.build_suite",
                "event_payload": {
                    "profile_dataset_type": "Validator",
                    "excluded_expectations_specified": True,
                    "ignored_columns_specified": True,
                    "not_null_only": False,
                    "primary_or_compound_key_specified": True,
                    "semantic_types_dict_specified": True,
                    "table_expectations_only": False,
                    "value_set_threshold_specified": True,
                    "api_version": "v2",
                },
                "success": True,
            }
        ),
    ]
    actual_events = mock_emit.call_args_list
    assert actual_events == expected_events