    @classmethod
    def _profile(cls, dataset, configuration=None):
        logger.debug(f"Running profiler with configuration: {configuration}")
        if configuration == "demo":
            return cls._demo_profile(dataset)

        existing_columns = dataset.get_table_columns()
        selected_columns = existing_columns
        included_expectations = []
        excluded_expectations = []

        if configuration:
            if ("included_expectations" in configuration
                    and "excluded_expectations" in configuration):
                raise ProfilerError(
                    "Please specify either `included_expectations` or `excluded_expectations`."
                )
            if "included_expectations" in configuration:
                included_expectations = configuration["included_expectations"]
                if included_expectations in [False, None, []]:
                    included_expectations = None
                _check_that_expectations_are_available(dataset,
                                                       included_expectations)
            if "excluded_expectations" in configuration:
                excluded_expectations = configuration["excluded_expectations"]
                if excluded_expectations in [False, None, []]:
                    excluded_expectations = None
                _check_that_expectations_are_available(dataset,
                                                       excluded_expectations)

            if ("included_columns" in configuration
                    and "excluded_columns" in configuration):
                raise ProfilerError(
                    "Please specify either `excluded_columns` or `included_columns`."
                )
            elif "included_columns" in configuration:
                selected_columns = configuration["included_columns"]
                if selected_columns in [False, None, []]:
                    selected_columns = []
            elif "excluded_columns" in configuration:
                excluded_columns = configuration["excluded_columns"]
                if excluded_columns in [False, None, []]:
                    excluded_columns = []
                selected_columns = set(existing_columns) - set(
                    excluded_columns)

        _check_that_columns_exist(dataset, selected_columns)
        if included_expectations is None:
            suite = cls._build_column_description_metadata(dataset)
            # drop the column-existence expectations generated above,
            # keeping only the column description metadata
            suite.expectations = []
            return suite

        dataset.set_default_expectation_argument("catch_exceptions", False)
        dataset = cls._build_table_row_count_expectation(
            dataset,
            excluded_expectations=excluded_expectations,
            included_expectations=included_expectations,
        )
        dataset.set_config_value("interactive_evaluation", True)
        dataset = cls._build_table_column_expectations(
            dataset,
            excluded_expectations=excluded_expectations,
            included_expectations=included_expectations,
        )

        column_cache = {}
        if selected_columns:
            with tqdm(total=len(selected_columns),
                      desc="Profiling Columns",
                      delay=5) as pbar:
                for column in selected_columns:
                    pbar.set_postfix_str(column)
                    cardinality = cls._get_column_cardinality_with_caching(
                        dataset, column, column_cache)
                    column_type = cls._get_column_type_with_caching(
                        dataset, column, column_cache)

                    if cardinality in [
                            ProfilerCardinality.TWO,
                            ProfilerCardinality.VERY_FEW,
                            ProfilerCardinality.FEW,
                    ]:
                        cls._create_expectations_for_low_card_column(
                            dataset, column, column_cache)
                    elif cardinality in [
                            ProfilerCardinality.MANY,
                            ProfilerCardinality.VERY_MANY,
                            ProfilerCardinality.UNIQUE,
                    ]:
                    # TODO we will want to finesse the number and types of
                    #  expectations created here. The simple version is a
                    #  deny/allow list; the more complex version is a desired
                    #  set per column type and cardinality. This deserves more
                    #  thought on configuration.
                        dataset.expect_column_values_to_be_unique(column)

                        if column_type in [
                                ProfilerDataType.INT,
                                ProfilerDataType.FLOAT,
                        ]:
                            cls._create_expectations_for_numeric_column(
                                dataset, column)
                        elif column_type in [ProfilerDataType.DATETIME]:
                            cls._create_expectations_for_datetime_column(
                                dataset,
                                column,
                                excluded_expectations=excluded_expectations,
                                included_expectations=included_expectations,
                            )
                        elif column_type in [ProfilerDataType.STRING]:
                            cls._create_expectations_for_string_column(
                                dataset,
                                column,
                                excluded_expectations=excluded_expectations,
                                included_expectations=included_expectations,
                            )
                        elif column_type in [ProfilerDataType.UNKNOWN]:
                            logger.debug(
                                f"Skipping expectation creation for column {column} of unknown type: {column_type}"
                            )
                    pbar.update()

        if excluded_expectations:
            # NOTE: we reach into a private member here because of an expected future
            # refactor that will make the suite directly accessible
            dataset._expectation_suite.remove_all_expectations_of_type(
                excluded_expectations)
        if included_expectations:
            for expectation in dataset.get_expectation_suite(
                    discard_failed_expectations=False,
                    suppress_logging=True,
            ).expectations:
                if expectation.expectation_type not in included_expectations:
                    try:
                        dataset.remove_expectation(
                            ExpectationConfiguration(
                                expectation_type=expectation.expectation_type,
                                kwargs=expectation.kwargs,
                            ),
                            match_type="domain",
                            remove_multiple_matches=True,
                        )
                    except ValueError:
                        logger.debug(
                            f"Attempted to remove {expectation}, which was not found."
                        )

        expectation_suite = cls._build_column_description_metadata(dataset)

        return expectation_suite
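
    # A minimal usage sketch of the method above. `SuiteBuilderProfiler` is a
    # placeholder class name and the column/expectation names are illustrative
    # assumptions; only the configuration keys parsed above are exercised:
    #
    #     configuration = {
    #         "included_columns": ["fare_amount"],
    #         "excluded_expectations": ["expect_column_values_to_be_unique"],
    #     }
    #     suite = SuiteBuilderProfiler._profile(dataset,
    #                                           configuration=configuration)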

    @classmethod
    def _profile(cls, dataset, configuration=None):
        logger.debug(f"Running profiler with configuration: {configuration}")
        if configuration == "demo":
            return cls._demo_profile(dataset)

        existing_columns = dataset.get_table_columns()
        selected_columns = existing_columns
        included_expectations = []
        excluded_expectations = []

        if configuration:
            if ("included_expectations" in configuration
                    and "excluded_expectations" in configuration):
                raise ProfilerError(
                    "Please specify either `included_expectations` or `excluded_expectations`."
                )
            if "included_expectations" in configuration:
                included_expectations = configuration["included_expectations"]
                if included_expectations in [False, None, []]:
                    included_expectations = None
                _check_that_expectations_are_available(dataset,
                                                       included_expectations)
            if "excluded_expectations" in configuration:
                excluded_expectations = configuration["excluded_expectations"]
                _check_that_expectations_are_available(dataset,
                                                       excluded_expectations)

            if ("included_columns" in configuration
                    and "excluded_columns" in configuration):
                raise ProfilerError(
                    "Please specify either `excluded_columns` or `included_columns`."
                )
            elif "included_columns" in configuration:
                selected_columns = configuration["included_columns"]
            elif "excluded_columns" in configuration:
                excluded_columns = configuration["excluded_columns"]
                if excluded_columns in [False, None, []]:
                    excluded_columns = []
                selected_columns = set(existing_columns) - set(
                    excluded_columns)

        _check_that_columns_exist(dataset, selected_columns)
        if included_expectations is None:
            suite = cls._build_column_description_metadata(dataset)
            # drop the column-existence expectations generated above,
            # keeping only the column description metadata
            suite.expectations = []
            return suite

        dataset.set_default_expectation_argument("catch_exceptions", False)
        dataset = cls._build_table_row_count_expectation(dataset,
                                                         tolerance=0.1)
        dataset.set_config_value("interactive_evaluation", True)
        dataset = cls._build_table_column_expectations(dataset)

        column_cache = {}
        if selected_columns:
            for column in selected_columns:
                cardinality = cls._get_column_cardinality_with_caching(
                    dataset, column, column_cache)
                column_type = cls._get_column_type_with_caching(
                    dataset, column, column_cache)

                if cardinality in [
                        ProfilerCardinality.TWO,
                        ProfilerCardinality.VERY_FEW,
                        ProfilerCardinality.FEW,
                ]:
                    cls._create_expectations_for_low_card_column(
                        dataset, column, column_cache)
                elif cardinality in [
                        ProfilerCardinality.MANY,
                        ProfilerCardinality.VERY_MANY,
                        ProfilerCardinality.UNIQUE,
                ]:
                    # TODO we will want to finesse the number and types of
                    #  expectations created here. The simple version is
                    #  blacklisting, and the more complex version is a desired
                    #  set per column type and cardinality. This deserves more
                    #  thought on configuration.
                    dataset.expect_column_values_to_be_unique(column)

                    if column_type in [
                            ProfilerDataType.INT, ProfilerDataType.FLOAT
                    ]:
                        cls._create_expectations_for_numeric_column(
                            dataset, column)
                    elif column_type in [ProfilerDataType.DATETIME]:
                        cls._create_expectations_for_datetime_column(
                            dataset, column)
                    elif column_type in [ProfilerDataType.STRING]:
                        cls._create_expectations_for_string_column(
                            dataset, column)
                    elif column_type in [ProfilerDataType.UNKNOWN]:
                        logger.debug(
                            f"Skipping expectation creation for column {column} of unknown type: {column_type}"
                        )

        if excluded_expectations:
            dataset = _remove_table_expectations(dataset,
                                                 excluded_expectations)
            dataset = _remove_column_expectations(dataset,
                                                  excluded_expectations)
        if included_expectations:
            for expectation in dataset.get_expectation_suite().expectations:
                if expectation.expectation_type not in included_expectations:
                    try:
                        dataset.remove_expectation(
                            expectation_type=expectation.expectation_type,
                            expectation_kwargs=expectation.kwargs,
                            column=expectation.kwargs.get("column", None),
                            remove_multiple_matches=True,
                        )
                    except ValueError:
                        logger.debug(
                            f"Attempted to remove {expectation}, which was not found."
                        )

        expectation_suite = cls._build_column_description_metadata(dataset)

        return expectation_suite
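
    # This older variant removes expectations via keyword arguments
    # (expectation_type=..., expectation_kwargs=...). A sketch of the
    # equivalent call in the newer API used by the first variant above,
    # assuming ExpectationConfiguration is importable from
    # great_expectations.core (column and expectation names are illustrative):
    #
    #     from great_expectations.core import ExpectationConfiguration
    #
    #     dataset.remove_expectation(
    #         ExpectationConfiguration(
    #             expectation_type="expect_column_values_to_be_unique",
    #             kwargs={"column": "fare_amount"},
    #         ),
    #         match_type="domain",
    #         remove_multiple_matches=True,
    #     )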

    @classmethod
    def _create_expectations_for_numeric_column(
            cls,
            dataset,
            column,
            excluded_expectations=None,
            included_expectations=None) -> None:
        cls._create_non_nullity_expectations(
            dataset,
            column,
            excluded_expectations=excluded_expectations,
            included_expectations=included_expectations,
        )

        if (not excluded_expectations or "expect_column_min_to_be_between"
                not in excluded_expectations
            ) and (not included_expectations or
                   "expect_column_min_to_be_between" in included_expectations):
            observed_min = dataset.expect_column_min_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY").result["observed_value"]
            if not is_nan(observed_min):
                dataset.expect_column_min_to_be_between(
                    column,
                    min_value=observed_min - 1,
                    max_value=observed_min + 1)
            else:
                logger.debug(
                    f"Skipping expect_column_min_to_be_between because observed value is nan: {observed_min}"
                )

        if (not excluded_expectations or "expect_column_max_to_be_between"
                not in excluded_expectations
            ) and (not included_expectations or
                   "expect_column_max_to_be_between" in included_expectations):
            observed_max = dataset.expect_column_max_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY").result["observed_value"]
            if not is_nan(observed_max):
                dataset.expect_column_max_to_be_between(
                    column,
                    min_value=observed_max - 1,
                    max_value=observed_max + 1)
            else:
                logger.debug(
                    f"Skipping expect_column_max_to_be_between because observed value is nan: {observed_max}"
                )

        if (not excluded_expectations or "expect_column_mean_to_be_between"
                not in excluded_expectations) and (
                    not included_expectations
                    or "expect_column_mean_to_be_between"
                    in included_expectations):
            observed_mean = dataset.expect_column_mean_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY").result["observed_value"]
            if not is_nan(observed_mean):
                dataset.expect_column_mean_to_be_between(
                    column,
                    min_value=observed_mean - 1,
                    max_value=observed_mean + 1)
            else:
                logger.debug(
                    f"Skipping expect_column_mean_to_be_between because observed value is nan: {observed_mean}"
                )

        if (not excluded_expectations or "expect_column_median_to_be_between"
                not in excluded_expectations) and (
                    not included_expectations
                    or "expect_column_median_to_be_between"
                    in included_expectations):
            observed_median = dataset.expect_column_median_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY").result["observed_value"]
            if not is_nan(observed_median):
                dataset.expect_column_median_to_be_between(
                    column,
                    min_value=observed_median - 1,
                    max_value=observed_median + 1)
            else:
                logger.debug(
                    f"Skipping expect_column_median_to_be_between because observed value is nan: {observed_median}"
                )

        allow_relative_error: bool = dataset.attempt_allowing_relative_error()

        if (not excluded_expectations
                or "expect_column_quantile_values_to_be_between"
                not in excluded_expectations) and (
                    not included_expectations
                    or "expect_column_quantile_values_to_be_between"
                    in included_expectations):
            quantile_result = dataset.expect_column_quantile_values_to_be_between(
                column,
                quantile_ranges={
                    "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
                    "value_ranges": [
                        [None, None],
                        [None, None],
                        [None, None],
                        [None, None],
                        [None, None],
                    ],
                },
                allow_relative_error=allow_relative_error,
                result_format="SUMMARY",
                catch_exceptions=True,
            )
            if quantile_result.exception_info and (
                    quantile_result.exception_info["exception_traceback"]
                    or quantile_result.exception_info["exception_message"]):
                # TODO quantiles are not implemented correctly on sqlite, and likely other sql dialects
                logger.debug(
                    quantile_result.exception_info["exception_traceback"])
                logger.debug(
                    quantile_result.exception_info["exception_message"])
            else:
                dataset.set_config_value("interactive_evaluation", False)
                dataset.expect_column_quantile_values_to_be_between(
                    column,
                    quantile_ranges={
                        "quantiles":
                        quantile_result.result["observed_value"]["quantiles"],
                        "value_ranges":
                        [[v - 1, v + 1] for v in
                         quantile_result.result["observed_value"]["values"]],
                    },
                    allow_relative_error=allow_relative_error,
                    catch_exceptions=True,
                )
                dataset.set_config_value("interactive_evaluation", True)

    @classmethod
    def _create_expectations_for_numeric_column(cls, dataset, column):
        cls._create_non_nullity_expectations(dataset, column)

        value = dataset.expect_column_min_to_be_between(
            column, min_value=None, max_value=None,
            result_format="SUMMARY").result["observed_value"]
        dataset.expect_column_min_to_be_between(column,
                                                min_value=value - 1,
                                                max_value=value + 1)

        value = dataset.expect_column_max_to_be_between(
            column, min_value=None, max_value=None,
            result_format="SUMMARY").result["observed_value"]
        dataset.expect_column_max_to_be_between(column,
                                                min_value=value - 1,
                                                max_value=value + 1)

        value = dataset.expect_column_mean_to_be_between(
            column, min_value=None, max_value=None,
            result_format="SUMMARY").result["observed_value"]
        dataset.expect_column_mean_to_be_between(column,
                                                 min_value=value - 1,
                                                 max_value=value + 1)

        value = dataset.expect_column_median_to_be_between(
            column, min_value=None, max_value=None,
            result_format="SUMMARY").result["observed_value"]
        dataset.expect_column_median_to_be_between(column,
                                                   min_value=value - 1,
                                                   max_value=value + 1)

        result = dataset.expect_column_quantile_values_to_be_between(
            column,
            quantile_ranges={
                "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
                "value_ranges": [
                    [None, None],
                    [None, None],
                    [None, None],
                    [None, None],
                    [None, None],
                ],
            },
            result_format="SUMMARY",
            catch_exceptions=True)
        if result.exception_info and (
                result.exception_info["exception_traceback"]
                or result.exception_info["exception_message"]):
            # TODO quantiles are not implemented correctly on sqlite, and likely other sql dialects
            logger.debug(result.exception_info["exception_traceback"])
            logger.debug(result.exception_info["exception_message"])
        else:
            dataset.set_config_value("interactive_evaluation", False)
            dataset.expect_column_quantile_values_to_be_between(
                column,
                quantile_ranges={
                    "quantiles":
                    result.result["observed_value"]["quantiles"],
                    "value_ranges":
                    [[v - 1, v + 1]
                     for v in result.result["observed_value"]["values"]],
                },
                catch_exceptions=True)
            dataset.set_config_value("interactive_evaluation", True)
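
    # Worked example of the quantile follow-up above (the numbers are
    # illustrative assumptions): if the SUMMARY call observed
    #     quantiles = [0.05, 0.25, 0.5, 0.75, 0.95]
    #     values    = [2, 10, 20, 30, 38]
    # then the second call pins each quantile to within ±1 of what was seen:
    #     value_ranges = [[1, 3], [9, 11], [19, 21], [29, 31], [37, 39]]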