예제 #1
0
def _get_column_quantiles_generic_sqlalchemy(
    column,
    quantiles: Iterable,
    allow_relative_error: bool,
    dialect,
    selectable,
    sqlalchemy_engine,
) -> list:
    """Compute the requested quantiles of ``column`` with ``percentile_disc``.

    One ``percentile_disc(q) WITHIN GROUP (ORDER BY column ASC)`` expression
    is built per requested quantile and all of them are evaluated in a single
    query against ``selectable``. If the engine rejects that query with a
    ``ProgrammingError`` and ``attempt_allowing_relative_error(dialect)`` is
    truthy, an approximate variant generated by
    ``get_approximate_percentile_disc_sql`` is tried instead, provided the
    caller opted in through ``allow_relative_error``.

    Args:
        column: SQLAlchemy column expression to compute quantiles over.
        quantiles: Quantile values passed one by one to ``percentile_disc``.
        allow_relative_error: Whether the approximate fallback may be used
            when the exact query is rejected.
        dialect: SQL engine dialect; decides whether a fallback exists and is
            forwarded to the approximate SQL generator.
        selectable: The FROM target of the query.
        sqlalchemy_engine: Object whose ``execute`` method runs the query.

    Returns:
        The values of the single result row, one per requested quantile.

    Raises:
        ValueError: If the exact query is rejected and either the dialect has
            no approximate fallback or ``allow_relative_error`` is falsy.
        ProgrammingError: If the approximate query is rejected as well.
    """
    selects: List[WithinGroup] = [
        sa.func.percentile_disc(quantile).within_group(column.asc())
        for quantile in quantiles
    ]
    quantiles_query: Select = sa.select(selects).select_from(selectable)

    try:
        exact_row: RowProxy = sqlalchemy_engine.execute(quantiles_query).fetchone()
        return list(exact_row)
    except ProgrammingError as exact_error:
        # ProgrammingError: (psycopg2.errors.SyntaxError) Aggregate function "percentile_disc" is not supported;
        # use approximate percentile_disc or percentile_cont instead.
        if not attempt_allowing_relative_error(dialect):
            raise ValueError(
                f'The SQL engine dialect "{str(dialect)}" does not support computing quantiles with '
                "approximation error; set allow_relative_error to False to disable approximate quantiles."
            ) from exact_error

        # Check the caller's opt-in before doing any work on the approximate
        # query, so a refusal never depends on building SQL that will not run.
        if not allow_relative_error:
            raise ValueError(
                f'The SQL engine dialect "{str(dialect)}" does not support computing quantiles '
                "without approximation error; set allow_relative_error to True to allow approximate quantiles."
            ) from exact_error

        # Redshift does not have a percentile_disc method, but does support an approximate version.
        sql_approx: str = get_approximate_percentile_disc_sql(
            selects=selects, sql_engine_dialect=dialect
        )
        selects_approx: List[TextClause] = [sa.text(sql_approx)]
        quantiles_query_approx: Select = sa.select(selects_approx).select_from(
            selectable
        )
        try:
            approx_row: RowProxy = sqlalchemy_engine.execute(
                quantiles_query_approx
            ).fetchone()
            return list(approx_row)
        except ProgrammingError as pe:
            exception_traceback: str = traceback.format_exc()
            exception_message: str = (
                "An SQL syntax Exception occurred. "
                f'{type(pe).__name__}: "{str(pe)}".  Traceback: "{exception_traceback}".'
            )
            logger.error(exception_message)
            # Bare raise keeps the original traceback intact.
            raise
예제 #2
0
    def _build_expectations_numeric(self, profile_dataset, column, **kwargs):
        """
        Adds a set of numeric expectations for a given column.

        For min, max, mean and median, each expectation that is not listed in
        ``self.excluded_expectations`` is first run with open bounds to read
        its observed value and is then re-run with both bounds pinned to that
        value. The quantile expectation is handled the same way for the
        quantiles 0.05, 0.25, 0.5, 0.75 and 0.95. Whenever no usable observed
        value is obtained (a NaN, or exception info on the quantile result),
        the expectation is removed from the suite for this column instead.

        Args:
            profile_dataset: A GE Dataset (a Validator is also handled)
            column: The column for which to add expectations
            **kwargs: Accepted but not used by this method

        Returns:
            The GE Dataset
        """
        for expectation_type in (
            "expect_column_min_to_be_between",
            "expect_column_max_to_be_between",
            "expect_column_mean_to_be_between",
            "expect_column_median_to_be_between",
        ):
            if expectation_type not in self.excluded_expectations:
                self._pin_expectation_to_observed_value(
                    profile_dataset, column, expectation_type
                )

        quantile_expectation = "expect_column_quantile_values_to_be_between"
        if quantile_expectation in self.excluded_expectations:
            return profile_dataset

        # Default for dataset / execution-engine types not recognised below, so
        # the name is always bound instead of raising UnboundLocalError.
        # NOTE(review): False is assumed to be the expectation's own default
        # for allow_relative_error -- confirm against the expectation signature.
        allow_relative_error = False
        if isinstance(profile_dataset, Dataset):
            if isinstance(profile_dataset, PandasDataset):
                allow_relative_error = "lower"
            else:
                allow_relative_error = (
                    profile_dataset.attempt_allowing_relative_error()
                )
        elif isinstance(profile_dataset, Validator):
            execution_engine = profile_dataset.execution_engine
            if isinstance(execution_engine, PandasExecutionEngine):
                allow_relative_error = "lower"
            elif isinstance(execution_engine, SparkDFExecutionEngine):
                allow_relative_error = 0.0
            elif isinstance(execution_engine, SqlAlchemyExecutionEngine):
                allow_relative_error = attempt_allowing_relative_error(
                    execution_engine.engine.dialect
                )

        # First pass: open bounds everywhere, only to read the observed values.
        quantile_result = profile_dataset.expect_column_quantile_values_to_be_between(
            column,
            quantile_ranges={
                "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
                "value_ranges": [
                    [None, None],
                    [None, None],
                    [None, None],
                    [None, None],
                    [None, None],
                ],
            },
            allow_relative_error=allow_relative_error,
            result_format="SUMMARY",
        )

        exception_info = quantile_result.exception_info
        if exception_info and (
            exception_info["exception_traceback"]
            or exception_info["exception_message"]
        ):
            # The first pass failed: drop the expectation for this column.
            profile_dataset._expectation_suite.remove_expectation(
                ExpectationConfiguration(
                    expectation_type=quantile_expectation,
                    kwargs={"column": column},
                ),
                match_type="domain",
            )
            logger.debug(exception_info["exception_traceback"])
            logger.debug(exception_info["exception_message"])
        else:
            # Second pass: pin every quantile to the value just observed.
            observed = quantile_result.result["observed_value"]
            profile_dataset.expect_column_quantile_values_to_be_between(
                column,
                quantile_ranges={
                    "quantiles": observed["quantiles"],
                    "value_ranges": [[v, v] for v in observed["values"]],
                },
                allow_relative_error=allow_relative_error,
            )
        return profile_dataset

    def _pin_expectation_to_observed_value(
        self, profile_dataset, column, expectation_type
    ):
        """
        Pins a single-value ``*_to_be_between`` expectation to its observed value.

        Runs the expectation named ``expectation_type`` with open bounds, then
        re-runs it with ``min_value`` and ``max_value`` both set to the
        observed value. If ``self._is_nan`` reports the observed value as NaN,
        the expectation is removed from the suite for this column and a debug
        message is logged instead.

        Args:
            profile_dataset: Object exposing the expectation as a method
            column: The column the expectation applies to
            expectation_type: Name of the expectation method to call
        """
        expectation = getattr(profile_dataset, expectation_type)
        observed_value = expectation(
            column,
            min_value=None,
            max_value=None,
            result_format="SUMMARY",
        ).result["observed_value"]

        if not self._is_nan(observed_value):
            expectation(
                column,
                min_value=observed_value,
                max_value=observed_value,
            )
            return

        profile_dataset._expectation_suite.remove_expectation(
            ExpectationConfiguration(
                expectation_type=expectation_type,
                kwargs={"column": column},
            ),
            match_type="domain",
        )
        logger.debug(
            f"Skipping {expectation_type} because observed value is nan: {observed_value}"
        )