def _get_column_quantiles_generic_sqlalchemy(
    column,
    quantiles: Iterable,
    allow_relative_error: bool,
    dialect,
    selectable,
    sqlalchemy_engine,
) -> list:
    """Compute quantiles of ``column`` with ``percentile_disc``, with an approximate fallback.

    One ``percentile_disc(q) WITHIN GROUP (ORDER BY column ASC)`` expression is
    built per requested quantile and all of them are selected from
    ``selectable`` in a single query. If that query raises
    ``ProgrammingError``, an approximate variant of the same expressions is
    tried, provided both the dialect and the caller allow it.

    Args:
        column: SQLAlchemy column expression to compute quantiles over.
        quantiles: Quantile values passed to ``percentile_disc``.
        allow_relative_error: Whether the caller accepts approximate results.
        dialect: SQL engine dialect; decides whether an approximate fallback
            is attempted and is passed to the approximate-SQL builder.
        selectable: The FROM target of the query.
        sqlalchemy_engine: Object whose ``execute`` runs the query.

    Returns:
        The first row of the query result, converted to a list.

    Raises:
        ValueError: The exact query failed and either the dialect is not one
            for which approximation is attempted, or ``allow_relative_error``
            is false. The original ``ProgrammingError`` is attached as the cause.
        ProgrammingError: The approximate query failed as well (logged, then
            re-raised unchanged).
    """
    selects: List[WithinGroup] = [
        sa.func.percentile_disc(quantile).within_group(column.asc())
        for quantile in quantiles
    ]
    quantiles_query: Select = sa.select(selects).select_from(selectable)

    try:
        quantiles_results: RowProxy = sqlalchemy_engine.execute(
            quantiles_query
        ).fetchone()
        return list(quantiles_results)
    except ProgrammingError as exact_error:
        # ProgrammingError: (psycopg2.errors.SyntaxError) Aggregate function "percentile_disc" is not supported;
        # use approximate percentile_disc or percentile_cont instead.
        if not attempt_allowing_relative_error(dialect):
            raise ValueError(
                f'The SQL engine dialect "{str(dialect)}" does not support computing quantiles with '
                "approximation error; set allow_relative_error to False to disable approximate quantiles."
            ) from exact_error

        if not allow_relative_error:
            raise ValueError(
                f'The SQL engine dialect "{str(dialect)}" does not support computing quantiles '
                "without approximation error; set allow_relative_error to True to allow approximate quantiles."
            ) from exact_error

        # Redshift does not have a percentile_disc method, but does support an approximate version.
        # The approximate SQL is only built once we know it will actually be executed.
        sql_approx: str = get_approximate_percentile_disc_sql(
            selects=selects, sql_engine_dialect=dialect
        )
        selects_approx: List[TextClause] = [sa.text(sql_approx)]
        quantiles_query_approx: Select = sa.select(selects_approx).select_from(
            selectable
        )

        try:
            quantiles_results = sqlalchemy_engine.execute(
                quantiles_query_approx
            ).fetchone()
            return list(quantiles_results)
        except ProgrammingError as pe:
            exception_traceback: str = traceback.format_exc()
            # The trailing space keeps the sentence and the exception name apart in the log.
            exception_message: str = (
                "An SQL syntax Exception occurred. "
                f'{type(pe).__name__}: "{str(pe)}". Traceback: "{exception_traceback}".'
            )
            logger.error(exception_message)
            # Bare raise re-raises the active exception with its traceback untouched.
            raise
def _build_expectations_numeric(self, profile_dataset, column, **kwargs):
    """
    Adds a set of numeric expectations for a given column.

    For each of min, max, mean and median (unless its expectation name is in
    ``self.excluded_expectations``) the expectation is first run with both
    bounds set to ``None`` to read the observed value, then run again with
    both bounds set to that value. When the observed value is NaN the
    expectation is removed from the suite instead and a debug message is
    logged.

    The quantile expectation follows the same two-step pattern for the
    quantiles 0.05, 0.25, 0.5, 0.75 and 0.95; it is removed from the suite
    when the first run reports an exception message or traceback.

    Args:
        profile_dataset: A GE Dataset (a Validator is handled as well)
        column: The column for which to add expectations
        **kwargs: Accepted but not used by this method

    Returns:
        The GE Dataset
    """
    # min, max, mean and median all follow the same probe-then-pin procedure.
    for statistic in ("min", "max", "mean", "median"):
        expectation_type = f"expect_column_{statistic}_to_be_between"
        if expectation_type in self.excluded_expectations:
            continue

        expectation = getattr(profile_dataset, expectation_type)
        observed_value = expectation(
            column, min_value=None, max_value=None, result_format="SUMMARY"
        ).result["observed_value"]

        if not self._is_nan(observed_value):
            expectation(
                column,
                min_value=observed_value,
                max_value=observed_value,
            )
            continue

        # Presumably the probing call above added the expectation to the
        # suite; drop it again since there is no usable value to pin.
        profile_dataset._expectation_suite.remove_expectation(
            ExpectationConfiguration(
                expectation_type=expectation_type,
                kwargs={"column": column},
            ),
            match_type="domain",
        )
        logger.debug(
            f"Skipping {expectation_type} because observed value is nan: {observed_value}"
        )

    quantile_expectation_type = "expect_column_quantile_values_to_be_between"
    if quantile_expectation_type not in self.excluded_expectations:
        # Start from a defined value so an unrecognised dataset or execution
        # engine type no longer triggers UnboundLocalError further down.
        # NOTE(review): False is assumed to match the expectation's own
        # default for allow_relative_error -- confirm.
        allow_relative_error = False
        if isinstance(profile_dataset, Dataset):
            if isinstance(profile_dataset, PandasDataset):
                allow_relative_error = "lower"
            else:
                allow_relative_error = (
                    profile_dataset.attempt_allowing_relative_error()
                )
        elif isinstance(profile_dataset, Validator):
            execution_engine = profile_dataset.execution_engine
            # Checked in this order so that precedence matches the previous
            # sequence of independent `if` statements (last match won).
            if isinstance(execution_engine, SqlAlchemyExecutionEngine):
                allow_relative_error = attempt_allowing_relative_error(
                    execution_engine.engine.dialect
                )
            elif isinstance(execution_engine, SparkDFExecutionEngine):
                allow_relative_error = 0.0
            elif isinstance(execution_engine, PandasExecutionEngine):
                allow_relative_error = "lower"

        probe_quantiles = [0.05, 0.25, 0.5, 0.75, 0.95]
        quantile_result = profile_dataset.expect_column_quantile_values_to_be_between(
            column,
            quantile_ranges={
                "quantiles": probe_quantiles,
                "value_ranges": [[None, None] for _ in probe_quantiles],
            },
            allow_relative_error=allow_relative_error,
            result_format="SUMMARY",
        )

        exception_info = quantile_result.exception_info
        if exception_info and (
            exception_info["exception_traceback"]
            or exception_info["exception_message"]
        ):
            profile_dataset._expectation_suite.remove_expectation(
                ExpectationConfiguration(
                    expectation_type=quantile_expectation_type,
                    kwargs={"column": column},
                ),
                match_type="domain",
            )
            logger.debug(exception_info["exception_traceback"])
            logger.debug(exception_info["exception_message"])
        else:
            observed_quantiles = quantile_result.result["observed_value"]
            profile_dataset.expect_column_quantile_values_to_be_between(
                column,
                quantile_ranges={
                    "quantiles": observed_quantiles["quantiles"],
                    "value_ranges": [
                        [value, value] for value in observed_quantiles["values"]
                    ],
                },
                allow_relative_error=allow_relative_error,
            )

    return profile_dataset