def _spark(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,  # NOTE(review): annotation looks copied from the SQL variant; this receives the Spark engine — confirm
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """Spark median: average the two values straddling the 50th percentile."""
    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN
    )
    column_name = accessor_domain_kwargs["column"]
    # Pick an epsilon with 0 < eps < 1 / (2 * n_values) so that approxQuantile
    # at relativeError=0 returns exactly the two middle values.  The "+ 2" in
    # the denominator keeps the second quantile legal in the degenerate case
    # where the row count is 0.
    # NOTE: exact quantiles can be expensive; Spark's estimation mode is not
    # exposed here.
    row_count = metrics.get("table.row_count")
    middle_two = df.approxQuantile(
        column_name,
        [0.5, 0.5 + (1 / (2 + (2 * row_count)))],
        0,
    )
    return np.mean(middle_two)
def _spark(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """Compute approximate quantiles of a column on a Spark DataFrame.

    ``allow_relative_error`` may be False (meaning exact, i.e. relative error
    of 0.0) or a float in [0, 1]; anything else raises ValueError.
    """
    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )
    relative_error = metric_value_kwargs.get("allow_relative_error", False)
    requested_quantiles = metric_value_kwargs["quantiles"]
    column_name = accessor_domain_kwargs["column"]
    # False is shorthand for "exact" (zero relative error).
    if relative_error is False:
        relative_error = 0.0
    # Anything else must already be a float within [0, 1].
    is_valid = isinstance(relative_error, float) and 0 <= relative_error <= 1
    if not is_valid:
        raise ValueError(
            "SparkDFDataset requires relative error to be False or to be a float between 0 and 1."
        )
    return df.approxQuantile(column_name, list(requested_quantiles), relative_error)
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Fetch the head of the table as a pandas DataFrame.

    Honors ``metric_value_kwargs["fetch_all"]`` (whole table) vs
    ``metric_value_kwargs["n_rows"]`` (first chunk only).  Tries
    pd.read_sql_table first; falls back to compiling the selectable into
    literal SQL when reflection fails (e.g. temp tables).
    """
    selectable, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    df = None
    table_name = getattr(selectable, "name", None)
    if table_name is not None:
        try:
            if metric_value_kwargs["fetch_all"]:
                df = pd.read_sql_table(
                    table_name=table_name,
                    schema=getattr(selectable, "schema", None),
                    con=execution_engine.engine,
                )
            else:
                # Chunked read: take only the first n_rows-sized chunk.
                df = next(
                    pd.read_sql_table(
                        table_name=table_name,
                        schema=getattr(selectable, "schema", None),
                        con=execution_engine.engine,
                        chunksize=metric_value_kwargs["n_rows"],
                    )
                )
        except (ValueError, NotImplementedError):
            # pd.read_sql_table relies on SQLAlchemy MetaData reflection,
            # which cannot see a temp table.  Fall through to the compiled
            # SELECT path below.
            df = None
        except StopIteration:
            # Empty table: the chunked iterator yields nothing, so build an
            # empty frame with the correct column names.
            validator = Validator(execution_engine=execution_engine)
            columns = validator.get_metric(
                MetricConfiguration("table.columns", metric_domain_kwargs)
            )
            df = pd.DataFrame(columns=columns)
    if df is None:
        # Fallback: compile the selectable to literal SQL and read via read_sql.
        stmt = sa.select(["*"]).select_from(selectable)
        if not metric_value_kwargs["fetch_all"]:
            stmt = stmt.limit(metric_value_kwargs["n_rows"])
        sql = stmt.compile(
            dialect=execution_engine.engine.dialect,
            compile_kwargs={"literal_binds": True},
        )
        df = pd.read_sql(sql, con=execution_engine.engine)
    return df
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Return per-value counts for a column as a pandas Series.

    The Series values are the counts; its index ("value") holds the distinct
    column values.  ``sort`` controls ordering: "value", "count", or "none".
    """
    sort = metric_value_kwargs.get("sort", cls.default_kwarg_values["sort"])
    collate = metric_value_kwargs.get("collate", cls.default_kwarg_values["collate"])
    # Validate once (the original duplicated this check after computing the
    # compute domain).
    if sort not in ["value", "count", "none"]:
        raise ValueError("sort must be either 'value', 'count', or 'none'")
    if collate is not None:
        # NOTE(review): message mentions PandasDataset although this is the
        # SQLAlchemy implementation — likely copied text; left unchanged in
        # case callers match on it.
        raise ValueError(
            "collate parameter is not supported in PandasDataset")
    selectable, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN)
    column = accessor_domain_kwargs["column"]
    # Count each distinct non-null value.
    query = (
        sa.select(
            [
                sa.column(column).label("value"),
                sa.func.count(sa.column(column)).label("count"),
            ]
        )
        .where(sa.column(column) != None)
        .group_by(sa.column(column))
    )
    if sort == "value":
        # NOTE: depending on the way the underlying database collates columns,
        # ordering can vary. postgresql collate "C" matches default sort
        # for python and most other systems, but is not universally supported,
        # so we use the default sort for the system.  (The guard above
        # guarantees collate is None here, so the former collate-ordering
        # branch was unreachable and has been removed.)
        query = query.order_by(sa.column(column))
    elif sort == "count":
        query = query.order_by(sa.column("count").desc())
    results = execution_engine.engine.execute(
        query.select_from(selectable)).fetchall()
    series = pd.Series(
        [row[1] for row in results],
        index=pd.Index(data=[row[0] for row in results], name="value"),
        name="count",
    )
    return series
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """SqlAlchemy median: fetch the one or two center values of the sorted column.

    Depends on the "column_values.nonnull.count" metric to know where the
    middle of the sorted, non-null values lies; returns None when there are
    no non-null values.
    """
    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN
    )
    column_name = accessor_domain_kwargs["column"]
    column = sa.column(column_name)
    sqlalchemy_engine = execution_engine.engine
    nonnull_count = metrics.get("column_values.nonnull.count")
    if not nonnull_count:
        # No non-null values (or the dependency metric is missing/zero).
        return None
    # Sort the non-null values and pull (up to) the two rows straddling the
    # middle: OFFSET max(n // 2 - 1, 0) LIMIT 2.
    element_values = sqlalchemy_engine.execute(
        sa.select([column])
        .order_by(column)
        .where(column != None)
        .offset(max(nonnull_count // 2 - 1, 0))
        .limit(2)
        .select_from(selectable)
    )
    column_values = list(element_values.fetchall())
    if len(column_values) == 0:
        column_median = None
    elif nonnull_count % 2 == 0:
        # Even count: average the two center values.
        column_median = (
            float(
                column_values[0][0]  # left center value
                + column_values[1][0]  # right center value
            )
            / 2.0
        )
    else:
        # Odd count: the true center is the last fetched row.  Using [-1]
        # (instead of the previous hard-coded [1]) also covers
        # nonnull_count == 1, where OFFSET 0 LIMIT 2 fetches only one row
        # and [1] raised IndexError.
        column_median = column_values[-1][0]
    return column_median
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: dict,
    metric_value_kwargs: dict,
    metrics: Dict[str, Any],
    runtime_configuration: dict,
) -> List[sqlalchemy_engine_Row]:
    """Run a user-supplied parameterized query against the active batch.

    The query template is formatted with ``col`` and ``active_batch``, where
    the active batch's SQL rendering depends on what kind of selectable the
    compute domain produced.
    """
    query: Optional[str] = metric_value_kwargs.get(
        "query"
    ) or cls.default_kwarg_values.get("query")
    selectable: Union[sa.sql.Selectable, str]
    selectable, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    column: str = metric_value_kwargs.get("column")
    if isinstance(selectable, sa.Table):
        # Plain table: substitute its name directly.
        batch_sql = selectable
    elif isinstance(selectable, sa.sql.Subquery):
        # Specifying a runtime query in a RuntimeBatchRequest returns the
        # active batch as a Subquery; parenthesizing it ensures the flow of
        # operations doesn't break.
        batch_sql = f"({selectable})"
    elif isinstance(selectable, sa.sql.Select):
        # Specifying a row_condition returns the active batch as a Select
        # object, requiring compilation & aliasing when formatting the
        # parameterized query.
        compiled = selectable.compile(compile_kwargs={"literal_binds": True})
        batch_sql = f"({compiled}) AS subselect"
    else:
        batch_sql = f"({selectable})"
    query = query.format(col=column, active_batch=batch_sql)
    engine: sqlalchemy_engine_Engine = execution_engine.engine
    result: List[sqlalchemy_engine_Row] = engine.execute(sa.text(query)).fetchall()
    return result
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs,
    metric_value_kwargs,
    metrics,
    runtime_configuration,
):
    """Return the maximum value of the column via SQL MAX()."""
    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN
    )
    target_column = sa.column(accessor_domain_kwargs["column"])
    max_query = sa.select(sa.func.max(target_column)).select_from(selectable)
    first_row = execution_engine.engine.execute(max_query).fetchone()
    return first_row[0]
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """Count non-null column values lying between min_value and max_value.

    ``strict_min`` / ``strict_max`` select exclusive vs inclusive bounds;
    either bound may be None (open on that side), but not both.  API-level
    +/-infinity sentinels are translated to the dialect's representation.

    Raises:
        ValueError: if min_value > max_value, or both bounds are None.
    """
    min_value = metric_value_kwargs.get("min_value")
    max_value = metric_value_kwargs.get("max_value")
    strict_min = metric_value_kwargs.get("strict_min")
    strict_max = metric_value_kwargs.get("strict_max")
    if min_value is not None and max_value is not None and min_value > max_value:
        raise ValueError("min_value cannot be greater than max_value")
    if min_value is None and max_value is None:
        raise ValueError("min_value and max_value cannot both be None")

    dialect_name = execution_engine.engine.dialect.name.lower()

    def _translate_infinity(value):
        # Map API-level +/-infinity sentinels ("api_np" / "api_cast") onto the
        # dialect-specific representation.  Checked negative-first, then
        # positive against the possibly-updated value, matching the original
        # sequential rewriting.
        for negative in (True, False):
            if (
                value
                == get_sql_dialect_floating_point_infinity_value(
                    schema="api_np", negative=negative
                )
            ) or (
                value
                == get_sql_dialect_floating_point_infinity_value(
                    schema="api_cast", negative=negative
                )
            ):
                value = get_sql_dialect_floating_point_infinity_value(
                    schema=dialect_name, negative=negative
                )
        return value

    min_value = _translate_infinity(min_value)
    max_value = _translate_infinity(max_value)

    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )
    column = sa.column(accessor_domain_kwargs["column"])

    # Compose the WHERE condition from the independent lower/upper bounds
    # (replaces the original four-way strict_min x strict_max ladder).
    if min_value is None:
        condition = column < max_value if strict_max else column <= max_value
    elif max_value is None:
        condition = column > min_value if strict_min else column >= min_value
    else:
        lower = column > min_value if strict_min else column >= min_value
        upper = column < max_value if strict_max else column <= max_value
        condition = sa.and_(lower, upper)

    return execution_engine.engine.execute(
        sa.select([sa.func.count()]).select_from(selectable).where(condition)
    ).scalar()
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """return a list of counts corresponding to bins

    Args:
        column: the name of the column for which to get the histogram
        bins: tuple of bin edges for which to get histogram values; *must* be tuple to support caching
    """
    selectable, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN)
    column = accessor_domain_kwargs["column"]
    bins = metric_value_kwargs["bins"]
    # One SUM(CASE ...) expression per bin; all bins are counted in a single query.
    case_conditions = []
    idx = 0
    # Normalize bin edges to a plain list (they may arrive as an ndarray or tuple).
    if isinstance(bins, np.ndarray):
        bins = bins.tolist()
    else:
        bins = list(bins)

    # If we have an infinite lower bound, don't express that in sql:
    # the first bin becomes simply "value < bins[1]".
    if (bins[0] == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=True)) or (
                bins[0] == get_sql_dialect_floating_point_infinity_value(
                    schema="api_cast", negative=True)):
        case_conditions.append(
            sa.func.sum(
                sa.case([(sa.column(column) < bins[idx + 1], 1)],
                        else_=0)).label("bin_" + str(idx)))
        # Skip the infinite edge when building the interior bins below.
        idx += 1

    # Interior bins are half-open intervals [bins[idx], bins[idx + 1]).
    for idx in range(idx, len(bins) - 2):
        case_conditions.append(
            sa.func.sum(
                sa.case(
                    [(
                        sa.and_(
                            bins[idx] <= sa.column(column),
                            sa.column(column) < bins[idx + 1],
                        ),
                        1,
                    )],
                    else_=0,
                )).label("bin_" + str(idx)))

    # Final bin: unbounded above when the upper edge is infinite; otherwise a
    # closed interval [bins[-2], bins[-1]] so the top edge itself is included.
    if (bins[-1] == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=False)) or (
                bins[-1] == get_sql_dialect_floating_point_infinity_value(
                    schema="api_cast", negative=False)):
        case_conditions.append(
            sa.func.sum(
                sa.case([(bins[-2] <= sa.column(column), 1)],
                        else_=0)).label("bin_" + str(len(bins) - 1)))
    else:
        case_conditions.append(
            sa.func.sum(
                sa.case(
                    [(
                        sa.and_(
                            bins[-2] <= sa.column(column),
                            sa.column(column) <= bins[-1],
                        ),
                        1,
                    )],
                    else_=0,
                )).label("bin_" + str(len(bins) - 1)))

    # NULLs are excluded from every bin.
    query = (sa.select(case_conditions).where(
        sa.column(column) != None,
    ).select_from(selectable))

    # Run the data through convert_to_json_serializable to ensure we do not
    # have Decimal types.
    hist = convert_to_json_serializable(
        list(execution_engine.engine.execute(query).fetchone()))
    return hist
def _sqlalchemy(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """Dispatch column-quantile computation to a dialect-specific helper.

    mssql / bigquery / mysql / sqlite each have a dedicated implementation;
    snowflake and all other dialects use the generic SQLAlchemy path.
    """
    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN
    )
    column_name = accessor_domain_kwargs["column"]
    column = sa.column(column_name)
    sqlalchemy_engine = execution_engine.engine
    dialect = sqlalchemy_engine.dialect
    quantiles = metric_value_kwargs["quantiles"]
    allow_relative_error = metric_value_kwargs.get("allow_relative_error", False)
    table_row_count = metrics.get("table.row_count")
    # Compute once instead of re-evaluating dialect.name.lower() per branch.
    dialect_name = dialect.name.lower()
    if dialect_name == "mssql":
        return _get_column_quantiles_mssql(
            column=column,
            quantiles=quantiles,
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
        )
    elif dialect_name == "bigquery":
        return _get_column_quantiles_bigquery(
            column=column,
            quantiles=quantiles,
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
        )
    elif dialect_name == "mysql":
        return _get_column_quantiles_mysql(
            column=column,
            quantiles=quantiles,
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
        )
    elif dialect_name == "snowflake":
        # NOTE: 20201216 - JPC - snowflake has a representation/precision
        # limitation in its percentile_disc implementation that causes an
        # error when we do not round. It is unclear to me *how* the call to
        # round affects the behavior -- the binary representation should be
        # identical before and after, and I do not observe a type difference.
        # However, the issue is replicable in the snowflake console and
        # directly observable in side-by-side comparisons with and without
        # the call to round().
        quantiles = [round(x, 10) for x in quantiles]
        return _get_column_quantiles_generic_sqlalchemy(
            column=column,
            quantiles=quantiles,
            allow_relative_error=allow_relative_error,
            dialect=dialect,
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
        )
    elif dialect_name == "sqlite":
        return _get_column_quantiles_sqlite(
            column=column,
            quantiles=quantiles,
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
            table_row_count=table_row_count,
        )
    else:
        return _get_column_quantiles_generic_sqlalchemy(
            column=column,
            quantiles=quantiles,
            allow_relative_error=allow_relative_error,
            dialect=dialect,
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
        )