def _get_column_quantiles_mssql(column, quantiles: Iterable, selectable, sqlalchemy_engine) -> list: # mssql requires over(), so we add an empty over() clause selects: List[WithinGroup] = [ sa.func.percentile_disc(quantile).within_group(column.asc()).over() for quantile in quantiles ] quantiles_query: Select = sa.select(selects).select_from(selectable) try: quantiles_results: Row = sqlalchemy_engine.execute( quantiles_query).fetchone() return list(quantiles_results) except ProgrammingError as pe: exception_message: str = "An SQL syntax Exception occurred." exception_traceback: str = traceback.format_exc() exception_message += ( f'{type(pe).__name__}: "{str(pe)}". Traceback: "{exception_traceback}".' ) logger.error(exception_message) raise pe
def _sqlalchemy( cls, execution_engine: SqlAlchemyExecutionEngine, metric_domain_kwargs: Dict, metric_value_kwargs: Dict, metrics: Dict[str, Any], runtime_configuration: Dict, ): ( selectable, compute_domain_kwargs, accessor_domain_kwargs, ) = execution_engine.get_compute_domain(metric_domain_kwargs, MetricDomainTypes.COLUMN) column_name = accessor_domain_kwargs["column"] column = sa.column(column_name) sqlalchemy_engine = execution_engine.engine """SqlAlchemy Median Implementation""" nonnull_count = metrics.get("column_values.nonnull.count") if not nonnull_count: return None element_values = sqlalchemy_engine.execute( sa.select([column]).order_by(column).where(column != None).offset( max(nonnull_count // 2 - 1, 0)).limit(2).select_from(selectable)) column_values = list(element_values.fetchall()) if len(column_values) == 0: column_median = None elif nonnull_count % 2 == 0: # An even number of column values: take the average of the two center values column_median = ( float(column_values[0][0] + column_values[1][ 0] # left center value # right center value ) / 2.0) # Average center values else: # An odd number of column values, we can just take the center value column_median = column_values[1][0] # True center value return column_median
def _get_column_quantiles_sqlite( column, quantiles: Iterable, selectable, sqlalchemy_engine, table_row_count ) -> list: """ The present implementation is somewhat inefficient, because it requires as many calls to "sqlalchemy_engine.execute()" as the number of partitions in the "quantiles" parameter (albeit, typically, only a few). However, this is the only mechanism available for SQLite at the present time (11/17/2021), because the analytical processing is not a very strongly represented capability of the SQLite database management system. """ offsets: List[int] = [quantile * table_row_count - 1 for quantile in quantiles] quantile_queries: List[Select] = [ sa.select([column]) .order_by(column.asc()) .offset(offset) .limit(1) .select_from(selectable) for offset in offsets ] quantile_result: Row quantile_query: Select try: quantiles_results: List[Row] = [ sqlalchemy_engine.execute(quantile_query).fetchone() for quantile_query in quantile_queries ] return list( itertools.chain.from_iterable( [list(quantile_result) for quantile_result in quantiles_results] ) ) except ProgrammingError as pe: exception_message: str = "An SQL syntax Exception occurred." exception_traceback: str = traceback.format_exc() exception_message += ( f'{type(pe).__name__}: "{str(pe)}". Traceback: "{exception_traceback}".' ) logger.error(exception_message) raise pe
def _sqlalchemy( cls, execution_engine: SqlAlchemyExecutionEngine, metric_domain_kwargs, metric_value_kwargs, metrics, runtime_configuration, ): ( selectable, compute_domain_kwargs, accessor_domain_kwargs, ) = execution_engine.get_compute_domain(metric_domain_kwargs, MetricDomainTypes.COLUMN) column_name = accessor_domain_kwargs["column"] column = sa.column(column_name) sqlalchemy_engine = execution_engine.engine query = sa.select(sa.func.max(column)).select_from(selectable) result = sqlalchemy_engine.execute(query).fetchone() return result[0]
def _sqlalchemy( cls, execution_engine: SqlAlchemyExecutionEngine, metric_domain_kwargs: Dict, metric_value_kwargs: Dict, metrics: Dict[Tuple, Any], runtime_configuration: Dict, ): min_value = metric_value_kwargs.get("min_value") max_value = metric_value_kwargs.get("max_value") strict_min = metric_value_kwargs.get("strict_min") strict_max = metric_value_kwargs.get("strict_max") if min_value is not None and max_value is not None and min_value > max_value: raise ValueError("min_value cannot be greater than max_value") if min_value is None and max_value is None: raise ValueError("min_value and max_value cannot both be None") dialect_name = execution_engine.engine.dialect.name.lower() if (min_value == get_sql_dialect_floating_point_infinity_value( schema="api_np", negative=True)) or ( min_value == get_sql_dialect_floating_point_infinity_value( schema="api_cast", negative=True)): min_value = get_sql_dialect_floating_point_infinity_value( schema=dialect_name, negative=True) if (min_value == get_sql_dialect_floating_point_infinity_value( schema="api_np", negative=False)) or ( min_value == get_sql_dialect_floating_point_infinity_value( schema="api_cast", negative=False)): min_value = get_sql_dialect_floating_point_infinity_value( schema=dialect_name, negative=False) if (max_value == get_sql_dialect_floating_point_infinity_value( schema="api_np", negative=True)) or ( max_value == get_sql_dialect_floating_point_infinity_value( schema="api_cast", negative=True)): max_value = get_sql_dialect_floating_point_infinity_value( schema=dialect_name, negative=True) if (max_value == get_sql_dialect_floating_point_infinity_value( schema="api_np", negative=False)) or ( max_value == get_sql_dialect_floating_point_infinity_value( schema="api_cast", negative=False)): max_value = get_sql_dialect_floating_point_infinity_value( schema=dialect_name, negative=False) ( selectable, compute_domain_kwargs, accessor_domain_kwargs, ) = execution_engine.get_compute_domain( domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN) column = sa.column(accessor_domain_kwargs["column"]) if min_value is None: if strict_max: condition = column < max_value else: condition = column <= max_value elif max_value is None: if strict_min: condition = column > min_value else: condition = column >= min_value else: if strict_min and strict_max: condition = sa.and_(column > min_value, column < max_value) elif strict_min: condition = sa.and_(column > min_value, column <= max_value) elif strict_max: condition = sa.and_(column >= min_value, column < max_value) else: condition = sa.and_(column >= min_value, column <= max_value) return execution_engine.engine.execute( sa.select([sa.func.count() ]).select_from(selectable).where(condition)).scalar()
# In normal usage you'd set your data asset name directly in the BatchRequest above. batch_request.data_asset_name: str = "main.yellow_tripdata_sample_2019_01" context.create_expectation_suite( expectation_suite_name="test_suite", overwrite_existing=True ) validator = context.get_validator( batch_request=batch_request, expectation_suite_name="test_suite" ) print(validator.head(n_rows=10)) batch_list = context.get_batch_list(batch_request=batch_request) assert len(batch_list) == 1 batch_data = batch_list[0].data num_rows = batch_data.execution_engine.engine.execute( sa.select([sa.func.count()]).select_from(batch_data.selectable) ).one()[0] assert num_rows == 10000 # Here is a BatchRequest naming an inferred data_asset partitioned by day. # This BatchRequest specifies multiple batches, which is useful for dataset exploration. batch_request = BatchRequest( datasource_name="taxi_datasource", data_connector_name="daily", data_asset_name="<YOUR_DATA_ASSET_NAME>", ) # Please note this override is only to provide good UX for docs and tests. # In normal usage you'd set your data asset name directly in the BatchRequest above. batch_request.data_asset_name: str = "main.yellow_tripdata_sample_2019_01"
def _sqlalchemy( cls, execution_engine: SqlAlchemyExecutionEngine, metric_domain_kwargs: Dict, metric_value_kwargs: Dict, metrics: Dict[str, Any], runtime_configuration: Dict, ): """return a list of counts corresponding to bins Args: column: the name of the column for which to get the histogram bins: tuple of bin edges for which to get histogram values; *must* be tuple to support caching """ selectable, _, accessor_domain_kwargs = execution_engine.get_compute_domain( domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN) column = accessor_domain_kwargs["column"] bins = metric_value_kwargs["bins"] case_conditions = [] idx = 0 if isinstance(bins, np.ndarray): bins = bins.tolist() else: bins = list(bins) # If we have an infinite lower bound, don't express that in sql if (bins[0] == get_sql_dialect_floating_point_infinity_value( schema="api_np", negative=True)) or ( bins[0] == get_sql_dialect_floating_point_infinity_value( schema="api_cast", negative=True)): case_conditions.append( sa.func.sum( sa.case([(sa.column(column) < bins[idx + 1], 1)], else_=0)).label("bin_" + str(idx))) idx += 1 for idx in range(idx, len(bins) - 2): case_conditions.append( sa.func.sum( sa.case( [( sa.and_( bins[idx] <= sa.column(column), sa.column(column) < bins[idx + 1], ), 1, )], else_=0, )).label("bin_" + str(idx))) if (bins[-1] == get_sql_dialect_floating_point_infinity_value( schema="api_np", negative=False)) or ( bins[-1] == get_sql_dialect_floating_point_infinity_value( schema="api_cast", negative=False)): case_conditions.append( sa.func.sum( sa.case([(bins[-2] <= sa.column(column), 1)], else_=0)).label("bin_" + str(len(bins) - 1))) else: case_conditions.append( sa.func.sum( sa.case( [( sa.and_( bins[-2] <= sa.column(column), sa.column(column) <= bins[-1], ), 1, )], else_=0, )).label("bin_" + str(len(bins) - 1))) query = (sa.select(case_conditions).where( sa.column(column) != None, ).select_from(selectable)) # Run the data through convert_to_json_serializable to ensure we do not have Decimal types hist = convert_to_json_serializable( list(execution_engine.engine.execute(query).fetchone())) return hist
# Please note this override is only to provide good UX for docs and tests. # In normal usage you'd set your data asset name directly in the BatchRequest above. batch_request.data_asset_name = "yellow_tripdata_sample_2019_01" context.create_expectation_suite(expectation_suite_name="test_suite", overwrite_existing=True) validator = context.get_validator(batch_request=batch_request, expectation_suite_name="test_suite") print(validator.head(n_rows=10)) batch_list = context.get_batch_list(batch_request=batch_request) assert len(batch_list) == 1 batch_data = batch_list[0].data num_rows = batch_data.execution_engine.engine.execute( sa.select([sa.func.count()]).select_from(batch_data.selectable)).one()[0] assert num_rows == 10000 # Here is a BatchRequest naming an inferred data_asset partitioned by day. # This BatchRequest specifies multiple batches, which is useful for dataset exploration. batch_request = BatchRequest( datasource_name="taxi_datasource", data_connector_name="daily", data_asset_name="<YOUR_DATA_ASSET_NAME>", ) # Please note this override is only to provide good UX for docs and tests. # In normal usage you'd set your data asset name directly in the BatchRequest above. batch_request.data_asset_name = "yellow_tripdata_sample_2019_01" batch_list = context.get_batch_list(batch_request=batch_request)
def _sqlalchemy( cls, execution_engine: SqlAlchemyExecutionEngine, metric_domain_kwargs: Dict, metric_value_kwargs: Dict, metrics: Dict[str, Any], runtime_configuration: Dict, ): selectable, _, _ = execution_engine.get_compute_domain( metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE) df = None table_name = getattr(selectable, "name", None) if table_name is None: # if a custom query was passed try: if metric_value_kwargs["fetch_all"]: df = pd.read_sql_query( sql=selectable, con=execution_engine.engine, ) else: df = next( pd.read_sql_query( sql=selectable, con=execution_engine.engine, chunksize=metric_value_kwargs["n_rows"], )) except (ValueError, NotImplementedError): # it looks like MetaData that is used by pd.read_sql_query # cannot work on a temp table. # If it fails, we are trying to get the data using read_sql df = None except StopIteration: validator = Validator(execution_engine=execution_engine) columns = validator.get_metric( MetricConfiguration("table.columns", metric_domain_kwargs)) df = pd.DataFrame(columns=columns) else: try: if metric_value_kwargs["fetch_all"]: df = pd.read_sql_table( table_name=getattr(selectable, "name", None), schema=getattr(selectable, "schema", None), con=execution_engine.engine, ) else: df = next( pd.read_sql_table( table_name=getattr(selectable, "name", None), schema=getattr(selectable, "schema", None), con=execution_engine.engine, chunksize=metric_value_kwargs["n_rows"], )) except (ValueError, NotImplementedError): # it looks like MetaData that is used by pd.read_sql_table # cannot work on a temp table. # If it fails, we are trying to get the data using read_sql df = None except StopIteration: validator = Validator(execution_engine=execution_engine) columns = validator.get_metric( MetricConfiguration("table.columns", metric_domain_kwargs)) df = pd.DataFrame(columns=columns) if df is None: # we want to compile our selectable stmt = sa.select(["*"]).select_from(selectable) if metric_value_kwargs["fetch_all"]: sql = stmt.compile( dialect=execution_engine.engine.dialect, compile_kwargs={"literal_binds": True}, ) elif execution_engine.engine.dialect.name.lower() == "mssql": # limit doesn't compile properly for mssql sql = str( stmt.compile( dialect=execution_engine.engine.dialect, compile_kwargs={"literal_binds": True}, )) sql = f"SELECT TOP {metric_value_kwargs['n_rows']}{sql[6:]}" else: stmt = stmt.limit(metric_value_kwargs["n_rows"]) sql = stmt.compile( dialect=execution_engine.engine.dialect, compile_kwargs={"literal_binds": True}, ) df = pd.read_sql(sql, con=execution_engine.engine) return df
def _sqlalchemy_function(self, column_list, **kwargs): """ Computes the "map" between the specified "column_list" (treated as a group so as to model the "compound" aspect) and the number of occurrences of every permutation of the values of "column_list" as the grouped subset of all rows of the table. In the present context, the term "compound" refers to having to treat the specified columns as unique together (e.g., as a multi-column primary key). For example, suppose that in the example below, all three columns ("A", "B", and "C") of the table are included as part of the "compound" columns list (i.e., column_list = ["A", "B", "C"]): A B C _num_rows 1 1 2 2 1 2 3 1 1 1 2 2 2 2 2 1 3 2 3 1 The fourth column, "_num_rows", holds the value of the "map" function -- the number of rows the group occurs in. """ # Needed as keys (hence, string valued) to access "ColumnElement" objects contained within the "FROM" clauses. column_names = kwargs.get("_column_names") # Need all columns of the table for the purposes of reporting entire rows satisfying unexpected condition logic. table_columns = kwargs.get("_table_columns") table = kwargs.get( "_table" ) # Note that here, "table" is of the "sqlalchemy.sql.selectable.Subquery" type. # First, obtain the SQLAlchemy "FromClause" version of the original "table" for the purposes of gaining the # "FromClause.c" attribute, which is a namespace of all the columns contained within the "FROM" clause (these # elements are themselves subclasses of the SQLAlchemy "ColumnElement" class). table_columns_selector = [ sa.column(column_name) for column_name in table_columns ] original_table_clause = (sa.select(table_columns_selector).select_from( table).alias("original_table_clause")) # Second, "SELECT FROM" the original table, represented by the "FromClause" object, querying all columns of the # table and the count of occurrences of distinct "compound" (i.e., group, as specified by "column_list") values. # Give this aggregated group count a distinctive label. # Give the resulting sub-query a unique alias in order to disambiguate column names in subsequent queries. count_selector = column_list + [sa.func.count().label("_num_rows")] group_count_query = (sa.select(count_selector).group_by( *column_list).select_from(original_table_clause).alias( "group_counts_subquery")) # The above "group_count_query", if executed, will produce the result set containing the number of rows that # equals the number of distinct values of the group -- unique grouping (e.g., as in a multi-column primary key). # Hence, in order for the "_num_rows" column values to provide an entry for each row of the original table, the # "SELECT FROM" of "group_count_query" must undergo an "INNER JOIN" operation with the "original_table_clause" # object, whereby all table columns in the two "FromClause" objects must match, respectively, as the conditions. conditions = sa.and_( *(group_count_query.c[name] == original_table_clause.c[name] for name in column_names)) # noinspection PyProtectedMember compound_columns_count_query = (sa.select([ original_table_clause, group_count_query.c._num_rows.label("_num_rows"), ]).select_from( original_table_clause.join( right=group_count_query, onclause=conditions, isouter=False)).alias( "records_with_grouped_column_counts_subquery")) # The returned SQLAlchemy "FromClause" "compound_columns_count_query" object realizes the "map" metric function. return compound_columns_count_query