Example No. 1
    def _spark(
        cls,
        execution_engine: SparkDFExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        """Spark Median Implementation"""
        (
            df,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
        )
        column = accessor_domain_kwargs["column"]
        # We will get the two middle values by choosing an epsilon to add
        # to the 50th percentile such that we always get exactly the middle
        # two values (i.e. 0 < epsilon < 1 / (2 * n_values)).

        # Note that this can be an expensive computation; we are not exposing
        # Spark's ability to estimate.
        # We add two to 2 * n_values to maintain a legitimate quantile
        # in the degenerate case when n_values = 0.
        table_row_count = metrics.get("table.row_count")
        result = df.approxQuantile(
            column, [0.5, 0.5 + (1 / (2 + (2 * table_row_count)))], 0
        )
        return np.mean(result)
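
The same epsilon trick can be sketched in plain PySpark outside Great Expectations. The SparkSession, data, and column name "x" below are illustrative:

# Minimal sketch of the exact-median trick, assuming a local Spark session.
from pyspark.sql import SparkSession
import numpy as np

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(v,) for v in [1.0, 2.0, 3.0, 4.0]], ["x"])

n_values = df.count()
# Any 0 < epsilon < 1 / (2 * n_values) makes the second probe land on the
# right-hand middle value; relativeError=0 requests exact quantiles.
epsilon = 1 / (2 + (2 * n_values))
lo, hi = df.approxQuantile("x", [0.5, 0.5 + epsilon], 0)
print(np.mean([lo, hi]))  # expect 2.5 for this even-length column
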
Example No. 2
    def _spark(
        cls,
        execution_engine: SparkDFExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        (
            df,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
        )
        allow_relative_error = metric_value_kwargs.get("allow_relative_error", False)
        quantiles = metric_value_kwargs["quantiles"]
        column = accessor_domain_kwargs["column"]
        if allow_relative_error is False:
            allow_relative_error = 0.0
        if (
            not isinstance(allow_relative_error, float)
            or allow_relative_error < 0
            or allow_relative_error > 1
        ):
            raise ValueError(
                "SparkDFDataset requires relative error to be False or to be a float between 0 and 1."
            )
        return df.approxQuantile(column, list(quantiles), allow_relative_error)
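
The allow_relative_error kwarg maps directly onto approxQuantile's relativeError argument: 0.0 is exact but expensive, while larger values trade accuracy for speed. A sketch, reusing the illustrative df and column "x" from above:

quantiles = [0.25, 0.5, 0.75]
exact = df.approxQuantile("x", quantiles, 0.0)    # allow_relative_error=False
approx = df.approxQuantile("x", quantiles, 0.01)  # allow_relative_error=0.01
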
Example No. 3
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        selectable, _, _ = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
        )
        df = None
        table_name = getattr(selectable, "name", None)
        if table_name is not None:
            try:
                if metric_value_kwargs["fetch_all"]:
                    df = pd.read_sql_table(
                        table_name=getattr(selectable, "name", None),
                        schema=getattr(selectable, "schema", None),
                        con=execution_engine.engine,
                    )
                else:
                    df = next(
                        pd.read_sql_table(
                            table_name=getattr(selectable, "name", None),
                            schema=getattr(selectable, "schema", None),
                            con=execution_engine.engine,
                            chunksize=metric_value_kwargs["n_rows"],
                        )
                    )
            except (ValueError, NotImplementedError):
                # It looks like the MetaData reflection used by pd.read_sql_table
                # cannot work on a temp table. If it fails, fall back to
                # fetching the data with read_sql below.
                df = None
            except StopIteration:
                validator = Validator(execution_engine=execution_engine)
                columns = validator.get_metric(
                    MetricConfiguration("table.columns", metric_domain_kwargs)
                )
                df = pd.DataFrame(columns=columns)
        if df is None:
            # we want to compile our selectable
            stmt = sa.select(["*"]).select_from(selectable)
            if metric_value_kwargs["fetch_all"]:
                pass
            else:
                stmt = stmt.limit(metric_value_kwargs["n_rows"])
            sql = stmt.compile(
                dialect=execution_engine.engine.dialect,
                compile_kwargs={"literal_binds": True},
            )
            df = pd.read_sql(sql, con=execution_engine.engine)

        return df
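
The chunked fallback is the key pattern here: pd.read_sql_table with a chunksize returns an iterator of DataFrames, so a single next(...) fetches only the first n_rows instead of the whole table. A standalone sketch, assuming an existing SQLite table named "my_table":

import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine("sqlite:///example.db")
# Fetch only the first 5 rows without materializing the full table.
head = next(pd.read_sql_table("my_table", con=engine, chunksize=5))
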
Example No. 4
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        sort = metric_value_kwargs.get("sort", cls.default_kwarg_values["sort"])
        collate = metric_value_kwargs.get(
            "collate", cls.default_kwarg_values["collate"]
        )

        if sort not in ["value", "count", "none"]:
            raise ValueError("sort must be either 'value', 'count', or 'none'")
        if collate is not None:
            raise ValueError(
                "collate parameter is not supported in the SQLAlchemy implementation"
            )

        selectable, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
            metric_domain_kwargs, MetricDomainTypes.COLUMN
        )
        column = accessor_domain_kwargs["column"]

        query = (
            sa.select(
                [
                    sa.column(column).label("value"),
                    sa.func.count(sa.column(column)).label("count"),
                ]
            )
            .where(sa.column(column) != None)
            .group_by(sa.column(column))
        )
        if sort == "value":
            # NOTE: depending on the way the underlying database collates columns,
            # ordering can vary. postgresql collate "C" matches default sort
            # for python and most other systems, but is not universally supported,
            # so we use the default sort for the system, unless specifically overridden
            if collate is not None:
                query = query.order_by(sa.column(column).collate(collate))
            else:
                query = query.order_by(sa.column(column))
        elif sort == "count":
            query = query.order_by(sa.column("count").desc())
        results = execution_engine.engine.execute(
            query.select_from(selectable)
        ).fetchall()
        series = pd.Series(
            [row[1] for row in results],
            index=pd.Index(data=[row[0] for row in results], name="value"),
            name="count",
        )
        return series
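
The generated statement is just a filtered GROUP BY with a count per distinct value. A standalone sketch of the same query, with illustrative table and column names:

import sqlalchemy as sa

column = "my_col"
query = (
    sa.select(
        [
            sa.column(column).label("value"),
            sa.func.count(sa.column(column)).label("count"),
        ]
    )
    .where(sa.column(column) != None)
    .group_by(sa.column(column))
    .order_by(sa.column(column))
    .select_from(sa.table("my_table"))
)
print(query)
# SELECT my_col AS value, count(my_col) AS count FROM my_table
# WHERE my_col IS NOT NULL GROUP BY my_col ORDER BY my_col
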
Example No. 5
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        """SqlAlchemy Median Implementation"""
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            metric_domain_kwargs, MetricDomainTypes.COLUMN
        )
        column_name = accessor_domain_kwargs["column"]
        column = sa.column(column_name)
        sqlalchemy_engine = execution_engine.engine
        nonnull_count = metrics.get("column_values.nonnull.count")
        if not nonnull_count:
            return None
        element_values = sqlalchemy_engine.execute(
            sa.select([column])
            .order_by(column)
            .where(column != None)
            .offset(max(nonnull_count // 2 - 1, 0))
            .limit(2)
            .select_from(selectable)
        )

        column_values = list(element_values.fetchall())

        if len(column_values) == 0:
            column_median = None
        elif nonnull_count % 2 == 0:
            # An even number of column values: take the average of the two center values
            column_median = (
                float(
                    column_values[0][0]  # left center value
                    + column_values[1][0]  # right center value
                )
                / 2.0
            )
        else:
            # An odd number of column values: take the center value (with a
            # single non-null row, that row itself is the median)
            column_median = (
                column_values[1][0] if len(column_values) > 1 else column_values[0][0]
            )
        return column_median
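
The OFFSET/LIMIT trick fetches at most the two center rows instead of the whole column. A small worked check of the offset arithmetic in pure Python, with illustrative values:

values = [10, 20, 30, 40, 50]   # sorted non-null column values
n = len(values)
offset = max(n // 2 - 1, 0)     # 1 -> fetch values[1:3] == [20, 30]
fetched = values[offset:offset + 2]
median = fetched[1] if n % 2 else sum(fetched) / 2.0
assert median == 30             # odd count: the upper of the two fetched rows
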
Example No. 6
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: dict,
        metric_value_kwargs: dict,
        metrics: Dict[str, Any],
        runtime_configuration: dict,
    ) -> List[sqlalchemy_engine_Row]:
        query: Optional[str] = metric_value_kwargs.get(
            "query"
        ) or cls.default_kwarg_values.get("query")

        selectable: Union[sa.sql.Selectable, str]
        selectable, _, _ = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
        )

        column: str = metric_value_kwargs.get("column")
        if isinstance(selectable, sa.Table):
            query = query.format(col=column, active_batch=selectable)
        elif isinstance(selectable, sa.sql.Subquery):
            # Specifying a runtime query in a RuntimeBatchRequest returns the
            # active batch as a Subquery; sectioning the active batch off with
            # parentheses ensures the flow of operations doesn't break.
            query = query.format(col=column, active_batch=f"({selectable})")
        elif isinstance(selectable, sa.sql.Select):
            # Specifying a row_condition returns the active batch as a Select
            # object, requiring compilation and aliasing when formatting the
            # parameterized query.
            query = query.format(
                col=column,
                active_batch=f'({selectable.compile(compile_kwargs={"literal_binds": True})}) AS subselect',
            )
        else:
            query = query.format(col=column, active_batch=f"({selectable})")

        engine: sqlalchemy_engine_Engine = execution_engine.engine
        result: List[sqlalchemy_engine_Row] = engine.execute(sa.text(query)).fetchall()

        return result
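
The query kwarg is a plain str.format template with {col} and {active_batch} placeholders. A sketch of the substitution, with an illustrative template:

query = "SELECT {col} FROM {active_batch} WHERE {col} IS NOT NULL"
formatted = query.format(
    col="my_col",
    active_batch="(SELECT * FROM my_table) AS subselect",
)
# SELECT my_col FROM (SELECT * FROM my_table) AS subselect WHERE my_col IS NOT NULL
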
Example No. 7
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs,
        metric_value_kwargs,
        metrics,
        runtime_configuration,
    ):
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(metric_domain_kwargs,
                                                MetricDomainTypes.COLUMN)

        column_name = accessor_domain_kwargs["column"]
        column = sa.column(column_name)
        sqlalchemy_engine = execution_engine.engine

        query = sa.select(sa.func.max(column)).select_from(selectable)
        result = sqlalchemy_engine.execute(query).fetchone()

        return result[0]
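
A standalone sketch of the same max aggregation, assuming SQLAlchemy 1.x (matching the engine.execute style used throughout these examples) and an in-memory SQLite table:

import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
engine.execute("CREATE TABLE t (x INTEGER)")
engine.execute("INSERT INTO t (x) VALUES (1), (5), (3)")

query = sa.select(sa.func.max(sa.column("x"))).select_from(sa.table("t"))
print(engine.execute(query).fetchone()[0])  # 5
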
Example No. 8
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        min_value = metric_value_kwargs.get("min_value")
        max_value = metric_value_kwargs.get("max_value")
        strict_min = metric_value_kwargs.get("strict_min")
        strict_max = metric_value_kwargs.get("strict_max")
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")
        dialect_name = execution_engine.engine.dialect.name.lower()

        if min_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=True
        ) or min_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_cast", negative=True
        ):
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True
            )

        if min_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=False
        ) or min_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_cast", negative=False
        ):
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False
            )

        if max_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=True
        ) or max_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_cast", negative=True
        ):
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True
            )

        if max_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=False
        ) or max_value == get_sql_dialect_floating_point_infinity_value(
            schema="api_cast", negative=False
        ):
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False
            )

        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        column = sa.column(accessor_domain_kwargs["column"])

        if min_value is None:
            if strict_max:
                condition = column < max_value
            else:
                condition = column <= max_value

        elif max_value is None:
            if strict_min:
                condition = column > min_value
            else:
                condition = column >= min_value

        else:
            if strict_min and strict_max:
                condition = sa.and_(column > min_value, column < max_value)
            elif strict_min:
                condition = sa.and_(column > min_value, column <= max_value)
            elif strict_max:
                condition = sa.and_(column >= min_value, column < max_value)
            else:
                condition = sa.and_(column >= min_value, column <= max_value)

        return execution_engine.engine.execute(
            sa.select([sa.func.count()]).select_from(selectable).where(condition)
        ).scalar()
Example No. 9
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        """return a list of counts corresponding to bins

        Args:
            column: the name of the column for which to get the histogram
            bins: tuple of bin edges for which to get histogram values; *must* be tuple to support caching
        """
        selectable, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        column = accessor_domain_kwargs["column"]
        bins = metric_value_kwargs["bins"]

        case_conditions = []
        idx = 0
        if isinstance(bins, np.ndarray):
            bins = bins.tolist()
        else:
            bins = list(bins)

        # If we have an infinite lower bound, don't express that in sql
        # If we have an infinite lower bound, don't express that in sql
        if (
            bins[0]
            == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=True
            )
        ) or (
            bins[0]
            == get_sql_dialect_floating_point_infinity_value(
                schema="api_cast", negative=True
            )
        ):
            case_conditions.append(
                sa.func.sum(
                    sa.case([(sa.column(column) < bins[idx + 1], 1)], else_=0)
                ).label("bin_" + str(idx))
            )
            idx += 1

        for idx in range(idx, len(bins) - 2):
            case_conditions.append(
                sa.func.sum(
                    sa.case(
                        [
                            (
                                sa.and_(
                                    bins[idx] <= sa.column(column),
                                    sa.column(column) < bins[idx + 1],
                                ),
                                1,
                            )
                        ],
                        else_=0,
                    )
                ).label("bin_" + str(idx))
            )

        # Similarly, if we have an infinite upper bound, don't express that in sql
        if (
            bins[-1]
            == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=False
            )
        ) or (
            bins[-1]
            == get_sql_dialect_floating_point_infinity_value(
                schema="api_cast", negative=False
            )
        ):
            case_conditions.append(
                sa.func.sum(
                    sa.case([(bins[-2] <= sa.column(column), 1)], else_=0)
                ).label("bin_" + str(len(bins) - 1))
            )
        else:
            case_conditions.append(
                sa.func.sum(
                    sa.case(
                        [
                            (
                                sa.and_(
                                    bins[-2] <= sa.column(column),
                                    sa.column(column) <= bins[-1],
                                ),
                                1,
                            )
                        ],
                        else_=0,
                    )
                ).label("bin_" + str(len(bins) - 1))
            )

        query = (
            sa.select(case_conditions)
            .where(sa.column(column) != None)
            .select_from(selectable)
        )

        # Run the data through convert_to_json_serializable to ensure we do not have Decimal types
        hist = convert_to_json_serializable(
            list(execution_engine.engine.execute(query).fetchone())
        )
        return hist
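
Each bin becomes one SUM(CASE ...) column, so the whole histogram is computed in a single pass over the table. A standalone sketch of one interior bin, using the legacy SQLAlchemy 1.x sa.case signature seen above and illustrative names:

import sqlalchemy as sa

column, lo, hi = sa.column("x"), 0, 5
bin_0 = sa.func.sum(
    sa.case([(sa.and_(lo <= column, column < hi), 1)], else_=0)
).label("bin_0")
print(sa.select([bin_0]).select_from(sa.table("t")))
# SELECT sum(CASE WHEN (x >= :x_1 AND x < :x_2) THEN :param_1 ELSE :param_2 END) AS bin_0 FROM t
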
Example No. 10
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
        )
        column_name = accessor_domain_kwargs["column"]
        column = sa.column(column_name)
        sqlalchemy_engine = execution_engine.engine
        dialect = sqlalchemy_engine.dialect
        quantiles = metric_value_kwargs["quantiles"]
        allow_relative_error = metric_value_kwargs.get("allow_relative_error", False)
        table_row_count = metrics.get("table.row_count")
        if dialect.name.lower() == "mssql":
            return _get_column_quantiles_mssql(
                column=column,
                quantiles=quantiles,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
        elif dialect.name.lower() == "bigquery":
            return _get_column_quantiles_bigquery(
                column=column,
                quantiles=quantiles,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
        elif dialect.name.lower() == "mysql":
            return _get_column_quantiles_mysql(
                column=column,
                quantiles=quantiles,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
        elif dialect.name.lower() == "snowflake":
            # NOTE: 20201216 - JPC - snowflake has a representation/precision limitation
            # in its percentile_disc implementation that causes an error when we do
            # not round. It is unclear to me *how* the call to round affects the behavior --
            # the binary representation should be identical before and after, and I do
            # not observe a type difference. However, the issue is replicable in the
            # snowflake console and directly observable in side-by-side comparisons with
            # and without the call to round()
            quantiles = [round(x, 10) for x in quantiles]
            return _get_column_quantiles_generic_sqlalchemy(
                column=column,
                quantiles=quantiles,
                allow_relative_error=allow_relative_error,
                dialect=dialect,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
        elif dialect.name.lower() == "sqlite":
            return _get_column_quantiles_sqlite(
                column=column,
                quantiles=quantiles,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
                table_row_count=table_row_count,
            )
        else:
            return _get_column_quantiles_generic_sqlalchemy(
                column=column,
                quantiles=quantiles,
                allow_relative_error=allow_relative_error,
                dialect=dialect,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
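
The method is a dispatch on dialect name: dialect-specific helpers for mssql, bigquery, mysql, and sqlite, a rounding workaround for snowflake, and a generic percentile implementation as the fallback. Ignoring the helpers' differing keyword arguments, the routing can be sketched as a dict lookup (the dict itself is illustrative; the _get_column_quantiles_* helpers are the ones referenced above):

dialect_dispatch = {
    "mssql": _get_column_quantiles_mssql,
    "bigquery": _get_column_quantiles_bigquery,
    "mysql": _get_column_quantiles_mysql,
    "sqlite": _get_column_quantiles_sqlite,
}

def pick_quantiles_impl(dialect_name: str):
    # Fall back to the generic SQLAlchemy implementation for unknown dialects.
    return dialect_dispatch.get(
        dialect_name.lower(), _get_column_quantiles_generic_sqlalchemy
    )
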