def _pandas(
        cls,
        execution_engine: PandasExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        """Compute permutation importances of every feature column.

        The table domain is split into a target (the column named by
        ``metric_value_kwargs["y_column"]``) and features (all remaining
        columns).  A ``LinearRegression`` is fitted on that split and handed to
        ``permutation_importance`` with 30 repeats, a fixed ``random_state`` of
        42 and ``"neg_mean_absolute_percentage_error"`` scoring.

        Returns:
            A dict mapping each feature column name to the corresponding entry
            of the result's ``importances_mean``.

        Raises:
            KeyError: if ``"y_column"`` is missing from ``metric_value_kwargs``.
        """
        table, _, _ = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
        )

        # Look the target name up once and reuse it for both halves of the split.
        target_name = metric_value_kwargs["y_column"]
        features = table.drop(columns=[target_name])
        target = table[target_name]

        fitted_model = LinearRegression().fit(features, target)
        result = permutation_importance(
            fitted_model,
            features,
            target,
            scoring="neg_mean_absolute_percentage_error",
            n_repeats=30,
            random_state=42,
        )

        return dict(zip(features.columns, result.importances_mean))
# Example #2
# 0
            def inner_func(
                cls,
                execution_engine: PandasExecutionEngine,
                metric_domain_kwargs: Dict,
                metric_value_kwargs: Dict,
                metrics: Dict[str, Any],
                runtime_configuration: Dict,
            ):
                """Resolve the column for the domain and delegate to ``metric_fn``.

                ``kwargs``, ``domain_type`` and ``metric_fn`` come from the
                enclosing scope, which is not part of this block.

                Rows whose column value is null are dropped first when
                ``filter_column_isnull`` is truthy; the flag is read from the
                enclosing ``kwargs``, then from a ``cls`` attribute of the same
                name, and defaults to ``False``.

                Raises:
                    ge_exceptions.InvalidMetricAccessorDomainKwargsKeyError: if
                        the column is not listed in ``metrics["table.columns"]``.
                """
                drop_nulls = kwargs.get(
                    "filter_column_isnull",
                    getattr(cls, "filter_column_isnull", False),
                )

                (
                    frame,
                    _,
                    accessor_domain_kwargs,
                ) = execution_engine.get_compute_domain(
                    domain_kwargs=metric_domain_kwargs, domain_type=domain_type
                )
                column_name = accessor_domain_kwargs["column"]

                # Validate against the already-resolved "table.columns" metric.
                known_columns = metrics["table.columns"]
                if column_name not in known_columns:
                    raise ge_exceptions.InvalidMetricAccessorDomainKwargsKeyError(
                        message=f'Error: The column "{column_name}" in BatchData does not exist.'
                    )

                if drop_nulls:
                    frame = frame[frame[column_name].notnull()]

                column_values = frame[column_name]
                return metric_fn(
                    cls,
                    column=column_values,
                    **metric_value_kwargs,
                    _metrics=metrics,
                )
    def _pandas(
        cls,
        execution_engine: PandasExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        """Count the column values lying between ``min_value`` and ``max_value``.

        Reads ``min_value``, ``max_value``, ``strict_min`` and ``strict_max``
        from ``metric_value_kwargs``.  Either bound may be omitted (``None``),
        in which case that side is unbounded.  A truthy ``strict_min`` /
        ``strict_max`` makes the matching comparison exclusive.

        Returns:
            The number of rows in the domain column that satisfy the bound(s),
            as computed by ``np.count_nonzero`` over the boolean mask.

        Raises:
            ValueError: if both bounds are ``None``, or if ``min_value`` is
                greater than ``max_value``.
        """
        min_value = metric_value_kwargs.get("min_value")
        max_value = metric_value_kwargs.get("max_value")
        strict_min = metric_value_kwargs.get("strict_min")
        strict_max = metric_value_kwargs.get("strict_max")
        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")

        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        val = df[accessor_domain_kwargs["column"]]

        # Build one element-wise mask per supplied bound.  At least one bound is
        # present (checked above), so ``series`` is always assigned.
        series = None
        if min_value is not None:
            series = (min_value < val) if strict_min else (min_value <= val)
        if max_value is not None:
            upper_mask = (val < max_value) if strict_max else (val <= max_value)
            # Masks must be combined with ``&``: Python's ``and`` would call
            # bool() on a whole Series, which raises "truth value of a Series
            # is ambiguous" instead of combining row by row.
            series = upper_mask if series is None else (series & upper_mask)

        return np.count_nonzero(series)
 def _pandas(
     cls,
     execution_engine: PandasExecutionEngine,
     metric_domain_kwargs: Dict,
     metric_value_kwargs: Dict,
     metrics: Dict[Tuple, Any],
     runtime_configuration: Dict,
 ):
     """Describe every column of the table domain by name and dtype.

     Returns:
         A list with one ``{"name": ..., "type": ...}`` dict per column, in
         column order, where ``"type"`` is the matching entry of ``dtypes``.
     """
     table, _, _ = execution_engine.get_compute_domain(
         metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)

     # Pair each column label with its dtype positionally.
     labelled_dtypes = zip(table.columns, table.dtypes)
     return [
         {"name": column_name, "type": column_dtype}
         for column_name, column_dtype in labelled_dtypes
     ]
# Example #5
# 0
 def _pandas(
     cls,
     execution_engine: PandasExecutionEngine,
     metric_domain_kwargs: Dict,
     metric_value_kwargs: Dict,
     metrics: Dict[Tuple, Any],
     runtime_configuration: Dict,
 ):
     """Return the whole table domain or only its first ``n_rows`` rows.

     ``fetch_all`` is read from ``metric_value_kwargs`` and falls back to
     ``cls.default_kwarg_values["fetch_all"]``.  When it is truthy the full
     frame is returned; otherwise ``head(metric_value_kwargs["n_rows"])``.

     Raises:
         KeyError: if ``fetch_all`` is falsy and ``"n_rows"`` is missing from
             ``metric_value_kwargs``.
     """
     table, _, _ = execution_engine.get_compute_domain(
         metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)

     # The class default is looked up unconditionally, exactly as a plain
     # ``dict.get(key, default)`` call evaluates its default argument.
     fetch_all = metric_value_kwargs.get(
         "fetch_all", cls.default_kwarg_values["fetch_all"])

     return table if fetch_all else table.head(metric_value_kwargs["n_rows"])
 def _pandas(
     cls,
     execution_engine: PandasExecutionEngine,
     metric_domain_kwargs: Dict,
     metric_value_kwargs: Dict,
     metrics: Dict[str, Any],
     runtime_configuration: Dict,
 ):
     """Return the histogram counts of the domain column.

     ``metric_value_kwargs["bins"]`` is forwarded to ``np.histogram`` as its
     ``bins`` argument.  Null entries of the column are excluded before
     binning.

     Returns:
         A list with one count per bin (``density=False``).

     Raises:
         KeyError: if ``"bins"`` is missing from ``metric_value_kwargs``.
     """
     df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
         domain_kwargs=metric_domain_kwargs,
         domain_type=MetricDomainTypes.COLUMN)
     column = accessor_domain_kwargs["column"]
     bins = metric_value_kwargs["bins"]

     column_series = df[column]
     # Nulls cannot be binned: with an integer bin count np.histogram cannot
     # derive a finite range from data containing NaN, and None values in an
     # object column break the numeric comparisons.  Drop them up front.
     non_null_values = column_series[column_series.notnull()]

     hist, _ = np.histogram(non_null_values, bins, density=False)
     return list(hist)
# Example #7
# 0
 def _pandas(
     cls,
     execution_engine: PandasExecutionEngine,
     metric_domain_kwargs: Dict,
     metric_value_kwargs: Dict,
     metrics: Dict[str, Any],
     runtime_configuration: Dict,
 ):
     """Return histogram counts of the non-null values of the domain column.

     ``metric_value_kwargs["bins"]`` is passed through to ``np.histogram``;
     rows where the column is null are removed before binning.

     Returns:
         A list with one count per bin (``density=False``).

     Raises:
         KeyError: if ``"bins"`` is missing from ``metric_value_kwargs``.
     """
     frame, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
         domain_kwargs=metric_domain_kwargs,
         domain_type=MetricDomainTypes.COLUMN)
     column = accessor_domain_kwargs["column"]
     bins = metric_value_kwargs["bins"]

     values = frame[column]
     # Keep only the entries that are present; notnull() is the complement of
     # the isnull() mask.
     present_values = values[values.notnull()]

     counts_per_bin = np.histogram(present_values, bins, density=False)[0]
     return list(counts_per_bin)
# Example #8
# 0
    def _pandas(
        cls,
        execution_engine: PandasExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        """Return the value counts of the domain column as a Series.

        ``sort`` and ``collate`` are read from ``metric_value_kwargs`` and
        fall back to ``cls.default_kwarg_values``.

        * ``sort="value"`` orders the result by the distinct values (the
          index).
        * ``sort="count"`` orders the result by descending count.
        * ``sort="none"`` leaves the result as ``value_counts()`` produced it.

        Returns:
            A Series named ``"count"`` whose index is named ``"value"``.

        Raises:
            ValueError: if ``sort`` is not one of the three accepted values,
                or if ``collate`` is not ``None``.
        """
        sort = metric_value_kwargs.get("sort",
                                       cls.default_kwarg_values["sort"])
        collate = metric_value_kwargs.get("collate",
                                          cls.default_kwarg_values["collate"])

        if sort not in ["value", "count", "none"]:
            raise ValueError("sort must be either 'value', 'count', or 'none'")
        if collate is not None:
            raise ValueError(
                "collate parameter is not supported in PandasDataset")

        df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
            metric_domain_kwargs, MetricDomainTypes.COLUMN)
        column = accessor_domain_kwargs["column"]

        counts = df[column].value_counts()
        if sort == "value":
            try:
                counts.sort_index(inplace=True)
            except TypeError:
                # Having values of multiple types in a object dtype column (e.g., strings and floats)
                # raises a TypeError when the sorting method performs comparisons.
                # Fall back to ordering by the string form of the values.
                if df[column].dtype == object:
                    counts.index = counts.index.astype(str)
                    counts.sort_index(inplace=True)
        elif sort == "count":
            # This branch used to test for "counts", a spelling the validation
            # above rejects, so it could never run.  value_counts() already
            # yields descending counts, which is what "count" callers have been
            # receiving; sort descending with a stable algorithm so that
            # ordering (including ties) is kept and now stated explicitly.
            counts = counts.sort_values(ascending=False, kind="mergesort")
        counts.name = "count"
        counts.index.name = "value"
        return counts