def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Score each feature column by permutation importance for predicting the target.

    Fits a ``LinearRegression`` of ``metric_value_kwargs["y_column"]`` on all
    remaining columns of the table domain, then computes permutation
    importances (30 repeats, fixed seed 42, negative-MAPE scoring).

    Returns:
        Dict mapping each feature column name to its mean permutation importance.
    """
    frame, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    target_name = metric_value_kwargs["y_column"]
    features = frame.drop(columns=[target_name])
    target = frame[target_name]
    fitted_model = LinearRegression().fit(features, target)
    importance_result = permutation_importance(
        fitted_model,
        features,
        target,
        n_repeats=30,
        random_state=42,
        scoring="neg_mean_absolute_percentage_error",
    )
    return dict(zip(features.columns, importance_result.importances_mean))
def inner_func(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """Resolve the column domain, optionally drop nulls, and delegate to the wrapped metric.

    NOTE(review): ``kwargs``, ``domain_type`` and ``metric_fn`` are free
    variables here — this function is presumably defined inside a decorator
    closure (a partial factory) that supplies them; confirm against the
    enclosing scope before modifying.
    """
    # Decorator-level kwargs take precedence over the class attribute for
    # deciding whether null rows are filtered before the metric runs.
    filter_column_isnull = kwargs.get(
        "filter_column_isnull", getattr(cls, "filter_column_isnull", False))
    df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=domain_type)
    column_name = accessor_domain_kwargs["column"]
    # Fail fast if the requested column is not part of the batch's schema
    # (as reported by the previously computed "table.columns" metric).
    if column_name not in metrics["table.columns"]:
        raise ge_exceptions.InvalidMetricAccessorDomainKwargsKeyError(
            message=
            f'Error: The column "{column_name}" in BatchData does not exist.'
        )
    if filter_column_isnull:
        df = df[df[column_name].notnull()]
    # Hand the (optionally null-filtered) column series to the wrapped metric
    # function, forwarding value kwargs and the metrics dependency dict.
    return metric_fn(
        cls,
        column=df[column_name],
        **metric_value_kwargs,
        _metrics=metrics,
    )
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Count the column values that fall within the configured range.

    Reads ``min_value`` / ``max_value`` bounds (with ``strict_min`` /
    ``strict_max`` toggling exclusive comparison) from ``metric_value_kwargs``
    and returns the number of rows satisfying the resulting condition.

    Raises:
        ValueError: If both bounds are None, or min_value > max_value.
    """
    min_value = metric_value_kwargs.get("min_value")
    max_value = metric_value_kwargs.get("max_value")
    strict_min = metric_value_kwargs.get("strict_min")
    strict_max = metric_value_kwargs.get("strict_max")
    if min_value is None and max_value is None:
        raise ValueError("min_value and max_value cannot both be None")
    if min_value is not None and max_value is not None and min_value > max_value:
        raise ValueError("min_value cannot be greater than max_value")
    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN)
    val = df[accessor_domain_kwargs["column"]]
    if min_value is not None and max_value is not None:
        # BUG FIX: the original combined the two comparisons with the Python
        # `and` operator. `and` on a pandas Series calls Series.__bool__ and
        # raises "The truth value of a Series is ambiguous"; element-wise
        # conjunction requires the `&` operator (parentheses preserved for
        # precedence, since `&` binds tighter than `<`).
        if strict_min and strict_max:
            series = (min_value < val) & (val < max_value)
        elif strict_min:
            series = (min_value < val) & (val <= max_value)
        elif strict_max:
            series = (min_value <= val) & (val < max_value)
        else:
            series = (min_value <= val) & (val <= max_value)
    elif min_value is None and max_value is not None:
        if strict_max:
            series = val < max_value
        else:
            series = val <= max_value
    elif min_value is not None and max_value is None:
        if strict_min:
            series = min_value < val
        else:
            series = min_value <= val
    else:
        raise ValueError("unable to parse domain and value kwargs")
    # Count of True entries in the boolean mask == rows inside the range.
    return np.count_nonzero(series)
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Describe the table schema as a list of ``{"name": ..., "type": ...}`` records.

    Returns one dict per column, pairing the column label with its pandas dtype,
    in column order.
    """
    frame, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)
    schema = []
    for column_label, column_dtype in zip(frame.columns, frame.dtypes):
        schema.append({"name": column_label, "type": column_dtype})
    return schema
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Return the first ``n_rows`` of the table, or the whole table when ``fetch_all`` is set.

    ``fetch_all`` falls back to the class-level default when not provided in
    ``metric_value_kwargs``.
    """
    frame, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)
    fetch_all = metric_value_kwargs.get(
        "fetch_all", cls.default_kwarg_values["fetch_all"])
    if not fetch_all:
        return frame.head(metric_value_kwargs["n_rows"])
    return frame
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """Histogram bin counts for the column using the caller-supplied bin edges.

    Returns the counts as a plain list; the bin edges returned by
    ``np.histogram`` are discarded. Null values are NOT filtered here — see
    the non-null histogram variant for that behavior.
    """
    frame, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN)
    target_column = accessor_domain_kwargs["column"]
    bin_edges = metric_value_kwargs["bins"]
    counts, _ = np.histogram(frame[target_column], bin_edges, density=False)
    return list(counts)
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    """Histogram bin counts over the non-null values of the column.

    Null entries are dropped before binning so they cannot distort the
    counts; the bin edges returned by ``np.histogram`` are discarded.
    """
    frame, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN)
    target_column = accessor_domain_kwargs["column"]
    bin_edges = metric_value_kwargs["bins"]
    values: pd.Series = frame[target_column]
    # notnull() mask is equivalent to the original ~isnull() filtering.
    nonnull_values: pd.Series = values[values.notnull()]
    counts, _ = np.histogram(nonnull_values, bin_edges, density=False)
    return list(counts)
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    """Value counts for the column, optionally sorted by value or by count.

    ``sort`` must be one of "value", "count", or "none" (class default used
    when absent). The returned Series is named "count" with its index named
    "value".

    Raises:
        ValueError: If ``sort`` is invalid, or ``collate`` is provided
            (collation is unsupported for pandas).
    """
    sort = metric_value_kwargs.get("sort", cls.default_kwarg_values["sort"])
    collate = metric_value_kwargs.get("collate",
                                      cls.default_kwarg_values["collate"])
    if sort not in ["value", "count", "none"]:
        raise ValueError("sort must be either 'value', 'count', or 'none'")
    if collate is not None:
        raise ValueError(
            "collate parameter is not supported in PandasDataset")
    df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        metric_domain_kwargs, MetricDomainTypes.COLUMN)
    column = accessor_domain_kwargs["column"]
    counts = df[column].value_counts()
    if sort == "value":
        try:
            counts.sort_index(inplace=True)
        except TypeError:
            # Having values of multiple types in a object dtype column (e.g., strings and floats)
            # raises a TypeError when the sorting method performs comparisons.
            if df[column].dtype == object:
                counts.index = counts.index.astype(str)
                counts.sort_index(inplace=True)
    # BUG FIX: the original branch tested `sort == "counts"`, but the
    # validation above only admits "value"/"count"/"none", so requesting
    # sort="count" silently left the series unsorted. Compare against the
    # actual accepted token.
    elif sort == "count":
        counts.sort_values(inplace=True)
    counts.name = "count"
    counts.index.name = "value"
    return counts