def process(self, element: types.Extracts) -> List[types.Extracts]:
    """Attaches slice keys (and optionally stringified keys) to the extract.

    Slices are matched first against transformed features (when present in
    the extract) and then against the raw features.
    """
    feature_candidates = []
    transformed = element.get(constants.TRANSFORMED_FEATURES_KEY)
    if transformed is not None:
        if self._eval_config and len(self._eval_config.model_specs) > 1:
            # Multi-model case: transformed features are keyed by model name,
            # so collect the per-model feature dicts that are present.
            for model_spec in self._eval_config.model_specs:
                if model_spec.name in transformed:
                    feature_candidates.append(transformed[model_spec.name])
        else:
            # Single model (or no config): output is stored without keying
            # on the model name.
            feature_candidates.append(transformed)
    slices = list(
        slicer.get_slices_for_features_dicts(
            feature_candidates, util.get_features_from_extracts(element),
            self._slice_spec))
    # Shallow-copy so the input extract is left untouched.
    element_copy = copy.copy(element)
    element_copy[constants.SLICE_KEY_TYPES_KEY] = slices
    if self._materialize:
        # Also emit human-readable slice keys for the materialized output
        # table.
        element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
            name=constants.SLICE_KEYS_KEY,
            value=[
                slicer.stringify_slice_key(s).encode('utf-8') for s in slices
            ])
    return [element_copy]
def testGetFeaturesFromExtracts(self):
    """get_features_from_extracts handles FPL, plain features, and empty."""
    expected = {'a': np.array([1])}
    # Features stored inside a FeaturesPredictionsLabels tuple.
    fpl = types.FeaturesPredictionsLabels(
        input_ref=0,
        features={'a': np.array([1])},
        predictions={},
        labels={})
    self.assertEqual(
        expected,
        util.get_features_from_extracts(
            {constants.FEATURES_PREDICTIONS_LABELS_KEY: fpl}))
    # Features stored directly under FEATURES_KEY.
    self.assertEqual(
        expected,
        util.get_features_from_extracts(
            {constants.FEATURES_KEY: {'a': np.array([1])}}))
    # Empty extracts yield an empty features dict.
    self.assertEqual({}, util.get_features_from_extracts({}))
def process(self, element: types.Extracts) -> List[types.Extracts]:
    """Adds bucketized (binned) copies of configured features to the extract.

    For each feature with configured bucket boundaries, a new feature named
    TRANSFORMED_FEATURE_PREFIX + feature_name is added whose values are the
    bin assignments of the original values.

    Args:
      element: Input extract; not mutated (a deep copy is returned).

    Returns:
      A single-element list containing the augmented copy of the extract.
    """
    # Deep copy since the nested features dict is mutated below.
    element_copy = copy.deepcopy(element)
    features = util.get_features_from_extracts(element_copy)
    for feature_name, boundaries in self._bucket_boundaries.items():
        # Guard against missing, None, or empty feature values — previously a
        # None value crashed the iteration; this also keeps the behavior
        # consistent with the statistics-driven variant of this DoFn.
        if (feature_name in features and features[feature_name] is not None
                and features[feature_name].size > 0):
            features[TRANSFORMED_FEATURE_PREFIX + feature_name] = np.array(
                [_bin_value(v, boundaries) for v in features[feature_name]])
    return [element_copy]
def process(self, element: types.Extracts) -> List[types.Extracts]:
    """Attaches slice keys computed from the extract's features."""
    feats = util.get_features_from_extracts(element)
    slices = list(
        slicer.get_slices_for_features_dict(feats, self._slice_spec))
    # Shallow-copy so the original extract stays unmodified.
    out = copy.copy(element)
    out[constants.SLICE_KEY_TYPES_KEY] = slices
    if self._materialize:
        # Emit stringified slice keys for the materialized output table.
        out[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
            name=constants.SLICE_KEYS_KEY,
            value=[
                slicer.stringify_slice_key(s).encode('utf-8') for s in slices
            ])
    return [out]
def process(
    self, element: types.Extracts,
    slice_spec: List[slicer.SingleSliceSpec]) -> List[types.Extracts]:
    """Attaches deduplicated slice keys to the extract.

    Slices are matched against transformed features (when present) and then
    the raw features; any slice keys already produced upstream (e.g. by the
    SqlSliceKeyExtractor) are merged in before deduplication.
    """
    feature_dicts = []
    transformed = element.get(constants.TRANSFORMED_FEATURES_KEY)
    if transformed is not None:
        if self._eval_config and len(self._eval_config.model_specs) > 1:
            # Multi-model case: transformed features are keyed by model name.
            for model_spec in self._eval_config.model_specs:
                if model_spec.name in transformed:
                    feature_dicts.append(transformed[model_spec.name])
        else:
            # Single model (or no config): output is stored without keying
            # on the model name.
            feature_dicts.append(transformed)
    slice_keys = list(
        slicer.get_slices_for_features_dicts(
            feature_dicts, util.get_features_from_extracts(element),
            slice_spec))
    # Merge any slice keys generated upstream under SLICE_KEY_TYPES_KEY.
    existing_keys = element.get(constants.SLICE_KEY_TYPES_KEY)
    if existing_keys:
        slice_keys.extend(existing_keys)
    unique_slice_keys = list(set(slice_keys))
    if len(unique_slice_keys) != len(slice_keys):
        # Duplicates were dropped; count the occurrence for monitoring.
        self._duplicate_slice_keys_counter.inc()
    # Shallow-copy so the input extract is left untouched.
    element_copy = copy.copy(element)
    element_copy[constants.SLICE_KEY_TYPES_KEY] = unique_slice_keys
    if self._materialize:
        # Also emit human-readable slice keys for the materialized output
        # table.
        element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
            name=constants.SLICE_KEYS_KEY,
            value=[
                slicer.stringify_slice_key(k).encode('utf-8')
                for k in unique_slice_keys
            ])
    return [element_copy]
def process(
    self, element: types.Extracts,
    statistics: statistics_pb2.DatasetFeatureStatisticsList
) -> List[types.Extracts]:
    """Adds bucketized copies of features using quantile boundaries.

    Bucket boundaries are derived lazily from the dataset statistics side
    input on the first call and cached on the instance.
    """
    if self._bucket_boundaries is None:
        # Lazily derive quantile boundaries from the statistics side input.
        self._bucket_boundaries = get_quantile_boundaries(statistics)
    # Deep copy since the nested features dict is mutated below.
    element_copy = copy.deepcopy(element)
    features = util.get_features_from_extracts(element_copy)
    for feature_name, boundaries in self._bucket_boundaries.items():
        values = features.get(feature_name)
        # Skip features that are absent, None, or empty.
        if values is None or values.size == 0:
            continue
        features[TRANSFORMED_FEATURE_PREFIX + feature_name] = np.array(
            [_bin_value(v, boundaries) for v in values])
    return [element_copy]