Example #1
 def filter_extracts(extracts: types.Extracts) -> types.Extracts:  # pylint: disable=invalid-name
     """Filters extracts."""
     if not include and not exclude:
         return extracts
     elif include:
         return {k: v for k, v in extracts.items() if k in include}
     else:
         assert exclude
         return {k: v for k, v in extracts.items() if k not in exclude}
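
A minimal usage sketch of the closure above, assuming `include`/`exclude` live in an enclosing scope as the snippet implies; the `make_filter` factory is a hypothetical wrapper added purely for illustration:

def make_filter(include=None, exclude=None):  # hypothetical factory
    def filter_extracts(extracts):
        if not include and not exclude:
            return extracts
        elif include:
            return {k: v for k, v in extracts.items() if k in include}
        else:
            return {k: v for k, v in extracts.items() if k not in exclude}
    return filter_extracts

extracts = {'features': 1, 'labels': 2, 'predictions': 3}
assert make_filter(include={'labels'})(extracts) == {'labels': 2}
assert make_filter(exclude={'labels'})(extracts) == {'features': 1, 'predictions': 3}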
Example #2
 def process(self, element: types.Extracts
            ) -> List[Tuple[SliceKeyType, types.Extracts]]:
   key_filter_fn = self._key_filter_fn  # Local cache.
   filtered = {k: v for k, v in element.items() if key_filter_fn(k)}
   result = [(slice_key, filtered)
             for slice_key in element.get(constants.SLICE_KEY_TYPES_KEY)]
   self._num_slices_generated_per_instance.update(len(result))
   self._post_slice_num_instances.inc(len(result))
   return result
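
The `_num_slices_generated_per_instance` and `_post_slice_num_instances` members are not defined in the snippet; a plausible setup, assuming the standard Beam metrics API (the class name and namespace below are placeholders, not TFMA's actual ones):

import apache_beam as beam
from apache_beam.metrics import Metrics

class FanoutSlicesDoFn(beam.DoFn):  # hypothetical name
    def __init__(self):
        # A distribution tracks per-element slice counts; a counter tracks totals.
        self._num_slices_generated_per_instance = Metrics.distribution(
            'fanout_slices', 'num_slices_generated_per_instance')
        self._post_slice_num_instances = Metrics.counter(
            'fanout_slices', 'post_slice_num_instances')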
Example #3
 def process(self, element: types.Extracts) -> Sequence[types.Extracts]:
     batch_size = element[constants.ARROW_RECORD_BATCH_KEY].num_rows
     try:
         result = self._batch_reducible_process(element)
         self._batch_size.update(batch_size)
         self._num_instances.inc(batch_size)
         return result
     except (ValueError, tf.errors.InvalidArgumentError) as e:
         logging.warning(
             'Large batch_size %s failed with error %s. '
             'Attempting to run batch through serially. Note that this will '
             'significantly affect performance.', batch_size, e)
         self._batch_size_failed.update(batch_size)
         result = []
         record_batch = element[constants.ARROW_RECORD_BATCH_KEY]
         for i in range(batch_size):
             self._batch_size.update(1)
             unbatched_element = {}
             for key in element.keys():
                 if key == constants.ARROW_RECORD_BATCH_KEY:
                     unbatched_element[key] = record_batch.slice(i, 1)
                 else:
                     unbatched_element[key] = [element[key][i]]
             result.extend(self._batch_reducible_process(unbatched_element))
         self._num_instances.inc(len(result))
         return result
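
The serial fallback above relies on `pyarrow.RecordBatch.slice(offset, length)`, which returns a zero-copy single-row view; a tiny standalone check:

import pyarrow as pa

record_batch = pa.RecordBatch.from_pydict({'feature': [1, 2, 3]})
for i in range(record_batch.num_rows):
    row = record_batch.slice(i, 1)  # zero-copy view of row i
    assert row.num_rows == 1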
Example #4
    def process(self, element: types.Extracts) -> List[types.Extracts]:
        fpl = element.get(constants.FEATURES_PREDICTIONS_LABELS_KEY)
        if not fpl:
            raise RuntimeError(
                'FPL missing; please ensure Predict() was called.')
        if not isinstance(fpl, types.FeaturesPredictionsLabels):
            raise TypeError(
                'Expected FPL to be an instance of FeaturesPredictionsLabels. FPL was: '
                '%s of type %s' % (str(fpl), type(fpl)))
        features = fpl.features
        slices = list(
            slicer.get_slices_for_features_dict(features, self._slice_spec))

        # Make a shallow copy so we don't mutate the original.
        element_copy = copy.copy(element)

        element_copy[constants.SLICE_KEY_TYPES_KEY] = slices
        # Add a list of stringified slice keys to be materialized to the output table.
        if self._materialize:
            element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
                name=constants.SLICE_KEYS_KEY,
                value=(list(
                    slicer.stringify_slice_key(x).encode('utf-8')
                    for x in slices)))
        return [element_copy]
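
`copy.copy` duplicates only the top-level mapping, which is exactly the property the comment above depends on; a quick demonstration of those semantics:

import copy

element = {'features': {'age': [29]}}
element_copy = copy.copy(element)
element_copy['slice_keys'] = [()]  # new key goes to the copy only
assert 'slice_keys' not in element
assert element_copy['features'] is element['features']  # values stay shared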
Example #5
def _convert_legacy_fpl(
        extracts: types.Extracts,
        example_weight_key: Union[Text, Dict[Text, Text]]) -> types.Extracts:
    """Converts from legacy FPL types to features, labels, predictions."""
    if constants.FEATURES_PREDICTIONS_LABELS_KEY not in extracts:
        return extracts

    remove_node = lambda d: {k: list(v.values())[0] for k, v in d.items()}
    remove_batch = lambda v: v[0] if len(v.shape) > 1 and v.shape[0] == 1 else v
    remove_batches = lambda d: {k: remove_batch(v) for k, v in d.items()}
    remove_default_key = lambda d: list(d.values())[0] if len(d) == 1 else d

    extracts = copy.copy(extracts)
    fpl = extracts.pop(constants.FEATURES_PREDICTIONS_LABELS_KEY)
    features = remove_node(fpl.features)
    example_weights = np.array([1.0])
    if example_weight_key:
        if isinstance(example_weight_key, dict):
            example_weights = {}
            for k, v in example_weight_key.items():
                example_weights[k] = remove_batch(features[v])
        else:
            example_weights = remove_batch(features[example_weight_key])
    labels = remove_default_key(remove_batches(remove_node(fpl.labels)))
    predictions = remove_default_key(
        remove_batches(remove_node(fpl.predictions)))
    extracts[constants.FEATURES_KEY] = features
    extracts[constants.PREDICTIONS_KEY] = predictions
    extracts[constants.LABELS_KEY] = labels
    extracts[constants.EXAMPLE_WEIGHTS_KEY] = example_weights
    return extracts
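
To see what the helper lambdas do in isolation, here is a toy run on numpy values shaped like legacy FPL output (the data is invented for illustration):

import numpy as np

remove_node = lambda d: {k: list(v.values())[0] for k, v in d.items()}
remove_batch = lambda v: v[0] if len(v.shape) > 1 and v.shape[0] == 1 else v

features = remove_node({'age': {'node': np.array([[29.0]])}})
assert list(features) == ['age']
assert remove_batch(features['age']).shape == (1,)  # (1, 1) -> (1,)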
Example #6
def _ExtractUnbatchedInputs(
        batched_extract: types.Extracts) -> Sequence[types.Extracts]:
    """Extract features, predictions, labels and weights from batched extract."""
    keys_to_retain = set(batched_extract.keys())
    keys_to_retain.remove(constants.ARROW_RECORD_BATCH_KEY)
    dataframe = pd.DataFrame()
    for key in keys_to_retain:
        dataframe[key] = batched_extract[key]
    return dataframe.to_dict(orient='records')
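
The unbatching here is a plain pandas round-trip: each retained key becomes a column of per-example values, and `to_dict(orient='records')` regroups them row by row, e.g.:

import pandas as pd

df = pd.DataFrame()
df['labels'] = [0, 1]
df['predictions'] = [0.2, 0.9]
assert df.to_dict(orient='records') == [
    {'labels': 0, 'predictions': 0.2},
    {'labels': 1, 'predictions': 0.9},
]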
Example #7
 def process(
         self, element: types.Extracts
 ) -> List[Tuple[SliceKeyType, types.Extracts]]:
     key_filter_fn = self._key_filter_fn  # Local cache.
     filtered = {k: v for k, v in element.items() if key_filter_fn(k)}
     slice_keys = element.get(constants.SLICE_KEY_TYPES_KEY)
     # The query-based evaluator will group slices from multiple examples, so we
     # deduplicate to avoid overcounting. Depending on whether the rows within a
     # batch have a variable or fixed length, either a VarLenTensorValue or a 2D
     # np.ndarray will be created.
     if isinstance(slice_keys, types.VarLenTensorValue):
         slice_keys = slice_keys.values
     elif isinstance(slice_keys, np.ndarray) and len(slice_keys.shape) == 2:
         slice_keys = slice_keys.flatten()
     result = [(slice_key, filtered) for slice_key in set(slice_keys)]
     self._num_slices_generated_per_instance.update(len(result))
     self._post_slice_num_instances.inc(len(result))
     return result
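
Deduplication via `set` works because each slice key is a hashable tuple; a sketch of the 2-D case using an object-dtype array (the key values are invented):

import numpy as np

slice_keys = np.empty((2, 2), dtype=object)  # 2 examples x 2 keys each
slice_keys[0, 0] = ('overall',)
slice_keys[0, 1] = ('gender', 'f')
slice_keys[1, 0] = ('overall',)
slice_keys[1, 1] = ('gender', 'm')
assert set(slice_keys.flatten()) == {('overall',), ('gender', 'f'), ('gender', 'm')}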
Example #8
    def process(self, extracts: types.Extracts) -> Iterable[Any]:
        start_time = datetime.datetime.now()
        self._evaluate_num_instances.inc(1)

        use_default_combiner_input = None
        features = None
        combiner_inputs = []
        for computation in self._computations:
            if computation.preprocessor is None:
                combiner_inputs.append(None)
                use_default_combiner_input = True
            elif isinstance(computation.preprocessor,
                            metric_types.FeaturePreprocessor):
                if features is None:
                    features = {}
                for v in computation.preprocessor.process(extracts):
                    features.update(v)
                combiner_inputs.append(None)
                use_default_combiner_input = True
            else:
                combiner_inputs.append(
                    next(computation.preprocessor.process(extracts)))

        output = {
            constants.SLICE_KEY_TYPES_KEY:
            extracts[constants.SLICE_KEY_TYPES_KEY],
            _COMBINER_INPUTS_KEY: combiner_inputs
        }
        if use_default_combiner_input:
            default_combiner_input = []
            if features is not None:
                extracts = copy.copy(extracts)
                extracts.update({constants.FEATURES_KEY: features})
            default_combiner_input = metric_util.to_standard_metric_inputs(
                extracts, include_features=features is not None)
            output[_DEFAULT_COMBINER_INPUT_KEY] = default_combiner_input
        yield output

        self._timer.update(
            int((datetime.datetime.now() - start_time).total_seconds()))
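
Note that the metric updates after `yield output` only run once the caller advances the generator past the yielded element; a toy generator makes the ordering explicit:

events = []

def gen():
    events.append('before yield')
    yield 'output'
    events.append('after yield')  # runs on the *next* advance

it = gen()
next(it)
assert events == ['before yield']
next(it, None)  # exhausts the generator
assert events == ['before yield', 'after yield']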
Example #9
 def process(
         self, element: types.Extracts
 ) -> List[Tuple[SliceKeyType, types.Extracts]]:
     key_filter_fn = self._key_filter_fn  # Local cache.
     filtered = {k: v for k, v in element.items() if key_filter_fn(k)}
     slice_keys = element.get(constants.SLICE_KEY_TYPES_KEY)
     # The query-based evaluator will group slices into a multi-dimensional array
     # with an extra dimension representing the examples matching the query key.
     # We need to flatten and dedup the slice keys.
     if _is_multi_dim_keys(slice_keys):
         arr = np.array(slice_keys)
         unique_keys = set()
         for k in arr.flatten():
             unique_keys.add(k)
         if not unique_keys and arr.shape:
             # If only the empty overall slice is in the array, it is removed by flatten().
             unique_keys.add(())
         slice_keys = unique_keys
     result = [(slice_key, filtered) for slice_key in slice_keys]
     self._num_slices_generated_per_instance.update(len(result))
     self._post_slice_num_instances.inc(len(result))
     return result
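
The empty-slice special case exists because numpy gives a batch of empty tuples a zero-length trailing dimension, which `flatten()` erases; a minimal reproduction:

import numpy as np

arr = np.array([[()], [()]])   # rows holding only the overall (empty) slice key
assert arr.shape == (2, 1, 0)  # the empty tuples become a 0-sized dimension
assert arr.flatten().size == 0  # flatten() drops them entirely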
Example #10
 def merge_lists(target: types.Extracts) -> types.Extracts:
     """Converts target's leaves which are lists to batched np.array's, etc."""
     if isinstance(target, Mapping):
         result = {}
         for key, value in target.items():
             try:
                 result[key] = merge_lists(value)
             except Exception as e:
                 raise RuntimeError(
                     'Failed to convert value for key "{}"'.format(
                         key)) from e
         return result
     elif target and isinstance(
             target[0], (tf.compat.v1.SparseTensorValue, types.SparseTensorValue)):
         t = tf.sparse.concat(
             0, [tf.sparse.expand_dims(to_tensorflow_tensor(t), 0) for t in target])
         return to_tensor_value(t)
     elif target and isinstance(target[0], types.RaggedTensorValue):
         t = tf.concat(
             [tf.expand_dims(to_tensorflow_tensor(t), 0) for t in target],
             0)
         return to_tensor_value(t)
     else:
         arr = np.array(target)
         # Flatten values that were originally single item lists into a single list
         # e.g. [[1], [2], [3]] -> [1, 2, 3]
         if len(arr.shape) == 2 and arr.shape[1] == 1:
             return arr.squeeze(axis=1)
         # Special case for empty slice arrays since numpy treats empty tuples as
         # arrays with dimension 0.
         # e.g. [[()], [()], [()]] -> [(), (), ()]
         elif len(arr.shape) == 3 and arr.shape[1] == 1 and arr.shape[2] == 0:
             return arr.squeeze(axis=1)
         else:
             return arr
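
The two squeeze branches normalize singleton wrappers; their effect can be checked in isolation:

import numpy as np

assert np.array([[1], [2], [3]]).squeeze(axis=1).tolist() == [1, 2, 3]

arr = np.array([[()], [()], [()]])  # shape (3, 1, 0), per the comment above
assert arr.squeeze(axis=1).shape == (3, 0)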
Example #11
def get_fpl_copy(extracts: types.Extracts) -> types.FeaturesPredictionsLabels:
    """Gets a copy of the FPL stored in the extracts."""
    fpl_orig = extracts.get(constants.FEATURES_PREDICTIONS_LABELS_KEY)
    if not fpl_orig:
        raise RuntimeError('FPL missing; please ensure _Predict() was called.')

    # We must make a copy of the FPL tuple as well, so that we don't mutate the
    # original which is disallowed by Beam.
    fpl_copy = types.FeaturesPredictionsLabels(
        features=copy.copy(fpl_orig.features),
        labels=fpl_orig.labels,
        predictions=fpl_orig.predictions,
        input_ref=fpl_orig.input_ref)
    return fpl_copy
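
Only `features` needs an explicit `copy.copy` because `FeaturesPredictionsLabels` behaves like an immutable namedtuple; the idea, modeled with a plain `collections.namedtuple` for illustration:

import collections
import copy

FPL = collections.namedtuple('FPL', ['features', 'predictions', 'labels'])
orig = FPL(features={'age': 29}, predictions={}, labels={})
fpl_copy = orig._replace(features=copy.copy(orig.features))
fpl_copy.features['age'] = 30
assert orig.features['age'] == 29  # original left untouched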
Example #12
 def merge_lists(target: types.Extracts) -> types.Extracts:
     """Converts target's leaves which are lists to batched np.array's, etc."""
     if isinstance(target, Mapping):
         result = {}
         for key, value in target.items():
             try:
                 result[key] = merge_lists(value)
             except Exception as e:
                 raise RuntimeError(
                     'Failed to convert value for key "{}"'.format(
                         key)) from e
         return result
     elif target and isinstance(
             target[0], (tf.compat.v1.SparseTensorValue, types.SparseTensorValue)):
         t = tf.compat.v1.sparse_concat(
             0,
             [tf.sparse.expand_dims(to_tensorflow_tensor(t), 0) for t in target],
             expand_nonconcat_dim=True)
         return to_tensor_value(t)
     elif target and isinstance(target[0], types.RaggedTensorValue):
         t = tf.concat(
             [tf.expand_dims(to_tensorflow_tensor(t), 0) for t in target],
             0)
         return to_tensor_value(t)
     elif (all(isinstance(t, np.ndarray) for t in target)
           and len({t.shape for t in target}) > 1):
         return types.VarLenTensorValue.from_dense_rows(target)
     else:
         arr = np.array(target)
         # Flatten values that were originally single item lists into a single list
         # e.g. [[1], [2], [3]] -> [1, 2, 3]
         if len(arr.shape) == 2 and arr.shape[1] == 1:
             return arr.squeeze(axis=1)
         return arr
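
The ragged-batch branch fires when the rows are numpy arrays with more than one distinct shape; `types.VarLenTensorValue` is TFMA-specific, but the trigger condition itself is plain Python:

import numpy as np

target = [np.array([1, 2]), np.array([3])]  # rows of unequal length
assert all(isinstance(t, np.ndarray) for t in target)
assert len({t.shape for t in target}) > 1  # ragged: more than one row shape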
Example #13
  def process(self, element: types.Extracts
             ) -> Generator[Tuple[SliceKeyType, types.Extracts], None, None]:
    filtered = {}
    for key in element:
      if not self._include_slice_keys_in_output and key in (
          constants.SLICE_KEY_TYPES_KEY, constants.SLICE_KEYS_KEY):
        continue
      filtered[key] = element[key]
    slice_count = 0
    for slice_key in element.get(constants.SLICE_KEY_TYPES_KEY):
      slice_count += 1
      yield (slice_key, filtered)

    self._num_slices_generated_per_instance.update(slice_count)
    self._post_slice_num_instances.inc(slice_count)
Example #14
def _extract_unbatched_inputs(  # pylint: disable=invalid-name
    mixed_legacy_batched_extract: types.Extracts) -> Sequence[types.Extracts]:
    """Extract features, predictions, labels and weights from batched extract."""
    batched_extract = {}
    # TODO(mdreves): Remove record batch
    keys_to_retain = set(mixed_legacy_batched_extract.keys())
    if constants.ARROW_RECORD_BATCH_KEY in keys_to_retain:
        keys_to_retain.remove(constants.ARROW_RECORD_BATCH_KEY)
    dataframe = pd.DataFrame()
    for key in keys_to_retain:
        # Previously a batch of transformed features was stored as a list of
        # dicts instead of a dict of np.arrays with batch dimensions. These
        # legacy conversions are done using dataframes instead.
        if isinstance(mixed_legacy_batched_extract[key], list):
            try:
                dataframe[key] = mixed_legacy_batched_extract[key]
            except Exception as e:
                raise RuntimeError(
                    f'Exception encountered while adding key {key} with '
                    f'batched length {len(mixed_legacy_batched_extract[key])}'
                ) from e
        else:
            batched_extract[key] = mixed_legacy_batched_extract[key]
    unbatched_extracts = util.split_extracts(batched_extract)
    legacy_unbatched_extracts = dataframe.to_dict(orient='records')
    if unbatched_extracts and legacy_unbatched_extracts:
        if len(unbatched_extracts) != len(legacy_unbatched_extracts):
            raise ValueError(
                f'Batch sizes have differing values: {len(unbatched_extracts)} != '
                f'{len(legacy_unbatched_extracts)}, '
                f'unbatched_extracts={unbatched_extracts}, '
                f'legacy_unbatched_extracts={legacy_unbatched_extracts}')
        result = []
        for unbatched_extract, legacy_unbatched_extract in zip(
                unbatched_extracts, legacy_unbatched_extracts):
            legacy_unbatched_extract.update(unbatched_extract)
            result.append(legacy_unbatched_extract)
        return result
    elif legacy_unbatched_extracts:
        return legacy_unbatched_extracts
    else:
        return unbatched_extracts
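
When both modern and legacy rows exist, they are zipped and merged per example, with the modern keys winning on collision; schematically:

unbatched = [{'features': 'new'}]
legacy = [{'features': 'old', 'labels': 1}]
merged = []
for new_row, legacy_row in zip(unbatched, legacy):
    legacy_row.update(new_row)  # modern extract overrides legacy keys
    merged.append(legacy_row)
assert merged == [{'features': 'new', 'labels': 1}]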
Example #15
 def visit(subtree: types.Extracts, keys: List[str]):
     for key, value in subtree.items():
         if isinstance(value, Mapping):
             visit(value, keys + [key])
         else:
             add_to_results(keys + [key], value)
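
A self-contained version of this visitor, with a hypothetical `add_to_results` sink that joins the key path, shows the flattening it performs:

from collections.abc import Mapping
from typing import Any, List

results = {}

def add_to_results(keys: List[str], value: Any) -> None:
    results['/'.join(keys)] = value  # hypothetical sink: flat path -> value

def visit(subtree: Mapping, keys: List[str]) -> None:
    for key, value in subtree.items():
        if isinstance(value, Mapping):
            visit(value, keys + [key])
        else:
            add_to_results(keys + [key], value)

visit({'a': {'b': 1}, 'c': 2}, [])
assert results == {'a/b': 1, 'c': 2}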