예제 #1
0
def _AugmentExtracts(fpl_dict, example_and_extracts):
    """Copies FeaturesPredictionsLabels values into the extracts dict.

    Args:
      fpl_dict: The dictionary returned by evaluate._Predict().
      example_and_extracts: The ExampleAndExtracts to augment; its extracts
        dict is mutated in-place.

    Raises:
      TypeError: If a value in the dictionary has an unsupported type.
    """
    for key in fpl_dict:
        node_value = fpl_dict[key].get(encoding.NODE_SUFFIX)

        if isinstance(node_value, tf.SparseTensorValue):
            # Cap the number of sparse values materialized per column.
            materialized = types.MaterializedColumn(
                name=key,
                value=node_value.values[0:_MAX_SPARSE_FEATURES_PER_COLUMN])
        elif isinstance(node_value, np.ndarray):
            first_dim = node_value[0]  # only support first dim for now.
            if np.isscalar(first_dim):
                column_value = first_dim
            else:
                column_value = first_dim[0:_MAX_SPARSE_FEATURES_PER_COLUMN]
            materialized = types.MaterializedColumn(name=key,
                                                    value=column_value)
        else:
            raise TypeError(
                'Dictionary item with key %s, value %s had unexpected type %s'
                % (key, node_value, type(node_value)))

        example_and_extracts.extracts[key] = materialized
예제 #2
0
def _AugmentExtracts(fpl_dict, example_and_extracts):
    """Augments the ExampleAndExtracts with FeaturesPredictionsLabels.

    Args:
      fpl_dict: The dictionary returned by evaluate._Predict().
      example_and_extracts: The ExampleAndExtracts to be augmented -- note
        that this variable is modified (i.e. both an input and output).

    Raises:
      TypeError: If the FeaturesPredictionsLabels is corrupt.
    """
    # dict.items() works on both Python 2 and 3; iteritems() is Python 2 only
    # and breaks under Python 3.
    for name, val in fpl_dict.items():
        val = val.get(encoding.NODE_SUFFIX)

        if isinstance(val, tf.SparseTensorValue):
            # Cap the number of sparse values materialized per column.
            example_and_extracts.extracts[name] = types.MaterializedColumn(
                name=name, value=val.values[0:_MAX_SPARSE_FEATURES_PER_COLUMN])

        elif isinstance(val, np.ndarray):
            val = val[0]  # only support first dim for now.
            if not np.isscalar(val):
                val = val[0:_MAX_SPARSE_FEATURES_PER_COLUMN]
            example_and_extracts.extracts[name] = types.MaterializedColumn(
                name=name, value=val)

        else:
            raise TypeError('Unexpected fpl type: %s' % str(val))
예제 #3
0
    def process(self, element: types.Extracts) -> List[types.Extracts]:
        """Adds slice keys to a shallow copy of the given extracts."""
        # Prefer slicing on transformed features when they are available.
        candidate_feature_dicts = []
        transformed = element.get(constants.TRANSFORMED_FEATURES_KEY)
        if transformed is not None:
            single_model = (not self._eval_config
                            or len(self._eval_config.model_specs) == 1)
            if single_model:
                # A single model's output is stored without a model-name key.
                candidate_feature_dicts.append(transformed)
            else:
                # Otherwise look up each model's transformed features by name.
                for model_spec in self._eval_config.model_specs:
                    if model_spec.name in transformed:
                        candidate_feature_dicts.append(
                            transformed[model_spec.name])
        # Match slices against transformed features first, then raw features.
        slices = list(
            slicer.get_slices_for_features_dicts(
                candidate_feature_dicts,
                util.get_features_from_extracts(element), self._slice_spec))

        # Shallow-copy so the incoming element is never mutated.
        element_copy = copy.copy(element)
        element_copy[constants.SLICE_KEY_TYPES_KEY] = slices

        # Optionally attach stringified slice keys for the output table.
        if self._materialize:
            element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
                name=constants.SLICE_KEYS_KEY,
                value=[
                    slicer.stringify_slice_key(s).encode('utf-8')
                    for s in slices
                ])
        return [element_copy]
예제 #4
0
def _ParseExample(extracts: types.Extracts,
                  materialize_columns: bool = True) -> None:
    """Feature extraction from serialized tf.Example.

    Parses the serialized tf.Example stored under constants.INPUT_KEY and,
    for each feature, optionally adds a MaterializedColumn to the extracts
    and fills in any feature missing from the FPL features dict.

    Args:
      extracts: The Extracts to read from; mutated in-place.
      materialize_columns: If True, add a MaterializedColumn per feature.
    """
    # Deserialize the example. Parsing is best-effort: on failure we log a
    # warning and fall through with an empty example (the loop is a no-op).
    example = tf.train.Example()
    try:
        example.ParseFromString(extracts[constants.INPUT_KEY])
    except Exception:  # pylint: disable=broad-except
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        logging.warning('Could not parse tf.Example from the input source.')

    features = {}
    if constants.FEATURES_PREDICTIONS_LABELS_KEY in extracts:
        features = extracts[constants.FEATURES_PREDICTIONS_LABELS_KEY].features

    for name in example.features.feature:
        if materialize_columns or name not in features:
            key = util.compound_key(['features', name])
            value = example.features.feature[name]
            if value.HasField('bytes_list'):
                values = list(value.bytes_list.value)
            elif value.HasField('float_list'):
                values = list(value.float_list.value)
            elif value.HasField('int64_list'):
                values = list(value.int64_list.value)
            else:
                # Skip features with no value list set: previously `values`
                # was left unbound here (NameError on the first iteration) or
                # silently reused the previous feature's values.
                continue
            if materialize_columns:
                extracts[key] = types.MaterializedColumn(name=key,
                                                         value=values)
            if name not in features:
                features[name] = {encoding.NODE_SUFFIX: np.array([values])}
    def process(self, element: types.Extracts) -> List[types.Extracts]:
        """Computes slice keys from FPL features and returns a copied element."""
        fpl = element.get(constants.FEATURES_PREDICTIONS_LABELS_KEY)
        if not fpl:
            raise RuntimeError(
                'FPL missing, Please ensure Predict() was called.')
        if not isinstance(fpl, types.FeaturesPredictionsLabels):
            raise TypeError(
                'Expected FPL to be instance of FeaturesPredictionsLabel. FPL was: '
                '%s of type %s' % (str(fpl), type(fpl)))
        slices = list(
            slicer.get_slices_for_features_dict(fpl.features,
                                                self._slice_spec))

        # Work on a shallow copy so the original element is untouched.
        element_copy = copy.copy(element)
        element_copy[constants.SLICE_KEY_TYPES_KEY] = slices

        # Optionally attach stringified slice keys for the output table.
        if self._materialize:
            element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
                name=constants.SLICE_KEYS_KEY,
                value=[
                    slicer.stringify_slice_key(s).encode('utf-8')
                    for s in slices
                ])
        return [element_copy]
    def testMaterializeFeaturesNoMaterializedColumns(self):
        """Checks FPL features/predictions/labels become MaterializedColumns."""
        example1 = self._makeExample(age=3.0,
                                     language='english',
                                     label=1.0,
                                     slice_key='first_slice')

        # Build an FPL with one dense ('f') and one sparse ('s') feature.
        features = {
            'f': {
                encoding.NODE_SUFFIX: np.array([1])
            },
            's': {
                encoding.NODE_SUFFIX:
                tf.compat.v1.SparseTensorValue(indices=[[0, 5], [1, 2], [3,
                                                                         6]],
                                               values=[100., 200., 300.],
                                               dense_shape=[4, 10])
            }
        }
        predictions = {'p': {encoding.NODE_SUFFIX: np.array([2])}}
        labels = {'l': {encoding.NODE_SUFFIX: np.array([3])}}

        extracts = {
            constants.INPUT_KEY:
            example1.SerializeToString(),
            constants.FEATURES_PREDICTIONS_LABELS_KEY:
            types.FeaturesPredictionsLabels(input_ref=0,
                                            features=features,
                                            predictions=predictions,
                                            labels=labels)
        }
        fpl = extracts[constants.FEATURES_PREDICTIONS_LABELS_KEY]
        result = feature_extractor._MaterializeFeatures(extracts)
        self.assertIsInstance(result, dict)
        self.assertEqual(result[constants.FEATURES_PREDICTIONS_LABELS_KEY],
                         fpl)  # should still be there.
        # Each FPL entry is exposed under a prefixed compound key.
        self.assertEqual(
            result['features__f'],
            types.MaterializedColumn(name='features__f', value=[1]))
        self.assertEqual(
            result['predictions__p'],
            types.MaterializedColumn(name='predictions__p', value=[2]))
        self.assertEqual(result['labels__l'],
                         types.MaterializedColumn(name='labels__l', value=[3]))
        # Sparse features materialize just their values, not indices/shape.
        self.assertEqual(
            result['features__s'],
            types.MaterializedColumn(name='features__s',
                                     value=[100., 200., 300.]))
예제 #7
0
def _AugmentExtracts(data: Dict[Text, Any], prefix: Text,
                     excludes: List[bytes], extracts: types.Extracts) -> None:
    """Adds materialized columns built from `data` into `extracts` in-place.

    Args:
      data: Data dictionary returned by PredictExtractor.
      prefix: Prefix to use in column naming (e.g. 'features', 'labels', etc).
      excludes: Names of features, predictions, or labels to exclude from
        materialization, or None.
      extracts: The Extracts to be augmented; mutated in-place.

    Raises:
      TypeError: If a value has a type that cannot be materialized.
    """
    for name, value in data.items():
        if excludes is not None and name in excludes:
            continue
        # Values that originated from FeaturesPredictionsLabels are nested
        # under a 'node' key; unwrap them first.
        if isinstance(value, dict) and encoding.NODE_SUFFIX in value:
            value = value.get(encoding.NODE_SUFFIX)

        if name in (prefix, util.KEY_SEPARATOR + prefix):
            col_name = prefix
        elif prefix in ('features', 'predictions', 'labels'):
            col_name = util.compound_key([prefix, name])
        else:
            # Names used by additional extracts are already escaped, so join
            # manually rather than escaping the name a second time.
            col_name = prefix + util.KEY_SEPARATOR + name

        if isinstance(value, tf.compat.v1.SparseTensorValue):
            # Cap the number of sparse values materialized per column.
            extracts[col_name] = types.MaterializedColumn(
                name=col_name,
                value=value.values[0:_MAX_SPARSE_FEATURES_PER_COLUMN])
        elif isinstance(value, np.ndarray):
            value = value[0]  # only support first dim for now.
            if not np.isscalar(value):
                value = value[0:_MAX_SPARSE_FEATURES_PER_COLUMN]
            extracts[col_name] = types.MaterializedColumn(name=col_name,
                                                          value=value)
        else:
            raise TypeError(
                'Dictionary item with key %s, value %s had unexpected type %s'
                % (name, value, type(value)))
 def check_result(got):
     """Asserts both gender slices are present with expected slice keys."""
     try:
         self.assertLen(got, 2)
         expected = sorted([
             types.MaterializedColumn(name=constants.SLICE_KEYS_KEY,
                                      value=[b'Overall', b'gender:f']),
             types.MaterializedColumn(name=constants.SLICE_KEYS_KEY,
                                      value=[b'Overall', b'gender:m'])
         ])
         actual = []
         for element in got:
             self.assertIn(constants.SLICE_KEYS_KEY, element)
             actual.append(element[constants.SLICE_KEYS_KEY])
         self.assertCountEqual(actual, expected)
     except AssertionError as err:
         # Re-raise so Beam's assert_that reports the failure.
         raise util.BeamAssertException(err)
 def check_result(got):
   """Checks that both gender slices were produced with expected slice keys.

   Args:
     got: List of extracts emitted by the pipeline under test.
   """
   try:
     self.assertEqual(2, len(got), 'got: %s' % got)
     expected_results = sorted([
         types.MaterializedColumn(
             name=constants.SLICE_KEYS_KEY,
             value=[b'Overall', b'gender:f']),
         types.MaterializedColumn(
             name=constants.SLICE_KEYS_KEY,
             value=[b'Overall', b'gender:m'])
     ])
     got_results = []
     for item in got:
       # assertIn gives a clearer failure message than assertTrue(x in y)
       # and matches the sibling check_result helpers.
       self.assertIn(constants.SLICE_KEYS_KEY, item)
       got_results.append(item[constants.SLICE_KEYS_KEY])
     self.assertEqual(sorted(got_results), sorted(expected_results))
   except AssertionError as err:
     # Re-raise so Beam's assert_that reports the failure.
     raise util.BeamAssertException(err)
  def testMaterializeFeaturesFromTfExample(self):
    """Materializes feature columns directly from a serialized tf.Example."""
    example1 = self._makeExample(age=3.0, language='english', label=1.0)

    extracts = {constants.INPUT_KEY: example1.SerializeToString()}
    serialized = extracts[constants.INPUT_KEY]
    result = feature_extractor._MaterializeFeatures(
        extracts, source=constants.INPUT_KEY)
    self.assertIsInstance(result, dict)
    # The raw input should be passed through untouched.
    self.assertEqual(result[constants.INPUT_KEY], serialized)
    expected_columns = {
        'features__age':
            types.MaterializedColumn(name='features__age', value=[3.0]),
        'features__language':
            types.MaterializedColumn(name='features__language',
                                     value=[b'english']),
        'features__label':
            types.MaterializedColumn(name='features__label', value=[1.0]),
    }
    # Every feature from the example should appear as a prefixed column.
    for key, expected in expected_columns.items():
      self.assertEqual(result[key], expected)
 def check_result(got):
   """Verifies materialized slice keys for both gender slices."""
   try:
     self.assertEqual(2, len(got), 'got: %s' % got)
     slice_f = types.MaterializedColumn(
         name='materialized_slice_keys',
         value=[b'Overall', b'gender:f'])
     slice_m = types.MaterializedColumn(
         name='materialized_slice_keys',
         value=[b'Overall', b'gender:m'])
     expected_results = sorted([slice_f, slice_m])
     got_results = []
     for element in got:
       extracts_dict = element.extracts
       self.assertTrue('materialized_slice_keys' in extracts_dict)
       got_results.append(extracts_dict['materialized_slice_keys'])
     self.assertEqual(sorted(got_results), sorted(expected_results))
   except AssertionError as err:
     # Re-raise so Beam's assert_that reports the failure.
     raise util.BeamAssertException(err)
예제 #12
0
            def check_result(got):
                """Checks materialized columns for slice key, features, label.

                Args:
                  got: Output elements from the pipeline; exactly one expected.
                """
                self.assertEqual(1, len(got), 'got: %s' % got)
                extracts = got[0]

                # Values of type MaterializedColumn are emitted to signal to
                # downstream sink components to output the data to file.
                materialized_dict = dict(
                    (k, v) for k, v in extracts.items()
                    if isinstance(v, types.MaterializedColumn))
                self._assertMaterializedColumns(
                    materialized_dict,
                    {
                        # Slice key
                        'features__slice_key':
                        types.MaterializedColumn(name='features__slice_key',
                                                 value=[b'first_slice']),

                        # Features
                        'features__language':
                        types.MaterializedColumn(name='features__language',
                                                 value=[b'english']),
                        'features__age':
                        types.MaterializedColumn(name='features__age',
                                                 value=np.array(
                                                     [3.], dtype=np.float32)),

                        # Label
                        'features__label':
                        types.MaterializedColumn(name='features__label',
                                                 value=np.array(
                                                     [1.], dtype=np.float32)),
                        'labels':
                        types.MaterializedColumn(name='labels',
                                                 value=np.array(
                                                     [1.], dtype=np.float32)),
                    })
                # Only presence (not exact values) is asserted for the
                # prediction columns and the slice keys column.
                self._assertMaterializedColumnsExist(materialized_dict, [
                    'predictions__logits', 'predictions__probabilities',
                    'predictions__classes', 'predictions__logistic',
                    'predictions__class_ids', constants.SLICE_KEYS_KEY
                ])
    def testMaterializeFeaturesNoMaterializedColumns(self):
        """Checks FPL features/predictions/labels become MaterializedColumns."""
        example1 = self._makeExample(age=3.0,
                                     language='english',
                                     label=1.0,
                                     slice_key='first_slice')

        # Build an FPL with one dense ('f') and one sparse ('s') feature.
        features = {
            'f': {
                encoding.NODE_SUFFIX: np.array([1])
            },
            's': {
                encoding.NODE_SUFFIX:
                tf.SparseTensorValue(indices=[[0, 5], [1, 2], [3, 6]],
                                     values=[100., 200., 300.],
                                     dense_shape=[4, 10])
            }
        }
        predictions = {'p': {encoding.NODE_SUFFIX: np.array([2])}}
        labels = {'l': {encoding.NODE_SUFFIX: np.array([3])}}

        example_and_extracts = types.ExampleAndExtracts(
            example=example1.SerializeToString(),
            extracts={
                'fpl':
                load.FeaturesPredictionsLabels(features, predictions, labels)
            })
        fpl = example_and_extracts.extracts[
            constants.FEATURES_PREDICTIONS_LABELS_KEY]
        result = feature_extractor._MaterializeFeatures(example_and_extracts)
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)); also matches the sibling tests.
        self.assertIsInstance(result, types.ExampleAndExtracts)
        self.assertEqual(result.extracts['fpl'], fpl)  # should still be there.
        self.assertEqual(result.extracts['f'],
                         types.MaterializedColumn(name='f', value=[1]))
        self.assertEqual(result.extracts['p'],
                         types.MaterializedColumn(name='p', value=[2]))
        self.assertEqual(result.extracts['l'],
                         types.MaterializedColumn(name='l', value=[3]))
        # Sparse features materialize just their values, not indices/shape.
        self.assertEqual(
            result.extracts['s'],
            types.MaterializedColumn(name='s', value=[100., 200., 300.]))
예제 #14
0
def _AugmentExtracts(fpl_dict, prefix, excludes, extracts):
    """Adds materialized columns built from `fpl_dict` into `extracts`.

    Args:
      fpl_dict: The dictionary returned by PredictExtractor.
      prefix: Prefix to use in column naming (e.g. 'features', 'labels', etc).
      excludes: List of names of features, predictions, or labels to exclude
        from materialization, or None.
      extracts: The Extracts to be augmented; mutated in-place.

    Raises:
      TypeError: If a value has a type that cannot be materialized.
    """
    skip = excludes if excludes is not None else ()
    for name, wrapped in fpl_dict.items():
        if name in skip:
            continue
        node_value = wrapped.get(encoding.NODE_SUFFIX)

        if name in (prefix, constants.KEY_SEPARATOR + prefix):
            col_name = prefix
        else:
            col_name = util.compound_key([prefix, name])

        if isinstance(node_value, tf.SparseTensorValue):
            # Cap the number of sparse values materialized per column.
            extracts[col_name] = types.MaterializedColumn(
                name=col_name,
                value=node_value.values[0:_MAX_SPARSE_FEATURES_PER_COLUMN])
        elif isinstance(node_value, np.ndarray):
            first_dim = node_value[0]  # only support first dim for now.
            if not np.isscalar(first_dim):
                first_dim = first_dim[0:_MAX_SPARSE_FEATURES_PER_COLUMN]
            extracts[col_name] = types.MaterializedColumn(name=col_name,
                                                          value=first_dim)
        else:
            raise TypeError(
                'Dictionary item with key %s, value %s had unexpected type %s'
                % (name, node_value, type(node_value)))
예제 #15
0
    def process(
            self, element: types.Extracts,
            slice_spec: List[slicer.SingleSliceSpec]) -> List[types.Extracts]:
        """Attaches de-duplicated slice keys to a shallow copy of `element`."""
        # Prefer slicing on transformed features when they are present.
        candidate_dicts = []
        transformed = element.get(constants.TRANSFORMED_FEATURES_KEY)
        if transformed is not None:
            if not self._eval_config or len(
                    self._eval_config.model_specs) == 1:
                # A single model's output is stored without a model-name key.
                candidate_dicts.append(transformed)
            else:
                # Otherwise look up each model's transformed features by name.
                for model_spec in self._eval_config.model_specs:
                    if model_spec.name in transformed:
                        candidate_dicts.append(transformed[model_spec.name])
        # Match slices against transformed features first, then raw features.
        slice_keys = list(
            slicer.get_slices_for_features_dicts(
                candidate_dicts, util.get_features_from_extracts(element),
                slice_spec))

        # The SqlSliceKeyExtractor may already have stored slice keys under
        # SLICE_KEY_TYPES_KEY; merge them in before de-duplicating.
        if (constants.SLICE_KEY_TYPES_KEY in element
                and element[constants.SLICE_KEY_TYPES_KEY]):
            slice_keys.extend(element[constants.SLICE_KEY_TYPES_KEY])

        unique_slice_keys = list(set(slice_keys))
        if len(unique_slice_keys) != len(slice_keys):
            self._duplicate_slice_keys_counter.inc()

        # Shallow-copy so the incoming element is never mutated.
        element_copy = copy.copy(element)
        element_copy[constants.SLICE_KEY_TYPES_KEY] = (
            slicer.slice_keys_to_numpy_array(unique_slice_keys))
        # Optionally attach stringified slice keys for the output table.
        if self._materialize:
            element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
                name=constants.SLICE_KEYS_KEY,
                value=[
                    slicer.stringify_slice_key(k).encode('utf-8')
                    for k in unique_slice_keys
                ])
        return [element_copy]
예제 #16
0
  def process(self, element: types.Extracts) -> List[types.Extracts]:
    """Returns a shallow copy of `element` augmented with slice keys."""
    feature_dict = util.get_features_from_extracts(element)
    slices = list(
        slicer.get_slices_for_features_dict(feature_dict, self._slice_spec))

    # Copy shallowly so the original element is left untouched.
    element_copy = copy.copy(element)
    element_copy[constants.SLICE_KEY_TYPES_KEY] = slices
    # Optionally attach stringified slice keys for the output table.
    if self._materialize:
      element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
          name=constants.SLICE_KEYS_KEY,
          value=[
              slicer.stringify_slice_key(s).encode('utf-8') for s in slices
          ])
    return [element_copy]
예제 #17
0
def _ParseExample(extracts):
    """Feature extraction from serialized tf.Example.

    Parses the serialized tf.Example stored under constants.INPUT_KEY and
    adds a MaterializedColumn to `extracts` for each feature.

    Args:
      extracts: Dict containing the serialized example under
        constants.INPUT_KEY; mutated in-place.
    """
    # Deserialize the example.
    example = tf.train.Example()
    example.ParseFromString(extracts[constants.INPUT_KEY])

    for name in example.features.feature:
        key = util.compound_key(['features', name])
        value = example.features.feature[name]
        if value.HasField('bytes_list'):
            values = list(value.bytes_list.value)
        elif value.HasField('float_list'):
            values = list(value.float_list.value)
        elif value.HasField('int64_list'):
            values = list(value.int64_list.value)
        else:
            # Skip features with no value list set: previously `values` was
            # left unbound here (NameError on the first iteration) or
            # silently reused the previous feature's values.
            continue
        extracts[key] = types.MaterializedColumn(name=key, value=values)
예제 #18
0
            def check_result(got):
                """Checks stringified slice keys and prediction column presence.

                Args:
                  got: Output elements from the pipeline; exactly one expected.
                """
                self.assertEqual(1, len(got), 'got: %s' % got)
                extracts = got[0]

                # Values of type MaterializedColumn are emitted to signal to
                # downstream sink components to output the data to file.
                materialized_dict = dict(
                    (k, v) for k, v in extracts.items()
                    if isinstance(v, types.MaterializedColumn))
                # sort_values=True: slice-key ordering is not guaranteed.
                self._assertMaterializedColumns(materialized_dict, {
                    constants.SLICE_KEYS_KEY:
                    types.MaterializedColumn(
                        name=constants.SLICE_KEYS_KEY,
                        value=[b'age:3.0', b'age_X_language:3.0_X_english'])
                },
                                                sort_values=True)
                # Only presence (not exact values) is asserted for the
                # prediction columns.
                self._assertMaterializedColumnsExist(materialized_dict, [
                    'predictions__logits', 'predictions__probabilities',
                    'predictions__classes', 'predictions__logistic',
                    'predictions__class_ids'
                ])
예제 #19
0
    def process(self, element):
        """Adds slice keys and their materialized form to a copied element."""
        fpl = element.extracts.get(constants.FEATURES_PREDICTIONS_LABELS_KEY)
        if not fpl:
            raise RuntimeError(
                'FPL missing, Please ensure Predict() was called.')
        if not isinstance(fpl, load.FeaturesPredictionsLabels):
            raise TypeError(
                'Expected FPL to be instance of FeaturesPredictionsLabel. FPL was: '
                '%s of type %s' % (str(fpl), type(fpl)))
        slices = list(
            slicer.get_slices_for_features_dict(fpl.features,
                                                self._slice_spec))

        # Copy with a shallow copy of extracts so the original is untouched.
        element_copy = element.create_copy_with_shallow_copy_of_extracts()
        element_copy.extracts[constants.SLICE_KEYS] = slices
        # Attach stringified slice keys to be materialized to the output table.
        element_copy.extracts[
            constants.SLICE_KEYS_MATERIALIZED] = types.MaterializedColumn(
                name=constants.SLICE_KEYS_MATERIALIZED,
                value=[slicer.stringify_slice_key(s) for s in slices])
        return [element_copy]
            def check_result(got):
                """Checks the materialized slice-key column and prediction keys.

                Args:
                  got: Output from the pipeline; expects one (slice, extracts)
                    pair.
                """
                self.assertEqual(1, len(got), 'got: %s' % got)
                _, extracts = got[0]

                # Values of type MaterializedColumn are emitted to signal to
                # downstream sink components to output the data to file.
                # dict.items() works on both Python 2 and 3; iteritems() is
                # Python 2 only and breaks under Python 3.
                materialized_dict = dict(
                    (k, v) for k, v in extracts.items()
                    if isinstance(v, types.MaterializedColumn))
                self._assertMaterializedColumns(
                    materialized_dict, {
                        'tfma_slice_keys':
                        types.MaterializedColumn(
                            name='tfma_slice_keys',
                            value=[
                                'age:3.0', 'age:3',
                                'age_X_language:3.0_X_english'
                            ])
                    })
                # Only presence (not exact values) is asserted for the
                # prediction columns.
                self._assertMaterializedColumnsExist(materialized_dict, [
                    'logits', 'probabilities', 'classes', 'logistic',
                    'class_ids'
                ])