def _AugmentExtracts(fpl_dict, example_and_extracts):
  """Augments the ExampleAndExtracts with FeaturesPredictionsLabels.

  Each node value from fpl_dict is materialized as a types.MaterializedColumn
  stored in the extracts under its feature name.

  Args:
    fpl_dict: The dictionary returned by evaluate._Predict()
    example_and_extracts: The ExampleAndExtracts to be augmented. This is
      mutated in-place.

  Raises:
    TypeError: If the FeaturesPredictionsLabels is corrupt.
  """
  extracts = example_and_extracts.extracts
  for name, node_value in fpl_dict.items():
    node_value = node_value.get(encoding.NODE_SUFFIX)
    if isinstance(node_value, tf.SparseTensorValue):
      # Sparse tensors: materialize only a bounded number of values.
      column = types.MaterializedColumn(
          name=name,
          value=node_value.values[0:_MAX_SPARSE_FEATURES_PER_COLUMN])
    elif isinstance(node_value, np.ndarray):
      head = node_value[0]  # only support first dim for now.
      if not np.isscalar(head):
        head = head[0:_MAX_SPARSE_FEATURES_PER_COLUMN]
      column = types.MaterializedColumn(name=name, value=head)
    else:
      raise TypeError(
          'Dictionary item with key %s, value %s had unexpected type %s' %
          (name, node_value, type(node_value)))
    extracts[name] = column
def _AugmentExtracts(fpl_dict, example_and_extracts):
  """Augments The ExampleAndExtracts with FeaturesPredictionsLabels.

  Args:
    fpl_dict: The dictionary returned by evaluate._Predict()
    example_and_extracts: The ExampleAndExtracts to be augmented -- note that
      this variable modified (ie both an input and output)

  Raises:
    TypeError: if the FeaturesPredictionsLabels is corrupt.
  """
  # Bug fix: dict.iteritems() exists only on Python 2; items() is equivalent
  # and works on both Python 2 and 3.
  for name, val in fpl_dict.items():
    val = val.get(encoding.NODE_SUFFIX)
    if isinstance(val, tf.SparseTensorValue):
      # Cap the number of sparse values materialized per column.
      example_and_extracts.extracts[name] = types.MaterializedColumn(
          name=name,
          value=val.values[0:_MAX_SPARSE_FEATURES_PER_COLUMN])
    elif isinstance(val, np.ndarray):
      val = val[0]  # only support first dim for now.
      if not np.isscalar(val):
        val = val[0:_MAX_SPARSE_FEATURES_PER_COLUMN]
      example_and_extracts.extracts[name] = types.MaterializedColumn(
          name=name, value=val)
    else:
      raise TypeError('Unexpected fpl type: %s' % str(val))
def process(self, element: types.Extracts) -> List[types.Extracts]:
  """Computes slice keys for one extract and returns an augmented copy."""
  # Collect per-model transformed features (if present) so slicing can
  # match on them before falling back to the raw features.
  transformed_dicts = []
  transformed = element.get(constants.TRANSFORMED_FEATURES_KEY)
  if transformed is not None:
    single_model = (not self._eval_config or
                    len(self._eval_config.model_specs) == 1)
    if single_model:
      # Single-model output is stored without keying on model name.
      transformed_dicts.append(transformed)
    else:
      # Search for slices in each model's transformed features output.
      for model_spec in self._eval_config.model_specs:
        if model_spec.name in transformed:
          transformed_dicts.append(transformed[model_spec.name])

  # Slices are matched first against transformed features (if any), then
  # against the raw features.
  raw_features = util.get_features_from_extracts(element)
  slices = list(
      slicer.get_slices_for_features_dicts(transformed_dicts, raw_features,
                                           self._slice_spec))

  # Shallow copy so the input extract is not mutated.
  result = copy.copy(element)
  result[constants.SLICE_KEY_TYPES_KEY] = slices
  if self._materialize:
    # Stringified slice keys for downstream materialization to a table.
    result[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
        name=constants.SLICE_KEYS_KEY,
        value=[slicer.stringify_slice_key(s).encode('utf-8') for s in slices])
  return [result]
def _ParseExample(extracts: types.Extracts,
                  materialize_columns: bool = True) -> None:
  """Feature extraction from serialized tf.Example.

  Parses the serialized example under constants.INPUT_KEY and, for each
  feature, optionally materializes it as a column and adds it to the FPL
  features dict (if an FPL is present and the feature is missing there).

  Args:
    extracts: The extracts dict; mutated in-place.
    materialize_columns: Whether to add MaterializedColumn entries.
  """
  # Deserialize the example.
  example = tf.train.Example()
  try:
    example.ParseFromString(extracts[constants.INPUT_KEY])
  except Exception:  # pylint: disable=broad-except
    # Narrowed from a bare `except:` which would also swallow
    # KeyboardInterrupt/SystemExit.
    logging.warning('Could not parse tf.Example from the input source.')

  features = {}
  if constants.FEATURES_PREDICTIONS_LABELS_KEY in extracts:
    features = extracts[constants.FEATURES_PREDICTIONS_LABELS_KEY].features

  for name in example.features.feature:
    if materialize_columns or name not in features:
      key = util.compound_key(['features', name])
      value = example.features.feature[name]
      if value.HasField('bytes_list'):
        values = list(value.bytes_list.value)
      elif value.HasField('float_list'):
        values = list(value.float_list.value)
      elif value.HasField('int64_list'):
        values = list(value.int64_list.value)
      else:
        # Bug fix: a Feature with no value kind set previously re-used
        # `values` from the prior iteration (or raised NameError on the
        # first iteration). Skip such features instead.
        continue
      if materialize_columns:
        extracts[key] = types.MaterializedColumn(name=key, value=values)
      if name not in features:
        features[name] = {encoding.NODE_SUFFIX: np.array([values])}
def process(self, element: types.Extracts) -> List[types.Extracts]:
  """Attaches slice keys derived from the extract's FPL features."""
  fpl = element.get(constants.FEATURES_PREDICTIONS_LABELS_KEY)
  if not fpl:
    raise RuntimeError('FPL missing, Please ensure Predict() was called.')
  if not isinstance(fpl, types.FeaturesPredictionsLabels):
    raise TypeError(
        'Expected FPL to be instance of FeaturesPredictionsLabel. FPL was: '
        '%s of type %s' % (str(fpl), type(fpl)))

  slices = [
      s for s in slicer.get_slices_for_features_dict(fpl.features,
                                                     self._slice_spec)
  ]

  # Shallow copy so the input extract is not mutated.
  result = copy.copy(element)
  result[constants.SLICE_KEY_TYPES_KEY] = slices
  if self._materialize:
    # Stringified slice keys for downstream materialization to a table.
    result[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
        name=constants.SLICE_KEYS_KEY,
        value=[slicer.stringify_slice_key(s).encode('utf-8') for s in slices])
  return [result]
def testMaterializeFeaturesNoMaterializedColumns(self):
  # Verifies that _MaterializeFeatures turns FPL features, predictions and
  # labels into prefixed top-level MaterializedColumn extracts while keeping
  # the original FPL entry in place.
  example1 = self._makeExample(
      age=3.0, language='english', label=1.0, slice_key='first_slice')
  features = {
      'f': {
          encoding.NODE_SUFFIX: np.array([1])
      },
      's': {
          encoding.NODE_SUFFIX:
              tf.compat.v1.SparseTensorValue(
                  indices=[[0, 5], [1, 2], [3, 6]],
                  values=[100., 200., 300.],
                  dense_shape=[4, 10])
      }
  }
  predictions = {'p': {encoding.NODE_SUFFIX: np.array([2])}}
  labels = {'l': {encoding.NODE_SUFFIX: np.array([3])}}
  extracts = {
      constants.INPUT_KEY:
          example1.SerializeToString(),
      constants.FEATURES_PREDICTIONS_LABELS_KEY:
          types.FeaturesPredictionsLabels(
              input_ref=0,
              features=features,
              predictions=predictions,
              labels=labels)
  }
  fpl = extracts[constants.FEATURES_PREDICTIONS_LABELS_KEY]
  result = feature_extractor._MaterializeFeatures(extracts)
  self.assertIsInstance(result, dict)
  self.assertEqual(result[constants.FEATURES_PREDICTIONS_LABELS_KEY],
                   fpl)  # should still be there.
  self.assertEqual(
      result['features__f'],
      types.MaterializedColumn(name='features__f', value=[1]))
  self.assertEqual(
      result['predictions__p'],
      types.MaterializedColumn(name='predictions__p', value=[2]))
  self.assertEqual(result['labels__l'],
                   types.MaterializedColumn(name='labels__l', value=[3]))
  # Sparse values are flattened into the materialized column.
  self.assertEqual(
      result['features__s'],
      types.MaterializedColumn(name='features__s', value=[100., 200., 300.]))
def _AugmentExtracts(data: Dict[Text, Any], prefix: Text,
                     excludes: List[bytes], extracts: types.Extracts) -> None:
  """Augments the Extracts with FeaturesPredictionsLabels.

  Args:
    data: Data dictionary returned by PredictExtractor.
    prefix: Prefix to use in column naming (e.g. 'features', 'labels', etc).
    excludes: List of strings containing features, predictions, or labels to
      exclude from materialization.
    extracts: The Extracts to be augmented. This is mutated in-place.

  Raises:
    TypeError: If the FeaturesPredictionsLabels is corrupt.
  """
  for name, value in data.items():
    if excludes is not None and name in excludes:
      continue
    # Values that originated from FeaturesPredictionsLabels are wrapped in a
    # dict under a 'node' key; unwrap them first.
    if isinstance(value, dict) and encoding.NODE_SUFFIX in value:
      value = value.get(encoding.NODE_SUFFIX)

    if name in (prefix, util.KEY_SEPARATOR + prefix):
      col_name = prefix
    elif prefix not in ('features', 'predictions', 'labels'):
      # Names used by additional extracts should be properly escaped already
      # so avoid escaping the name a second time by manually combining the
      # prefix.
      col_name = prefix + util.KEY_SEPARATOR + name
    else:
      col_name = util.compound_key([prefix, name])

    if isinstance(value, tf.compat.v1.SparseTensorValue):
      column = types.MaterializedColumn(
          name=col_name,
          value=value.values[0:_MAX_SPARSE_FEATURES_PER_COLUMN])
    elif isinstance(value, np.ndarray):
      head = value[0]  # only support first dim for now.
      if not np.isscalar(head):
        head = head[0:_MAX_SPARSE_FEATURES_PER_COLUMN]
      column = types.MaterializedColumn(name=col_name, value=head)
    else:
      raise TypeError(
          'Dictionary item with key %s, value %s had unexpected type %s' %
          (name, value, type(value)))
    extracts[col_name] = column
def check_result(got):
  """Asserts two extracts came out, each carrying the expected slice keys."""
  try:
    self.assertLen(got, 2)
    expected = sorted([
        types.MaterializedColumn(
            name=constants.SLICE_KEYS_KEY, value=[b'Overall', b'gender:f']),
        types.MaterializedColumn(
            name=constants.SLICE_KEYS_KEY, value=[b'Overall', b'gender:m'])
    ])
    for item in got:
      self.assertIn(constants.SLICE_KEYS_KEY, item)
    actual = [item[constants.SLICE_KEYS_KEY] for item in got]
    # Order of the two extracts is not guaranteed.
    self.assertCountEqual(actual, expected)
  except AssertionError as err:
    raise util.BeamAssertException(err)
def check_result(got):
  """Verifies the slicing output: two extracts with the expected slice keys.

  Args:
    got: List of extracts emitted by the pipeline under test.
  """
  try:
    self.assertEqual(2, len(got), 'got: %s' % got)
    expected_results = sorted([
        types.MaterializedColumn(
            name=constants.SLICE_KEYS_KEY, value=[b'Overall', b'gender:f']),
        types.MaterializedColumn(
            name=constants.SLICE_KEYS_KEY, value=[b'Overall', b'gender:m'])
    ])
    got_results = []
    for item in got:
      # assertIn gives a clearer failure message than assertTrue(x in y).
      self.assertIn(constants.SLICE_KEYS_KEY, item)
      got_results.append(item[constants.SLICE_KEYS_KEY])
    # Sort both sides since the order of the two extracts is not guaranteed.
    self.assertEqual(sorted(got_results), sorted(expected_results))
  except AssertionError as err:
    raise util.BeamAssertException(err)
def testMaterializeFeaturesFromTfExample(self):
  """Materializes raw tf.Example features into MaterializedColumn extracts."""
  example1 = self._makeExample(age=3.0, language='english', label=1.0)
  extracts = {constants.INPUT_KEY: example1.SerializeToString()}
  input_example = extracts[constants.INPUT_KEY]
  result = feature_extractor._MaterializeFeatures(
      extracts, source=constants.INPUT_KEY)
  self.assertIsInstance(result, dict)
  # The serialized input should pass through untouched.
  self.assertEqual(result[constants.INPUT_KEY], input_example)
  expected_columns = {
      'features__age': [3.0],
      'features__language': [b'english'],
      'features__label': [1.0],
  }
  for key, value in expected_columns.items():
    self.assertEqual(result[key],
                     types.MaterializedColumn(name=key, value=value))
def check_result(got):
  """Verifies two ExampleAndExtracts with materialized slice keys.

  Args:
    got: List of ExampleAndExtracts emitted by the pipeline under test.
  """
  try:
    self.assertEqual(2, len(got), 'got: %s' % got)
    expected_results = sorted([
        types.MaterializedColumn(
            name='materialized_slice_keys',
            value=[b'Overall', b'gender:f']),
        types.MaterializedColumn(
            name='materialized_slice_keys',
            value=[b'Overall', b'gender:m'])
    ])
    got_results = []
    for item in got:
      extracts_dict = item.extracts
      # assertIn gives a clearer failure message than assertTrue(x in y).
      self.assertIn('materialized_slice_keys', extracts_dict)
      got_results.append(extracts_dict['materialized_slice_keys'])
    # Sort both sides since the order of the two extracts is not guaranteed.
    self.assertEqual(sorted(got_results), sorted(expected_results))
  except AssertionError as err:
    raise util.BeamAssertException(err)
def check_result(got):
  # Exactly one extract is expected out of the pipeline.
  self.assertEqual(1, len(got), 'got: %s' % got)
  extracts = got[0]
  # Values of type MaterializedColumn are emitted to signal to
  # downstream sink components to output the data to file.
  materialized_dict = dict(
      (k, v) for k, v in extracts.items()
      if isinstance(v, types.MaterializedColumn))
  self._assertMaterializedColumns(
      materialized_dict,
      {
          # Slice key
          'features__slice_key':
              types.MaterializedColumn(
                  name='features__slice_key', value=[b'first_slice']),
          # Features
          'features__language':
              types.MaterializedColumn(
                  name='features__language', value=[b'english']),
          'features__age':
              types.MaterializedColumn(
                  name='features__age',
                  value=np.array([3.], dtype=np.float32)),
          # Label
          'features__label':
              types.MaterializedColumn(
                  name='features__label',
                  value=np.array([1.], dtype=np.float32)),
          'labels':
              types.MaterializedColumn(
                  name='labels', value=np.array([1.], dtype=np.float32)),
      })
  # Prediction columns vary by model head, so only assert their presence.
  self._assertMaterializedColumnsExist(materialized_dict, [
      'predictions__logits', 'predictions__probabilities',
      'predictions__classes', 'predictions__logistic',
      'predictions__class_ids', constants.SLICE_KEYS_KEY
  ])
def testMaterializeFeaturesNoMaterializedColumns(self):
  # Legacy ExampleAndExtracts-based variant: verifies that FPL features,
  # predictions and labels are materialized under their bare names (no
  # 'features__'/'predictions__'/'labels__' prefixes in this version).
  example1 = self._makeExample(
      age=3.0, language='english', label=1.0, slice_key='first_slice')
  features = {
      'f': {
          encoding.NODE_SUFFIX: np.array([1])
      },
      's': {
          encoding.NODE_SUFFIX:
              tf.SparseTensorValue(
                  indices=[[0, 5], [1, 2], [3, 6]],
                  values=[100., 200., 300.],
                  dense_shape=[4, 10])
      }
  }
  predictions = {'p': {encoding.NODE_SUFFIX: np.array([2])}}
  labels = {'l': {encoding.NODE_SUFFIX: np.array([3])}}
  example_and_extracts = types.ExampleAndExtracts(
      example=example1.SerializeToString(),
      extracts={
          'fpl':
              load.FeaturesPredictionsLabels(features, predictions, labels)
      })
  fpl = example_and_extracts.extracts[
      constants.FEATURES_PREDICTIONS_LABELS_KEY]
  result = feature_extractor._MaterializeFeatures(example_and_extracts)
  self.assertTrue(isinstance(result, types.ExampleAndExtracts))
  self.assertEqual(result.extracts['fpl'], fpl)  # should still be there.
  self.assertEqual(result.extracts['f'],
                   types.MaterializedColumn(name='f', value=[1]))
  self.assertEqual(result.extracts['p'],
                   types.MaterializedColumn(name='p', value=[2]))
  self.assertEqual(result.extracts['l'],
                   types.MaterializedColumn(name='l', value=[3]))
  # Sparse values are flattened into the materialized column.
  self.assertEqual(
      result.extracts['s'],
      types.MaterializedColumn(name='s', value=[100., 200., 300.]))
def _AugmentExtracts(fpl_dict, prefix, excludes, extracts):
  """Augments the Extracts with FeaturesPredictionsLabels.

  Args:
    fpl_dict: The dictionary returned by PredictExtractor.
    prefix: Prefix to use in column naming (e.g. 'features', 'labels', etc).
    excludes: List of strings containing features, predictions, or labels to
      exclude from materialization.
    extracts: The Extracts to be augmented. This is mutated in-place.

  Raises:
    TypeError: If the FeaturesPredictionsLabels is corrupt.
  """
  for name, node_dict in fpl_dict.items():
    if excludes is not None and name in excludes:
      continue
    value = node_dict.get(encoding.NODE_SUFFIX)
    # When the name already carries the prefix, don't compound it again.
    if name in (prefix, constants.KEY_SEPARATOR + prefix):
      col_name = prefix
    else:
      col_name = util.compound_key([prefix, name])

    if isinstance(value, tf.SparseTensorValue):
      column = types.MaterializedColumn(
          name=col_name,
          value=value.values[0:_MAX_SPARSE_FEATURES_PER_COLUMN])
    elif isinstance(value, np.ndarray):
      head = value[0]  # only support first dim for now.
      if not np.isscalar(head):
        head = head[0:_MAX_SPARSE_FEATURES_PER_COLUMN]
      column = types.MaterializedColumn(name=col_name, value=head)
    else:
      raise TypeError(
          'Dictionary item with key %s, value %s had unexpected type %s' %
          (name, value, type(value)))
    extracts[col_name] = column
def process(
    self, element: types.Extracts,
    slice_spec: List[slicer.SingleSliceSpec]) -> List[types.Extracts]:
  """Computes deduplicated slice keys and attaches them to a copied extract."""
  # Collect per-model transformed features (if present) so slicing can
  # match on them before falling back to the raw features.
  transformed_dicts = []
  transformed = element.get(constants.TRANSFORMED_FEATURES_KEY)
  if transformed is not None:
    if not self._eval_config or len(self._eval_config.model_specs) == 1:
      # Single-model output is stored without keying on model name.
      transformed_dicts.append(transformed)
    else:
      # Search for slices in each model's transformed features output.
      for model_spec in self._eval_config.model_specs:
        if model_spec.name in transformed:
          transformed_dicts.append(transformed[model_spec.name])

  # Slices are matched first against transformed features (if any), then
  # against the raw features.
  slice_keys = list(
      slicer.get_slices_for_features_dicts(
          transformed_dicts, util.get_features_from_extracts(element),
          slice_spec))

  # Merge in any slice keys already produced upstream (e.g. by the
  # SqlSliceKeyExtractor).
  if (constants.SLICE_KEY_TYPES_KEY in element and
      element[constants.SLICE_KEY_TYPES_KEY]):
    slice_keys.extend(element[constants.SLICE_KEY_TYPES_KEY])

  unique_slice_keys = list(set(slice_keys))
  if len(slice_keys) != len(unique_slice_keys):
    self._duplicate_slice_keys_counter.inc()

  # Shallow copy so the input extract is not mutated.
  element_copy = copy.copy(element)
  element_copy[constants.SLICE_KEY_TYPES_KEY] = (
      slicer.slice_keys_to_numpy_array(unique_slice_keys))
  if self._materialize:
    # Stringified slice keys for downstream materialization to a table.
    element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
        name=constants.SLICE_KEYS_KEY,
        value=[
            slicer.stringify_slice_key(k).encode('utf-8')
            for k in unique_slice_keys
        ])
  return [element_copy]
def process(self, element: types.Extracts) -> List[types.Extracts]:
  """Computes slice keys from the extract's features and attaches them."""
  features = util.get_features_from_extracts(element)
  slices = [
      s for s in slicer.get_slices_for_features_dict(features,
                                                     self._slice_spec)
  ]
  # Shallow copy so the input extract is left untouched.
  result = copy.copy(element)
  result[constants.SLICE_KEY_TYPES_KEY] = slices
  if self._materialize:
    # Stringified slice keys for downstream materialization to a table.
    result[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
        name=constants.SLICE_KEYS_KEY,
        value=[slicer.stringify_slice_key(s).encode('utf-8') for s in slices])
  return [result]
def _ParseExample(extracts):
  """Feature extraction from serialized tf.Example.

  Parses the serialized example under constants.INPUT_KEY and materializes
  each feature as a types.MaterializedColumn in extracts.

  Args:
    extracts: Dict of extracts; mutated in-place.
  """
  # Deserialize the example.
  example = tf.train.Example()
  example.ParseFromString(extracts[constants.INPUT_KEY])

  for name in example.features.feature:
    key = util.compound_key(['features', name])
    value = example.features.feature[name]
    if value.HasField('bytes_list'):
      values = list(value.bytes_list.value)
    elif value.HasField('float_list'):
      values = list(value.float_list.value)
    elif value.HasField('int64_list'):
      values = list(value.int64_list.value)
    else:
      # Bug fix: a Feature with no value kind set previously re-used
      # `values` from the prior iteration (or raised NameError on the
      # first iteration). Skip such features instead.
      continue
    extracts[key] = types.MaterializedColumn(name=key, value=values)
def check_result(got):
  """Expects one extract; verifies slice keys and prediction columns."""
  self.assertEqual(1, len(got), 'got: %s' % got)
  extracts = got[0]
  # Values of type MaterializedColumn signal downstream sink components to
  # output the data to file.
  materialized_dict = {
      k: v
      for k, v in extracts.items()
      if isinstance(v, types.MaterializedColumn)
  }
  self._assertMaterializedColumns(
      materialized_dict, {
          constants.SLICE_KEYS_KEY:
              types.MaterializedColumn(
                  name=constants.SLICE_KEYS_KEY,
                  value=[b'age:3.0', b'age_X_language:3.0_X_english'])
      },
      sort_values=True)
  self._assertMaterializedColumnsExist(materialized_dict, [
      'predictions__logits', 'predictions__probabilities',
      'predictions__classes', 'predictions__logistic',
      'predictions__class_ids'
  ])
def process(self, element):
  """Extracts slice keys from the element's FPL and attaches them."""
  fpl = element.extracts.get(constants.FEATURES_PREDICTIONS_LABELS_KEY)
  if not fpl:
    raise RuntimeError('FPL missing, Please ensure Predict() was called.')
  if not isinstance(fpl, load.FeaturesPredictionsLabels):
    raise TypeError(
        'Expected FPL to be instance of FeaturesPredictionsLabel. FPL was: '
        '%s of type %s' % (str(fpl), type(fpl)))

  slices = list(
      slicer.get_slices_for_features_dict(fpl.features, self._slice_spec))

  # Shallow copy so the original extracts are not mutated.
  element_copy = element.create_copy_with_shallow_copy_of_extracts()
  element_copy.extracts[constants.SLICE_KEYS] = slices
  # Stringified slice keys are materialized for the output table.
  element_copy.extracts[
      constants.SLICE_KEYS_MATERIALIZED] = types.MaterializedColumn(
          name=constants.SLICE_KEYS_MATERIALIZED,
          value=[slicer.stringify_slice_key(s) for s in slices])
  return [element_copy]
def check_result(got):
  """Verifies one extract with the expected materialized slice keys.

  Args:
    got: List of (key, extracts) pairs emitted by the pipeline under test.
  """
  self.assertEqual(1, len(got), 'got: %s' % got)
  _, extracts = got[0]
  # Values of type MaterializedColumn are emitted to signal to
  # downstream sink components to output the data to file.
  # Bug fix: dict.iteritems() exists only on Python 2; items() is
  # equivalent and works on both Python 2 and 3.
  materialized_dict = dict((k, v) for k, v in extracts.items()
                           if isinstance(v, types.MaterializedColumn))
  self._assertMaterializedColumns(
      materialized_dict, {
          'tfma_slice_keys':
              types.MaterializedColumn(
                  name='tfma_slice_keys',
                  value=['age:3.0', 'age:3', 'age_X_language:3.0_X_english'])
      })
  self._assertMaterializedColumnsExist(
      materialized_dict,
      ['logits', 'probabilities', 'classes', 'logistic', 'class_ids'])