Example #1

# Assumed imports for this snippet, mirroring tensorflow_transform's own test
# code (tfx_namedtuple is assumed to come from tfx_bsl, as in TFT itself):
import tensorflow as tf
from tensorflow_transform import analyzer_nodes
from tensorflow_transform import nodes
from tfx_bsl.types import tfx_namedtuple


def _preprocessing_fn_with_chained_ptransforms(inputs):
    class FakeChainable(tfx_namedtuple.namedtuple('FakeChainable', ['label']),
                        nodes.OperationDef):
        def __new__(cls):
            scope = tf.compat.v1.get_default_graph().get_name_scope()
            label = '{}[{}]'.format(cls.__name__, scope)
            return super(FakeChainable, cls).__new__(cls, label=label)

    with tf.compat.v1.name_scope('x'):
        input_values_node = nodes.apply_operation(analyzer_nodes.TensorSource,
                                                  tensors=[inputs['x']])
        with tf.compat.v1.name_scope('ptransform1'):
            intermediate_value_node = nodes.apply_operation(
                FakeChainable, input_values_node)
        with tf.compat.v1.name_scope('ptransform2'):
            output_value_node = nodes.apply_operation(FakeChainable,
                                                      intermediate_value_node)
        x_chained = analyzer_nodes.bind_future_as_tensor(
            output_value_node,
            analyzer_nodes.TensorInfo(tf.float32, (17, 27), None))
        return {'x_chained': x_chained}
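
The labels built in FakeChainable.__new__ come straight from the active TF name scope. A minimal, runnable illustration of that labeling scheme (plain TensorFlow only; no TFT internals involved):

import tensorflow as tf

# Nested v1 name scopes compose with '/', which is where labels such as
# 'FakeChainable[x/ptransform1]' come from in the snippet above.
with tf.Graph().as_default():
    with tf.compat.v1.name_scope('x'):
        with tf.compat.v1.name_scope('ptransform1'):
            scope = tf.compat.v1.get_default_graph().get_name_scope()
            print('FakeChainable[{}]'.format(scope))  # FakeChainable[x/ptransform1]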
Example #2
# Assumes the same imports as Example #1, plus the standard library's
# collections module.
def _preprocessing_fn_for_generalized_chained_ptransforms(inputs):
    class FakeChainablePartitionable(
            collections.namedtuple('FakeChainablePartitionable', ['label']),
            nodes.OperationDef):
        def __new__(cls, label=None):
            if label is None:
                scope = tf.compat.v1.get_default_graph().get_name_scope()
                label = '{}[{}]'.format(cls.__name__, scope)
            return super(FakeChainablePartitionable, cls).__new__(cls,
                                                                  label=label)

        @property
        def num_outputs(self):
            return 1

        @property
        def is_partitionable(self):
            return True

    class FakeChainableCacheable(
            collections.namedtuple('FakeChainableCacheable', ['label']),
            nodes.OperationDef):
        def __new__(cls, label=None):
            if label is None:
                scope = tf.compat.v1.get_default_graph().get_name_scope()
                label = '{}[{}]'.format(cls.__name__, scope)
            return super(FakeChainableCacheable, cls).__new__(cls, label=label)

        @property
        def num_outputs(self):
            return 1

        @property
        def is_partitionable(self):
            return True

        @property
        def cache_coder(self):
            return 'Not-a-coder-but-thats-ok!'

    class FakeChainable(collections.namedtuple('FakeChainable', ['label']),
                        nodes.OperationDef):
        def __new__(cls, label=None):
            if label is None:
                scope = tf.compat.v1.get_default_graph().get_name_scope()
                label = '{}[{}]'.format(cls.__name__, scope)
            return super(FakeChainable, cls).__new__(cls, label=label)

        @property
        def num_outputs(self):
            return 1

        @property
        def is_partitionable(self):
            return False

    with tf.compat.v1.name_scope('x'):
        input_values_node = nodes.apply_operation(analyzer_nodes.TensorSource,
                                                  tensors=[inputs['x']])
        with tf.compat.v1.name_scope('partitionable1'):
            partitionable_outputs = nodes.apply_multi_output_operation(
                FakeChainablePartitionable, input_values_node)
        with tf.compat.v1.name_scope('cacheable1'):
            intermediate_cached_value_node = nodes.apply_multi_output_operation(
                FakeChainableCacheable, *partitionable_outputs)
        with tf.compat.v1.name_scope('partitionable2'):
            partitionable_outputs = nodes.apply_multi_output_operation(
                FakeChainablePartitionable, *intermediate_cached_value_node)
        with tf.compat.v1.name_scope('cacheable2'):
            cached_value_node = nodes.apply_multi_output_operation(
                FakeChainableCacheable, *partitionable_outputs)
        with tf.compat.v1.name_scope('partitionable3'):
            output_value_node = nodes.apply_multi_output_operation(
                FakeChainablePartitionable, *cached_value_node)
        with tf.compat.v1.name_scope('merge'):
            output_value_node = nodes.apply_operation(FakeChainable,
                                                      *output_value_node)
        with tf.compat.v1.name_scope('not-cacheable'):
            non_cached_output = nodes.apply_operation(FakeChainable,
                                                      input_values_node)
        x_chained = analyzer_nodes.bind_future_as_tensor(
            output_value_node,
            analyzer_nodes.TensorInfo(tf.float32, (17, 27), False))
        x_plain = analyzer_nodes.bind_future_as_tensor(
            non_cached_output,
            analyzer_nodes.TensorInfo(tf.int64, (7, 13), False))
        return {'x_chained': x_chained, 'x_plain': x_plain}
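
The cache_coder above returns a placeholder string because this test never round-trips cache. In a real cacheable analyzer the coder has to serialize accumulators to bytes and back; a minimal sketch, assuming the encode_cache/decode_cache interface of TFT's cache coders (PickleCacheCoder is a hypothetical name):

import pickle


class PickleCacheCoder:
    """Hypothetical sketch of a cache coder: round-trips an analyzer
    accumulator through pickle."""

    def encode_cache(self, cache):
        # Serialize the accumulator into bytes for the cache store.
        return pickle.dumps(cache)

    def decode_cache(self, encoded_cache):
        # Restore the accumulator from its serialized bytes.
        return pickle.loads(encoded_cache)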
Example #3
def output_tensor_infos(self) -> List[analyzer_nodes.TensorInfo]:
    return [analyzer_nodes.TensorInfo(tf.string, [None, 2], None)]
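
For context, a sketch of the kind of class this fragment is cut from; everything except output_tensor_infos and analyzer_nodes.TensorInfo is an illustrative assumption:

from typing import List

import tensorflow as tf
from tensorflow_transform import analyzer_nodes


class _StringPairsOutputSketch:
    """Illustrative host class: declares one tf.string output of shape
    [None, 2] (unknown batch dimension, width 2), with no temporary
    asset value in the third TensorInfo field."""

    def output_tensor_infos(self) -> List[analyzer_nodes.TensorInfo]:
        return [analyzer_nodes.TensorInfo(tf.string, [None, 2], None)]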
Example #4
def ptransform_analyzer(inputs: Collection[tf.Tensor],
                        ptransform: Union[_BeamPTransform,
                                          CacheablePTransformAnalyzer],
                        output_dtypes: Collection[tf.dtypes.DType],
                        output_shapes: Collection[List[int]],
                        output_asset_default_values: Optional[Collection[
                            Optional[bytes]]] = None,
                        name: Optional[str] = None):
    # pylint: disable=line-too-long
    """Applies a user-provided PTransform over the whole dataset.

  WARNING: This is experimental.

  Note that in order to have asset files copied correctly, any outputs that
  represent asset filenames must be added to the `tf.GraphKeys.ASSET_FILEPATHS`
  collection by the caller if using Transform's APIs in compat v1 mode.

  Example:

  >>> class MeanPerKey(beam.PTransform):
  ...   def expand(self, pcoll: beam.PCollection[Tuple[np.ndarray, np.ndarray]]) -> Tuple[beam.PCollection[np.ndarray], beam.PCollection[np.ndarray]]:
  ...     def extract_output(key_value_pairs):
  ...       keys, values = zip(*key_value_pairs)
  ...       return [beam.TaggedOutput('keys', keys),
  ...               beam.TaggedOutput('values', values)]
  ...     return tuple(
  ...         pcoll
  ...         | 'ZipAndFlatten' >> beam.FlatMap(lambda batches: list(zip(*batches)))
  ...         | 'MeanPerKey' >> beam.CombinePerKey(beam.combiners.MeanCombineFn())
  ...         | 'ToList' >> beam.combiners.ToList()
  ...         | 'Extract' >> beam.FlatMap(extract_output).with_outputs(
  ...             'keys', 'values'))
  >>> def preprocessing_fn(inputs):
  ...   outputs = tft.experimental.ptransform_analyzer(
  ...       inputs=[inputs['s'], inputs['x']],
  ...       ptransform=MeanPerKey(),
  ...       output_dtypes=[tf.string, tf.float32],
  ...       output_shapes=[[2], [2]])
  ...   (keys, means) = outputs
  ...   mean_a = tf.reshape(tf.gather(means, tf.where(keys == 'a')), [])
  ...   return { 'x/mean_a': inputs['x'] / mean_a }
  >>> raw_data = [dict(x=1, s='a'), dict(x=8, s='b'), dict(x=3, s='a')]
  >>> feature_spec = dict(
  ...     x=tf.io.FixedLenFeature([], tf.float32),
  ...     s=tf.io.FixedLenFeature([], tf.string))
  >>> raw_data_metadata = tft.DatasetMetadata.from_feature_spec(feature_spec)
  >>> with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
  ...   transformed_dataset, transform_fn = (
  ...       (raw_data, raw_data_metadata)
  ...       | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
  >>> transformed_data, transformed_metadata = transformed_dataset
  >>> transformed_data
  [{'x/mean_a': 0.5}, {'x/mean_a': 4.0}, {'x/mean_a': 1.5}]

  Args:
    inputs: An ordered collection of input `Tensor`s.
    ptransform: A Beam PTransform that accepts a Beam PCollection where each
      element is a tuple of `ndarray`s.  Each element in the tuple contains a
      batch of values for the corresponding input tensor of the analyzer and
      maintains its shape and dtype.
      It returns a `PCollection`, or a tuple of `PCollections`, each containing
      a single element which is an `ndarray` or a list of primitive types. The
      contents of these output `PCollection`s must be consistent with the given
      values of `output_dtypes` and `output_shapes`.
      It may inherit from `tft_beam.experimental.PTransformAnalyzer` if access
      to a temp base directory is needed.
      Alternatively, it could be an instance of
      `tft.experimental.CacheablePTransformAnalyzer` in order to enable caching
      for this analyzer when analyzer cache is enabled for this pipeline.
    output_dtypes: An ordered collection of TensorFlow dtypes of the output of
      the analyzer.
    output_shapes: An ordered collection of shapes of the output of the
      analyzer. Must have the same length as output_dtypes.
    output_asset_default_values: (Optional) An ordered collection of optional
      `bytes` aligned with output_dtypes/output_shapes. Every item in this
      collection which is not `None` indicates that the output is a TF asset
      path, and its value would be used as the default value of this asset file
      prior to analysis.
    name: (Optional) Similar to a TF op name.  Used to define a unique scope for
      this analyzer, which can be used for debugging info.

  Returns:
    A list of output `Tensor`s.  These will have `dtype` and `shape` as
      specified by `output_dtypes` and `output_shapes`.

  Raises:
    ValueError: If output_dtypes and output_shapes have different lengths.
  """
    # pylint: enable=line-too-long
    if len(output_dtypes) != len(output_shapes):
        raise ValueError(
            'output_dtypes ({}) and output_shapes ({}) had different'
            ' lengths'.format(output_dtypes, output_shapes))
    if output_asset_default_values is not None:
        if len(output_asset_default_values) != len(output_dtypes):
            raise ValueError(
                'output_dtypes ({}) and output_asset_default_values ({}) had '
                'different lengths'.format(output_dtypes,
                                           output_asset_default_values))
        output_asset_default_values = [
            analyzer_nodes.TemporaryAssetInfo(value, 'text')
            for value in output_asset_default_values
        ]
    else:
        output_asset_default_values = [None] * len(output_dtypes)
    with tf.compat.v1.name_scope(name, 'ptransform'):
        output_tensor_infos = [
            analyzer_nodes.TensorInfo(dtype, shape, default_asset_content)
            for dtype, shape, default_asset_content in zip(
                output_dtypes, output_shapes, output_asset_default_values)
        ]
        return _apply_analyzer(ptransform,
                               *inputs,
                               output_tensor_info_list=output_tensor_infos)
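
A smaller end-to-end usage sketch than the doctest above, following the same contract (each input element is a tuple of ndarray batches; the output PCollection holds a single element matching output_dtypes/output_shapes). _SumColumn and the feature name 'x' are illustrative:

import apache_beam as beam
import numpy as np
import tensorflow as tf
import tensorflow_transform as tft


class _SumColumn(beam.PTransform):
    """Illustrative PTransform: sums one input column over the dataset."""

    def expand(self, pcoll):
        return (
            pcoll
            # Each element is a tuple with one ndarray batch per input tensor;
            # there is a single input here, so take batches[0].
            | 'ExtractBatch' >> beam.Map(lambda batches: batches[0])
            # Reduce to a single scalar; the output PCollection must hold one
            # element consistent with output_dtypes/output_shapes below.
            | 'SumGlobally' >> beam.CombineGlobally(
                lambda arrays: np.float32(sum(np.sum(a) for a in arrays))))


def preprocessing_fn(inputs):
    total = tft.experimental.ptransform_analyzer(
        inputs=[inputs['x']],
        ptransform=_SumColumn(),
        output_dtypes=[tf.float32],
        output_shapes=[[]])[0]
    return {'x_fraction_of_total': inputs['x'] / total}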