Example No. 1
def encode():
    """
    Creates a Beam pipeline that generates data, transforms it and encodes it in ELWC
    """
    output_path = "./output"
    options = PipelineOptions()
    options.view_as(StandardOptions).runner = "DirectRunner"

    with beam.Pipeline(options=options) as pipeline:
        with tft_beam.Context(temp_dir="./tmp"):
            raw_data = generate_data(100)
            input_data = (pipeline | beam.Create(raw_data))

            transformed_data, transform_fn = (
                (input_data, raw_metadata)
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

            elwc_coder = ELWCProtoCoder(context_specs, examples_specs)
            data, metadata = transformed_data

            _ = (data | beam.Map(elwc_coder.encode) | beam.io.WriteToTFRecord(
                file_path_prefix="{}/data".format(output_path),
                file_name_suffix=".tfrecords"))

            _ = (transform_fn | tft_beam.WriteTransformFn(output_path))
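
Not part of the original snippet: a minimal sanity check, assuming the ./output path above and a completed DirectRunner run, that reads the ELWC-encoded TFRecords back and counts them (each record is one serialized ExampleListWithContext proto).

import glob
import tensorflow as tf

# The file pattern follows WriteToTFRecord's default sharding plus the
# ".tfrecords" suffix configured above, e.g. data-00000-of-00001.tfrecords.
files = glob.glob("./output/data-*.tfrecords")
num_records = sum(1 for _ in tf.data.TFRecordDataset(files))
print("Wrote {} ELWC records to {}".format(num_records, files))
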
Example No. 2
  def testPreprocessingFn(self):
    schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    feature_spec = taxi_utils._get_raw_feature_spec(schema)
    working_dir = self.get_temp_dir()
    transform_graph_path = os.path.join(working_dir, 'transform_graph')
    transformed_examples_path = os.path.join(
        working_dir, 'transformed_examples')

    # Run very simplified version of executor logic.
    # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
    # Generate legacy `DatasetMetadata` object.  Future version of Transform
    # will accept the `Schema` proto directly.
    legacy_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))
    tfxio = tf_example_record.TFExampleRecord(
        file_pattern=os.path.join(self._testdata_path,
                                  'csv_example_gen/Split-train/*'),
        telemetry_descriptors=['Tests'],
        schema=legacy_metadata.schema)
    with beam.Pipeline() as p:
      with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
        examples = p | 'ReadTrainData' >> tfxio.BeamSource()
        (transformed_examples, transformed_metadata), transform_fn = (
            (examples, tfxio.TensorAdapterConfig())
            | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                taxi_utils.preprocessing_fn))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        # pylint: disable=expression-not-assigned
        (transform_fn
         |
         'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_graph_path))

        encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        (transformed_examples
         | 'EncodeTrainData' >> beam.Map(encoder.encode)
         | 'WriteTrainData' >> beam.io.WriteToTFRecord(
             os.path.join(transformed_examples_path,
                          'Split-train/transformed_examples.gz'),
             coder=beam.coders.BytesCoder()))
        # pylint: enable=expression-not-assigned

    # Verify the output matches golden output.
    # NOTE: we don't verify that transformed examples match golden output.
    expected_transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(
            self._testdata_path,
            'transform/transform_graph/transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(transform_graph_path, 'transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    # Clear annotations so we only have to test main schema.
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(transformed_schema, expected_transformed_schema)
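
As the comment in the pipeline above notes, WriteTransformFn lays out a SavedModel and the transformed metadata under fixed subdirectories of transform_graph_path; the schema comparison at the end of the test relies on that layout. A rough sketch of inspecting the same directory with tft.TFTransformOutput (the path is hypothetical):

import tensorflow_transform as tft

# Hypothetical path; in the test above this is `transform_graph_path`.
transform_graph_path = "/tmp/working_dir/transform_graph"

# Layout written by WriteTransformFn:
#   <transform_graph_path>/transform_fn/saved_model.pb        (the SavedModel)
#   <transform_graph_path>/transformed_metadata/schema.pbtxt  (the schema the test reads)
tft_output = tft.TFTransformOutput(transform_graph_path)
print(tft_output.transformed_feature_spec())
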
Example No. 3
    def test_train(self):
        """Tests case where training data is passed."""

        with self.pipeline as p:
            with tft_beam.Context(temp_dir=os.path.join(self.test_dir, 'tmp')):
                df = self.pre_tft_df[self.pre_tft_df.split == 'TRAIN']
                dataset = self._get_dataset(p, df)
                preprocessing_fn = functools.partial(
                    beam_pipeline._preprocessing_fn,
                    schema_map=self.schema.pre_tft_schema_map)
                transform_fn = (beam_pipeline._transform_and_write_tfr(
                    dataset,
                    self.tfr_writer,
                    preprocessing_fn=preprocessing_fn,
                    metadata=self.pre_tft_metadata,
                    label='Train'))
                _ = transform_fn | tft_beam.WriteTransformFn(self.test_dir)

        self.assertTrue(
            os.path.isdir(os.path.join(self.test_dir, 'transform_fn')))
        self.assertTrue(
            os.path.isdir(os.path.join(self.test_dir, 'transformed_metadata')))
        self.assertTrue(glob.glob(os.path.join(self.test_dir, 'train*.gz')))
        self.assertFalse(
            glob.glob(os.path.join(self.test_dir, 'validation*.gz')))
        self.assertFalse(glob.glob(os.path.join(self.test_dir, 'test*.gz')))
Example No. 4
def _main(argv=None):
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_examples_path', required=True)
    parser.add_argument('--raw_examples_schema_path', required=True)
    parser.add_argument('--preprocessing_module_path', required=True)
    parser.add_argument('--transform_fn_dir', required=True)
    known_args, pipeline_args = parser.parse_known_args(argv)

    raw_examples_schema = load_schema(known_args.raw_examples_schema_path)
    raw_examples_coder = tft.coders.ExampleProtoCoder(raw_examples_schema)
    raw_examples_metadata = dataset_metadata.DatasetMetadata(
        raw_examples_schema)

    tft_preprocessing = load_module_from_file_path(
        'tft_preprocessing', known_args.preprocessing_module_path)
    preprocessing_fn = tft_preprocessing.preprocessing_fn

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=get_beam_temp_dir(pipeline_options)):
            raw_examples = pipeline | 'ReadRawExamples' >> beam.io.ReadFromTFRecord(
                known_args.raw_examples_path, coder=raw_examples_coder)
            raw_examples_dataset = (raw_examples, raw_examples_metadata)
            transform_fn = raw_examples_dataset | tft_beam.AnalyzeDataset(
                preprocessing_fn)
            transform_fn | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
                known_args.transform_fn_dir)
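
For illustration only, with hypothetical paths: the entry point above can be invoked directly, and any arguments not consumed by parse_known_args (for example a runner choice) are forwarded to Beam through PipelineOptions.

_main([
    '--raw_examples_path=/tmp/raw_examples.tfrecord',
    '--raw_examples_schema_path=/tmp/raw_schema.pbtxt',
    '--preprocessing_module_path=/tmp/preprocessing.py',
    '--transform_fn_dir=/tmp/transform_fn_output',
    # Remaining args are passed through to PipelineOptions.
    '--runner=DirectRunner',
])
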
Example No. 5
def transform_tft(train_data, test_data, working_dir):
    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'
    with beam.Pipeline(options=options) as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            data_shape = train_data[0][0].shape
            raw_data = (
                pipeline | 'ReadTrainData' >> beam.Create(train_data)
                | 'CreateTrainData' >> beam.Map(lambda data: format(data)))
            raw_data_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec({
                    IMAGE_KEY:
                    tf.FixedLenFeature(list(data_shape), tf.float32),
                    LABEL_KEY:
                    tf.FixedLenFeature([], tf.int64)
                }))
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (
                transformed_data
                | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
                    file_name_suffix='.tfrecords'))

            raw_test_data = (
                pipeline | 'ReadTestData' >> beam.Create(test_data)
                | 'CreateTestData' >> beam.Map(lambda data: format(data)))
            raw_test_dataset = (raw_test_data, raw_data_metadata)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (transformed_test_data
                 | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTestData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
                     file_name_suffix='.tfrecords'))

            _ = (transform_fn |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
Example No. 6
def transformed_data(working_dir):
    """Process the data and generate the transform_fn."""
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        xi, yi = inputs["x"], inputs["y"]
        x_integerized = tft.compute_and_apply_vocabulary(xi, default_value=0, name="vocab")  # , top_k=VOCAB_SIZE)
        y_integerized = tft.compute_and_apply_vocabulary(yi, default_value=0, name="label")  # ,top_k=LABEL_SIZE
        return {"x": x_integerized, "y": y_integerized}

    # path_transform
    with tft_beam.Context(temp_dir=path_transform):
        transformed_dataset, transform_fn = ((xys, DATA_STRING_FEATURE_SPEC) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_train_data, transformed_metadata = transformed_dataset
        _ = (transform_fn | tft_beam.WriteTransformFn(working_dir))
    return transformed_train_data
Example No. 7
def transform_train_and_eval(pipeline, train_data, eval_data, data_source,
                             transform_dir, output_dir, schema):
  """Analyzes and transforms data.

  Args:
    pipeline: Beam Pipeline instance.
    train_data: Training CSV data.
    eval_data: Evaluation CSV data.
    data_source: Input data source - path to CSV file or BigQuery table. Expects
      either `csv` or `bigquery`.
    transform_dir: Directory to write transformed output. If this directory
      exists beam loads `transform_fn` instead of computing it again.
    output_dir: Directory to write transformed output.
    schema: A text-serialized TensorFlow metadata schema for the input data.
  """
  train_raw_data = (
      pipeline | 'ReadTrainData' >> ReadData(train_data, data_source, schema,
                                             tf.estimator.ModeKeys.TRAIN))
  eval_raw_data = (
      pipeline | 'ReadEvalData' >> ReadData(eval_data, data_source, schema,
                                            tf.estimator.ModeKeys.EVAL))
  schema = utils.make_dataset_schema(schema, mode=tf.estimator.ModeKeys.TRAIN)
  input_metadata = dataset_metadata.DatasetMetadata(schema)
  logger.info('Creating new transform model.')
  transform_fn = ((train_raw_data, input_metadata)
                  | ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

  (transform_fn
   | ('WriteTransformFn' >> tft_beam.WriteTransformFn(transform_dir)))

  (train_raw_data
   | 'TransformAndWriteTraining' >> transform_and_write(
       input_metadata, output_dir, transform_fn, _TRAIN_PREFIX))
  (eval_raw_data
   | 'TransformAndWriteEval' >> transform_and_write(input_metadata, output_dir,
                                                    transform_fn, _EVAL_PREFIX))
Example No. 8
  def test_single_phase_mixed_analyzer_run_once(self):
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):

      integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

      _ = tft.bucketize(inputs['x'], 2, name='bucketize')

      return {
          'integerized_s':
              integerized_s,
          'x_min':
              tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'x_mean':
              tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'y_min':
              tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          'y_mean':
              tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
      }

    # Run AnalyzeAndTransform on some input data and compare with expected
    # output.
    input_data = [{'x': 12, 'y': 1, 's': 'd'}, {'x': 10, 'y': 1, 's': 'c'}]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'b',
        }, {
            'x': 4,
            'y': -4,
            's': 'b',
        }],
        span_1_key: input_data,
    }

    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))
      cache_dict = {
          span_0_key: {
              b'__v0__CacheableCombineAccumulate[x_1/mean_and_var]-.\xc4t>ZBv\xea\xa5SU\xf4\x065\xc6\x1c\x81W\xf9\x1b':
                  p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0, 0.0]']),
              b'__v0__CacheableCombineAccumulate[x/x]-\x95\xc5w\x88\x85\x8b5V\xc9\x00\xe0\x0f\x03\x1a\xdaL\x9d\xd5\xb3\xe3':
                  p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
              b'__v0__CacheableCombineAccumulate[y_1/mean_and_var]-E^\xb7VZ\xeew4rm\xab\xa3\xa4k|J\x80ck\x16':
                  p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25, 0.0]']),
              b'__v0__CacheableCombineAccumulate[y/y]-\xdf\x1ey\x03\x1c\x96\xd5'
              b' e\x9bJ\xa1\xd2\xfc\x9c\x03\x0fM \xdb':
                  p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = ((
          (input_data_dict[span_1_key], input_metadata), transform_fn)
                             | 'Transform' >> beam_impl.TransformDataset())

      dot_string = nodes.get_dot_graph([analysis_graph_builder._ANALYSIS_GRAPH
                                       ]).to_string()
      self.WriteRenderedDotFile(dot_string)

      # The output cache should not have entries for the cache that is present
      # in the input cache.
      self.assertEqual(
          len(cache_output[span_0_key]),
          len(cache_output[span_1_key]) - 4)

      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed = [
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 1,
          },
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 2,
          },
      ]
      beam_test_util.assert_that(transformed_data,
                                 beam_test_util.equal_to(expected_transformed))

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
      _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

    # 4 from analyzing 2 spans, and 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 6)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 4)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)
Example No. 9
    def test_preprocessing_fn(self):
        schema_file = os.path.join(self._testdata_path,
                                   'schema_gen/schema.pbtxt')
        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
        feature_spec = taxi_utils._get_raw_feature_spec(schema)
        working_dir = self.get_temp_dir()
        transform_output_path = os.path.join(working_dir, 'transform_output')
        transformed_examples_path = os.path.join(working_dir,
                                                 'transformed_examples')

        # Run very simplified version of executor logic.
        # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
        # Generate legacy `DatasetMetadata` object.  Future version of Transform
        # will accept the `Schema` proto directly.
        legacy_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(feature_spec))
        decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
        with beam.Pipeline() as p:
            with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
                examples = (
                    p
                    | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
                        os.path.join(self._testdata_path,
                                     'csv_example_gen/train/*'),
                        coder=beam.coders.BytesCoder(),
                        # TODO(b/114938612): Eventually remove this override.
                        validate=False)
                    | 'DecodeTrainData' >> beam.Map(decoder.decode))
                (transformed_examples, transformed_metadata), transform_fn = (
                    (examples, legacy_metadata)
                    | 'AnalyzeAndTransform' >>
                    tft_beam.AnalyzeAndTransformDataset(
                        taxi_utils.preprocessing_fn))

                # WriteTransformFn writes transform_fn and metadata to subdirectories
                # tensorflow_transform.SAVED_MODEL_DIR and
                # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
                # pylint: disable=expression-not-assigned
                (transform_fn
                 | 'WriteTransformFn' >>
                 tft_beam.WriteTransformFn(transform_output_path))

                encoder = tft.coders.ExampleProtoCoder(
                    transformed_metadata.schema)
                (transformed_examples
                 | 'EncodeTrainData' >> beam.Map(encoder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(transformed_examples_path,
                                  'train/transformed_examples.gz'),
                     coder=beam.coders.BytesCoder()))
                # pylint: enable=expression-not-assigned

        # Verify the output matches golden output.
        # NOTE: we don't verify that transformed examples match golden output.
        expected_transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(
                self._testdata_path,
                'transform/transform_output/transformed_metadata/schema.pbtxt'
            ), schema_pb2.Schema())
        transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(transform_output_path,
                         'transformed_metadata/schema.pbtxt'),
            schema_pb2.Schema())
        # Clear annotations so we only have to test main schema.
        for feature in transformed_schema.feature:
            feature.ClearField('annotation')
        self.assertEqual(transformed_schema, expected_transformed_schema)
Example No. 10
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with tft_beam.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder(schema)
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1))
                decode_transform = beam.Map(csv_coder.decode)
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (pipeline
                            | 'ReadBigQuery' >> beam.io.Read(
                                beam.io.BigQuerySource(query=query,
                                                       use_standard_sql=True)))
                decode_transform = beam.Map(taxi.clean_raw_data_dict,
                                            raw_feature_spec=raw_feature_spec)

            if transform_dir is None:
                decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
                transform_fn = (
                    (decoded_data, raw_data_metadata) |
                    ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

                _ = (transform_fn
                     | ('WriteTransformFn' >>
                        tft_beam.WriteTransformFn(working_dir)))
            else:
                transform_fn = pipeline | tft_beam.ReadTransformFn(
                    transform_dir)

            # Shuffling the data before materialization will improve Training
            # effectiveness downstream. Here we shuffle the raw_data (as opposed to
            # decoded data) since it has a compact representation.
            shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle(
            )

            decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
            (transformed_data, transformed_metadata) = (
                ((decoded_data, raw_data_metadata), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (transformed_data
                 | 'SerializeExamples' >> beam.Map(coder.encode)
                 | 'WriteExamples' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, outfile_prefix),
                     file_name_suffix='.gz'))
Example No. 11
    def test_single_phase_run_twice(self):

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            _ = tft.vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.io.FixedLenFeature([], tf.float32),
                'y':
                tf.io.FixedLenFeature([], tf.float32),
                's':
                tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'a',
            }, {
                'x': 4,
                'y': -4,
                's': 'a',
            }],
            span_1_key:
            input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # wrap each value in input_data_dict as a pcoll.
                input_data_pcoll_dict = {}
                for a, b in six.iteritems(input_data_dict):
                    input_data_pcoll_dict[a] = p | a >> beam.Create(b)

                transform_fn_1, cache_output = (
                    (flat_data, input_data_pcoll_dict, {}, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = (cache_output | 'WriteCache' >>
                     analyzer_cache.WriteAnalysisCacheToFS(self._cache_dir))

                transformed_dataset = (
                    ((input_data_pcoll_dict[span_1_key], input_metadata),
                     transform_fn_1)
                    | 'Transform' >> beam_impl.TransformDataset())

                del input_data_pcoll_dict
                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed_data = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='first')

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn_1')
                _ = transform_fn_1 | tft_beam.WriteTransformFn(
                    transform_fn_dir)

                for key in input_data_dict:
                    self.assertIn(key, cache_output)
                    self.assertEqual(6, len(cache_output[key]))

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # wrap each value in input_data_dict as a pcoll.
                input_data_pcoll_dict = {}
                for a, b in six.iteritems(input_data_dict):
                    input_data_pcoll_dict[a] = p | a >> beam.Create(b)

                input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
                    self._cache_dir, list(input_data_dict.keys()))

                transform_fn_2, second_output_cache = (
                    (flat_data, input_data_pcoll_dict, input_cache,
                     input_metadata)
                    | 'AnalyzeAgain' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn_2)
                    | 'TransformAgain' >> beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset
        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='second')

        self.assertFalse(second_output_cache)
Example No. 12
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(
                temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
            tfxio_train_data = tfxio.TFExampleRecord(file_pattern=os.path.join(
                working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'),
                                                     schema=SCHEMA)
            train_data = (pipeline |
                          'TFXIORead[Train]' >> tfxio_train_data.BeamSource())

            tfxio_test_data = tfxio.TFExampleRecord(file_pattern=os.path.join(
                working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'),
                                                    schema=SCHEMA)
            test_data = (pipeline
                         | 'TFXIORead[Test]' >> tfxio_test_data.BeamSource())

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                # Here tf.compat.v1.string_split behaves differently from
                # tf.strings.split.
                review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
                review_indices = tft.compute_and_apply_vocabulary(
                    review_tokens, top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by compute_and_apply_vocabulary.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            # Transformed metadata is not necessary for encoding.
            # The TFXIO output format is chosen for improved performance.
            (transformed_train_data, _), transform_fn = (
                (train_data, tfxio_train_data.TensorAdapterConfig())
                | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                    preprocessing_fn, output_record_batches=True))

            transformed_test_data, _ = (
                ((test_data, tfxio_test_data.TensorAdapterConfig()),
                 transform_fn)
                | 'Transform' >>
                tft_beam.TransformDataset(output_record_batches=True))

            # Extract transformed RecordBatches, encode and write them to the given
            # directory.
            coder = tfxio.RecordBatchToExamplesEncoder()
            _ = (transformed_train_data
                 | 'EncodeTrainData' >>
                 beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >>
                beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by tft.TRANSFORM_FN_DIR and
            # tft.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
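
A minimal follow-up sketch, assuming the same working_dir and filebase constants as above and WriteToTFRecord's default shard naming: the transformed records can be parsed back using the feature spec recorded by WriteTransformFn.

import os
import tensorflow as tf
import tensorflow_transform as tft

tft_output = tft.TFTransformOutput(working_dir)
feature_spec = tft_output.transformed_feature_spec()

# Default sharding produces files like <filebase>-00000-of-0000N.
train_files = tf.io.gfile.glob(
    os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE + '-*'))
dataset = tf.data.TFRecordDataset(train_files).map(
    lambda record: tf.io.parse_single_example(record, feature_spec))
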
Example No. 13
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with apache_beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = tft.coders.CsvCoder(ordered_columns,
                                            RAW_DATA_METADATA.schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing spaces after commas.
            #
            # We use MapAndFilterErrors instead of Map to filter out decode errors in
            # converter.decode, which should only occur for the trailing blank line.
            raw_data = (
                pipeline
                |
                'ReadTrainData' >> apache_beam.io.ReadFromText(train_data_file)
                | 'FixCommasTrainData' >>
                apache_beam.Map(lambda line: line.replace(', ', ','))
                | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            # A coder between TF Examples and tf.Transform datasets.
            # Used to encode a tf.transform encoded dict as tf.Example.
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (transformed_data
                 | 'EncodeTrainData' >> apache_beam.Map(
                     transformed_data_coder.encode)
                 | 'WriteTrainData' >> apache_beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            # Now apply transform function to test data.  In this case we remove the
            # trailing period at the end of each line, and also ignore the header line
            # that is present in the test data file.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> apache_beam.io.ReadFromText(
                    test_data_file, skip_header_lines=1)
                | 'FixCommasTestData' >>
                apache_beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                apache_beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> apache_beam.Map(
                    transformed_data_coder.encode)
                | 'WriteTestData' >> apache_beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
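
As the final comment says, the artifacts written to working_dir are meant to be consumed through the tft.TFTransformOutput class; a brief sketch (the path is hypothetical):

import tensorflow_transform as tft

tft_output = tft.TFTransformOutput('/tmp/census_working_dir')  # the working_dir above

# Keras layer that applies the learned transform to raw census features,
# e.g. inside a serving signature or an input pipeline.
transform_layer = tft_output.transform_features_layer()

# Feature spec for parsing the transformed TFRecords written above.
feature_spec = tft_output.transformed_feature_spec()
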
Example No. 14
def build_pipeline(df: pd.DataFrame, job_label: str, runner: str, project: str,
                   region: str, output_dir: str, compression: str,
                   num_shards: int, dataflow_options: dict,
                   integer_label: bool) -> beam.Pipeline:
    """Runs TFRecorder Beam Pipeline.

  Args:
    df: Pandas DataFrame
    job_label: User description for the beam job.
    runner: Beam Runner: (e.g. DataflowRunner, DirectRunner).
    project: GCP project ID (if DataflowRunner)
    region: GCP compute region (if DataflowRunner)
    output_dir: GCS or Local Path for output.
    compression: gzip or None.
    num_shards: Number of shards.
    dataflow_options: Dataflow Runner Options (optional)
    integer_label: Flags if label is already an integer.

  Returns:
    beam.Pipeline

  Note: These inputs must be validated upstream (by client.create_tfrecord())
  """

    job_name = _get_job_name(job_label)
    job_dir = _get_job_dir(output_dir, job_name)
    options = _get_pipeline_options(runner, job_name, job_dir, project, region,
                                    dataflow_options)

    #with beam.Pipeline(runner, options=options) as p:
    p = beam.Pipeline(options=options)
    with tft_beam.Context(temp_dir=os.path.join(job_dir, 'tft_tmp')):

        converter = tft.coders.CsvCoder(constants.IMAGE_CSV_COLUMNS,
                                        constants.IMAGE_CSV_METADATA.schema)

        extract_images_fn = beam_image.ExtractImagesDoFn(
            constants.IMAGE_URI_KEY)
        flatten_rows = ToCSVRows()

        # Each element in the image_csv_data PCollection will be a dict
        # including the image_csv_columns and the image features created from
        # extract_images_fn.
        image_csv_data = (
            p
            | 'ReadFromDataFrame' >> beam.Create(df.values.tolist())
            | 'ToCSVRows' >> beam.ParDo(flatten_rows)
            | 'DecodeCSV' >> beam.Map(converter.decode)
            | 'ReadImage' >> beam.ParDo(extract_images_fn))

        # Split dataset into train and validation.
        train_data, val_data, test_data, discard_data = (
            image_csv_data | 'SplitDataset' >> beam.Partition(
                _partition_fn, len(constants.SPLIT_VALUES)))

        train_dataset = (train_data, constants.RAW_METADATA)
        val_dataset = (val_data, constants.RAW_METADATA)
        test_dataset = (test_data, constants.RAW_METADATA)

        # TensorFlow Transform applied to all datasets.
        preprocessing_fn = functools.partial(_preprocessing_fn,
                                             integer_label=integer_label)
        transformed_train_dataset, transform_fn = (
            train_dataset
            | 'AnalyzeAndTransformTrain' >>
            tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

        transformed_train_data, transformed_metadata = transformed_train_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        transformed_val_data, _ = (
            (val_dataset, transform_fn)
            | 'TransformVal' >> tft_beam.TransformDataset())

        transformed_test_data, _ = (
            (test_dataset, transform_fn)
            | 'TransformTest' >> tft_beam.TransformDataset())

        # Sinks for TFRecords and metadata.
        tfr_writer = functools.partial(_get_write_to_tfrecord,
                                       output_dir=job_dir,
                                       compress=compression,
                                       num_shards=num_shards)

        _ = (transformed_train_data
             | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteTrainData' >> tfr_writer(prefix='train'))

        _ = (transformed_val_data
             | 'EncodeValData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteValData' >> tfr_writer(prefix='val'))

        _ = (transformed_test_data
             | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteTestData' >> tfr_writer(prefix='test'))

        _ = (discard_data
             | 'DiscardDataWriter' >> beam.io.WriteToText(
                 os.path.join(job_dir, 'discarded-data')))

        # Output transform function and metadata
        _ = (transform_fn
             | 'WriteTransformFn' >> tft_beam.WriteTransformFn(job_dir))

        # Output metadata schema
        _ = (transformed_metadata
             | 'WriteMetadata' >> tft_beam.WriteMetadata(job_dir, pipeline=p))

    return p
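
Note that build_pipeline only constructs the pipeline (the `with beam.Pipeline(...)` block is commented out), so running it is left to the caller; a sketch with hypothetical arguments:

pipeline = build_pipeline(
    df=images_df,  # hypothetical DataFrame with the image CSV columns
    job_label='tfrecorder-job',
    runner='DirectRunner',
    project='my-gcp-project',
    region='us-central1',
    output_dir='/tmp/tfrecorder-output',
    compression='gzip',
    num_shards=1,
    dataflow_options={},
    integer_label=False)

result = pipeline.run()
result.wait_until_finish()
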
Example No. 15
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.
    Read in the data using the parquet io, and transform it using a
    preprocessing pipeline that scales numeric data and converts categorical data
    from strings to int64 indices by creating a vocabulary for each
    category.
    Args:
      train_data_file: File containing training data
      test_data_file: File containing test data
      working_dir: Directory to write transformed data and metadata to
    """

    numerical_feats = [
        "startCountTotal", "purchaseCountTotal", "globalStartCountTotal",
        "globalPurchaseCountTotal"
    ]

    categorical_feats = ["country", "sourceGameId", "platform"]

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        for key in numerical_feats:
            outputs[key] = tf.cast(tft.bucketize(inputs[key], 20),
                                   tf.float32) / 20.0 - 0.5

        outputs["campaignCost_mod"] = inputs["campaignCost"] / 100.0

        inputs["game_zone"] = tf.string_join(
            [inputs["sourceGameId"], inputs["zone"]], separator="_")
        inputs["game_campaignId"] = tf.string_join(
            [inputs["sourceGameId"], inputs["campaignId"]], separator="_")

        for key in categorical_feats + ["game_zone", "game_campaignId"]:
            vocab = tft.vocabulary(inputs[key],
                                   vocab_filename=key,
                                   frequency_threshold=100)
            outputs[key] = tft.apply_vocabulary(inputs[key],
                                                vocab,
                                                default_value=0)

        outputs["label"] = inputs["label"]
        outputs["key"] = inputs["key"]

        return outputs

    # Input schema definition
    RAW_DATA_METADATA = gather_raw_metadata(
        numerical_feats + ["campaignCost"],
        categorical_feats + ["zone", "campaignId", "key"])

    # pipeline args to read from gcs, currently unused because reading local file
    pipeline_args = [
        '--runner=DirectRunner',
        '--project=unity-ads-ds-prd',
        #     '--staging_location=gs://unity-ads-ds-prd-users/villew/promo/staging',
        #     '--temp_location=gs://unity-ads-ds-prd-users/villew/promo/temp',
        '--job_name=transform-promo-data-to-tf-records'
    ]
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # create a beam pipeline
    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            raw_data = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromParquet(train_data_file))

            # Combine data and schema into a dataset tuple.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            # write to tf record
            _ = (transformed_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, "train_tfrecord")))

            # Now apply transform function to test data.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> beam.io.ReadFromParquet(test_data_file))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())

            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (transformed_test_data
                 | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTestData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, "test_tfrecord")))

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
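
Because each vocabulary above is written with an explicit vocab_filename, it can be looked up again from the transform output; a short sketch assuming the same working_dir and a local filesystem:

import tensorflow_transform as tft

tft_output = tft.TFTransformOutput(working_dir)
country_vocab_path = tft_output.vocabulary_file_by_name('country')
with open(country_vocab_path) as f:
    print(f.readline().strip())  # most frequent country in the training data
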
Example No. 16
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def transform_ngrams(input, ngram_range):
    """ helper function to transform ngrams and print output. """
    # this print statement causes output to concat itself!
    # input = tf.Print(input, [input], "raw input:", first_n=-1, summarize=100)

    transformed = transform.ngrams(
      tf.string_split(input, delimiter=" "),
      ngram_range=ngram_range,
      separator=' ')

    # SparseTensor basically cannot be printed because it's made up of 3
    # tensors. We can use this trick to print the values column, but without the index
    # it's not too meaningful.
    #
    # values = tf.Print(transformed.values, [transformed.values], "ngram output:")
    # transformed = tf.SparseTensor(
    #       indices=transformed.indices,
    #       values=values,
    #       dense_shape=transformed.dense_shape)
    return transformed

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.
    https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      print('processing key', key)
      print('input:', inputs[key])
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    # for key in taxi.FEATURE_NGRAM:
    #   # Extract ngrams and build a vocab.
    #   outputs[
    #       taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
    #           transform.ngrams(
    #             tf.string_split(_fill_in_missing(inputs[key])),
    #             ngram_range=taxi.NGRAM_RANGE,
    #             separator=' '),
    #           top_k=512,
    #           num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.FEATURE_NGRAM:
      # Extract ngrams and build a vocab.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            transform_ngrams(_fill_in_missing(inputs[key]), taxi.NGRAM_RANGE),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with tft_beam.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema, input_handle.lower())
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1))
        decode_transform = beam.Map(csv_coder.decode)
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

      if transform_dir is None:
        decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
        transform_fn = (
            (decoded_data, raw_data_metadata) |
            ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >>
               tft_beam.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

      # Shuffling the data before materialization will improve Training
      # effectiveness downstream. Here we shuffle the raw_data (as opposed to
      # decoded data) since it has a compact representation.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
      (transformed_data, transformed_metadata) = (
          ((decoded_data, raw_data_metadata), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix), file_name_suffix='.gz')
      )
Example No. 17
def compute_gci_vocab(input_handle,
                      working_dir,
                      schema_file,
                      transform_dir=None,
                      max_rows=None,
                      pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.
    Args:
      input_handle: BigQuery table name to process specified as DATASET.TABLE or
        path to csv file with input data.
      working_dir: Directory in which transformed examples and transform function
        will be emitted.
      schema_file: A file path that contains a text-serialized TensorFlow
        metadata schema of the input data.
      transform_dir: Directory in which the transform output is located. If
        provided, this will load the transform_fn from disk instead of computing
        it over the data. Hint: this is useful for transforming eval data.
      max_rows: Number of rows to query from BigQuery
      pipeline_args: additional DataflowRunner or DirectRunner args passed to the
        beam pipeline.
    """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.
        Args:
          inputs: map from feature keys to raw not-yet-transformed features.
        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        DENSE_FLOAT_FEATURE_KEYS = []
        VOCAB_FEATURE_KEYS = []
        _CSV_COLUMNS_NAMES, _CSV_COLUMN_DEFAULTS, _CSV_COLUMN_types, _UNUSED = setcolumn_list_original(
        )
        for i in range(len(_CSV_COLUMNS_NAMES)):
            if _CSV_COLUMN_types[i] is tf.string:
                VOCAB_FEATURE_KEYS.append(_CSV_COLUMNS_NAMES[i])

        outputs['gci'] = tf.expand_dims(_fill_in_missing(inputs['gci']), 1)
        for key in VOCAB_FEATURE_KEYS:
            if key in _UNUSED:
                continue
            if 'gci' in key:
                appendlist = tf.expand_dims(_fill_in_missing(inputs[key]), 1)
                outputs['gci'] = tf.concat([appendlist, outputs['gci']], 0)
        transform.vocabulary(outputs['gci'], vocab_filename='gci')
        transform.vocabulary(inputs['LAT_LON_10'], vocab_filename='label')
        return outputs

    schema = read_schema(schema_file)
    raw_feature_spec = get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with tft_beam.Context(temp_dir=working_dir):
            csv_coder = make_csv_coder(schema)
            raw_data = (pipeline
                        | 'ReadFromText' >> beam.io.ReadFromText(
                            input_handle, skip_header_lines=1))
            decode_transform = beam.Map(csv_coder.decode)

            decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
            transform_fn = (
                (decoded_data, raw_data_metadata) |
                ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

            _ = (
                transform_fn
                |
                ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
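
# A usage sketch (an assumption about how this pipeline's output is consumed):
# once the transform function has been written to `working_dir`, the emitted
# vocabularies can be looked up by the names given above, e.g.
#
#   tft_output = tft.TFTransformOutput(working_dir)
#   gci_vocab_path = tft_output.vocabulary_file_by_name('gci')
#   label_vocab_path = tft_output.vocabulary_file_by_name('label')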
Exemplo n.º 18
0
  def test_single_phase_run_twice(self):

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):

      _ = tft.vocabulary(inputs['s'], vocab_filename='vocab1')

      _ = tft.bucketize(inputs['x'], 2, name='bucketize')

      return {
          'x_min':
              tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'x_mean':
              tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'y_min':
              tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          'y_mean':
              tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          's_integerized':
              tft.compute_and_apply_vocabulary(
                  inputs['s'],
                  labels=inputs['label'],
                  use_adjusted_mutual_info=True),
      }

    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'a',
            'label': 0,
        }, {
            'x': 4,
            'y': -4,
            's': 'a',
            'label': 1,
        }, {
            'x': 5,
            'y': 11,
            's': 'a',
            'label': 1,
        }, {
            'x': 1,
            'y': -4,
            's': u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'),
            'label': 1,
        }],
        span_1_key: [{
            'x': 12,
            'y': 1,
            's': u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'),
            'label': 0
        }, {
            'x': 10,
            'y': 1,
            's': 'c',
            'label': 1
        }],
    }
    expected_vocabulary_contents = np.array(
        [b'a', u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'), b'c'],
        dtype=object)
    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # wrap each value in input_data_dict as a pcoll.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      transform_fn_1, cache_output = (
          (flat_data, input_data_pcoll_dict, {}, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = (
          cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
              self._cache_dir))

      transformed_dataset = ((
          (input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_1)
                             | 'Transform' >> beam_impl.TransformDataset())

      del input_data_pcoll_dict
      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed_data = [
          {
              'x_mean': 5.0,
              'x_min': -2.0,
              'y_mean': 1.0,
              'y_min': -4.0,
              's_integerized': 0,
          },
          {
              'x_mean': 5.0,
              'x_min': -2.0,
              'y_mean': 1.0,
              'y_min': -4.0,
              's_integerized': 2,
          },
      ]
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
      _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

      for key in input_data_dict:
        self.assertIn(key, cache_output)
        self.assertEqual(7, len(cache_output[key]))

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    # 4 from analyzing 2 spans, and 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # wrap each value in input_data_dict as a pcoll.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          self._cache_dir, list(input_data_dict.keys()))

      transform_fn_2, second_output_cache = (
          (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
          | 'AnalyzeAgain' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      dot_string = nodes.get_dot_graph([analysis_graph_builder._ANALYSIS_GRAPH
                                       ]).to_string()
      self.WriteRenderedDotFile(dot_string)

      transformed_dataset = ((
          (input_data_dict[span_1_key], input_metadata), transform_fn_2)
                             | 'TransformAgain' >> beam_impl.TransformDataset())
      transformed_data, unused_transformed_metadata = transformed_dataset
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='second')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_2')
      _ = transform_fn_2 | tft_beam.WriteTransformFn(transform_fn_dir)

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    self.assertFalse(second_output_cache)

    # Only 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 2)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 0)

    # The root CreateSavedModel is optimized away because the data doesn't get
    # processed at all (only cache).
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 1)
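
  # `_get_counter_value` is not shown in this excerpt; it is assumed to be a
  # small helper that sums a named Beam counter from the pipeline result's
  # metrics, roughly:
  #
  #   def _get_counter_value(metrics, name):
  #     filter_ = beam.metrics.metric.MetricsFilter().with_name(name)
  #     counters = metrics.query(filter_)['counters']
  #     return sum(counter.committed for counter in counters)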
Exemplo n.º 19
0
  def test_non_frequency_vocabulary_merge(self):
    """This test compares vocabularies produced with and without cache."""

    mi_vocab_name = 'mutual_information_vocab'
    adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
    weighted_frequency_vocab_name = 'weighted_frequency_vocab'

    def preprocessing_fn(inputs):
      _ = tft.vocabulary(
          inputs['s'],
          labels=inputs['label'],
          store_frequency=True,
          vocab_filename=mi_vocab_name,
          min_diff_from_avg=0.1,
          use_adjusted_mutual_info=False)

      _ = tft.vocabulary(
          inputs['s'],
          labels=inputs['label'],
          store_frequency=True,
          vocab_filename=adjusted_mi_vocab_name,
          min_diff_from_avg=1.0,
          use_adjusted_mutual_info=True)

      _ = tft.vocabulary(
          inputs['s'],
          weights=inputs['weight'],
          store_frequency=True,
          vocab_filename=weighted_frequency_vocab_name,
          use_adjusted_mutual_info=False)
      return inputs

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    input_data = [
        dict(s='a', weight=1, label=1),
        dict(s='a', weight=0.5, label=1),
        dict(s='b', weight=0.75, label=1),
        dict(s='b', weight=1, label=0),
    ]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            's': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
            'weight': tf.io.FixedLenFeature([], tf.float32),
        }))
    input_data_dict = {
        span_0_key: input_data,
        span_1_key: input_data,
    }

    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # wrap each value in input_data_dict as a pcoll.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      transform_fn_with_cache, output_cache = (
          (flat_data, input_data_pcoll_dict, {}, input_metadata) |
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                                 'transform_fn_with_cache')
      _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
          transform_fn_with_cache_dir)

      expected_accumulators = {
          b'__v0__VocabularyAccumulate[vocabulary]-\xd3\xe0p\x82\xb1\xa0z\xa3S\xd7N8@\x8f\xa2\xd7\xa1\x9e\xac;':
              [
                  b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
                  b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]'
              ],
          b'__v0__VocabularyAccumulate[vocabulary_1]-A\xc7_0\xee\xff\x88@E<\xde\xcb\x8d\xff5\xebyZZ\x8d':
              [
                  b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
                  b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]'
              ],
          b"__v0__VocabularyAccumulate[vocabulary_2]-\x97\x1c>\x851\x94'\xdc\xdf\xfd\xcc\x86\xb7\xb8\xe1\xe8*\x89B\t":
              [b'["a", 1.5]', b'["b", 1.75]'],
      }
      spans = [span_0_key, span_1_key]
      self.assertCountEqual(output_cache.keys(), spans)
      for span in spans:
        self.assertCountEqual(output_cache[span].keys(),
                              expected_accumulators.keys())
        for idx, (key,
                  value) in enumerate(six.iteritems(expected_accumulators)):
          beam_test_util.assert_that(
              output_cache[span][key],
              beam_test_util.equal_to(value),
              label='AssertCache[{}][{}]'.format(span, idx))

    # 4 from analysis on each of the input spans.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 6)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(input_data * 2)

      transform_fn_no_cache = ((flat_data, input_metadata) |
                               (beam_impl.AnalyzeDataset(preprocessing_fn)))

      transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                               'transform_fn_no_cache')
      _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
          transform_fn_no_cache_dir)

    # 4 from analysis on each of the input spans.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

    tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
    tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

    for vocab_filename in (mi_vocab_name, adjusted_mi_vocab_name,
                           weighted_frequency_vocab_name):
      cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename)
      no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
          vocab_filename)
      with tf.io.gfile.GFile(cache_path, 'rb') as f1, tf.io.gfile.GFile(
          no_cache_path, 'rb') as f2:
        self.assertEqual(
            f1.readlines(), f2.readlines(),
            'vocab with cache != vocab without cache for: {}'.format(
                vocab_filename))
    'add': tf.FixedLenFeature([], tf.bool),
    'line_length': tf.FixedLenFeature([], tf.int32)}))

input_data_metadata = dataset_metadata.DatasetMetadata(input_data_schema)

with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        columns = text_fields + ["commented", "add", "line_length"]
        converter = tft.coders.CsvCoder(columns, input_data_schema)
        input_data = (
            pipeline
            | 'ReadInputData' >> beam.io.ReadFromText(train_data_file)
            | 'CleanInputData' >> MapAndFilterErrors(converter.decode))
        input_dataset = (input_data, input_data_metadata)
        transformed_dataset, transform_fn = (
            input_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_data, transformed_metadata = transformed_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

        # Write the resulting data out
        _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))
        # We'll use the transform function later too
        _ = (
            transform_fn
            | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
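
# A sketch of the "later" use mentioned above (assuming the same `working_dir`):
# the written transform graph and metadata can be reloaded for training or
# serving via tft.TFTransformOutput, e.g.
#
#   tft_output = tft.TFTransformOutput(working_dir)
#   feature_spec = tft_output.transformed_feature_spec()
#   transform_layer = tft_output.transform_features_layer()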
Exemplo n.º 21
0
  def _RunBeamImpl(self, inputs: Mapping[Text, Any],
                   outputs: Mapping[Text, Any], preprocessing_fn: Any,
                   input_dataset_metadata: dataset_metadata.DatasetMetadata,
                   raw_examples_data_format: Text, transform_output_path: Text,
                   compute_statistics: bool,
                   materialize_output_paths: Sequence[Text]) -> _Status:
    """Perform data preprocessing with FlumeC++ runner.

    Args:
      inputs: A dictionary of labelled input values.
      outputs: A dictionary of labelled output values.
      preprocessing_fn: The tf.Transform preprocessing_fn.
      input_dataset_metadata: A DatasetMetadata object for the input data.
      raw_examples_data_format: A string describing the raw data format.
      transform_output_path: An absolute path to write the output to.
      compute_statistics: A bool indicating whether or not to compute statistics.
      materialize_output_paths: Paths to materialized outputs.

    Raises:
      RuntimeError: If reset() is not being invoked between two run().
      ValueError: If the schema is empty.

    Returns:
      Status of the execution.
    """
    raw_examples_file_format = common.GetSoleValue(
        inputs, labels.EXAMPLES_FILE_FORMAT_LABEL, strict=False)
    analyze_and_transform_data_paths = common.GetValues(
        inputs, labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL)
    transform_only_data_paths = common.GetValues(
        inputs, labels.TRANSFORM_ONLY_DATA_PATHS_LABEL)
    stats_use_tfdv = common.GetSoleValue(inputs,
                                         labels.TFT_STATISTICS_USE_TFDV_LABEL)
    per_set_stats_output_paths = common.GetValues(
        outputs, labels.PER_SET_STATS_OUTPUT_PATHS_LABEL)
    temp_path = common.GetSoleValue(outputs, labels.TEMP_OUTPUT_LABEL)

    input_cache_dir = common.GetSoleValue(
        inputs, labels.CACHE_INPUT_PATH_LABEL, strict=False)
    output_cache_dir = common.GetSoleValue(
        outputs, labels.CACHE_OUTPUT_PATH_LABEL, strict=False)

    tf.logging.info('Analyze and transform data patterns: %s',
                    list(enumerate(analyze_and_transform_data_paths)))
    tf.logging.info('Transform data patterns: %s',
                    list(enumerate(transform_only_data_paths)))
    tf.logging.info('Transform materialization output paths: %s',
                    list(enumerate(materialize_output_paths)))
    tf.logging.info('Transform output path: %s', transform_output_path)

    feature_spec = schema_utils.schema_as_feature_spec(
        _GetSchemaProto(input_dataset_metadata)).feature_spec
    try:
      analyze_input_columns = tft.get_analyze_input_columns(
          preprocessing_fn, feature_spec)
      transform_input_columns = (
          tft.get_transform_input_columns(preprocessing_fn, feature_spec))
    except AttributeError:
      # If using TFT 1.12, fall back to assuming all features are used.
      analyze_input_columns = feature_spec.keys()
      transform_input_columns = feature_spec.keys()
    # Use the same dataset (same columns) for AnalyzeDataset and computing
    # pre-transform stats so that the data will only be read once for these
    # two operations.
    if compute_statistics:
      analyze_input_columns = list(
          set(list(analyze_input_columns) + list(transform_input_columns)))
    if input_dataset_metadata.schema is _RAW_EXAMPLE_SCHEMA:
      analyze_input_dataset_metadata = input_dataset_metadata
      transform_input_dataset_metadata = input_dataset_metadata
    else:
      analyze_input_dataset_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(
              {feature: feature_spec[feature]
               for feature in analyze_input_columns}))
      transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(
              {feature: feature_spec[feature]
               for feature in transform_input_columns}))

    can_process_jointly = not bool(per_set_stats_output_paths or
                                   materialize_output_paths or output_cache_dir)
    analyze_data_list = self._MakeDatasetList(
        analyze_and_transform_data_paths, raw_examples_file_format,
        raw_examples_data_format, analyze_input_dataset_metadata,
        can_process_jointly)
    transform_data_list = self._MakeDatasetList(
        list(analyze_and_transform_data_paths) +
        list(transform_only_data_paths), raw_examples_file_format,
        raw_examples_data_format, transform_input_dataset_metadata,
        can_process_jointly)

    desired_batch_size = self._GetDesiredBatchSize(raw_examples_data_format)

    with self._CreatePipeline(outputs) as p:
      with tft_beam.Context(
          temp_dir=temp_path,
          desired_batch_size=desired_batch_size,
          passthrough_keys={_TRANSFORM_INTERNAL_FEATURE_FOR_KEY},
          use_deep_copy_optimization=True):
        # pylint: disable=expression-not-assigned
        # pylint: disable=no-value-for-parameter

        _ = (
            p | self._IncrementColumnUsageCounter(
                len(feature_spec.keys()), len(analyze_input_columns),
                len(transform_input_columns)))

        (new_analyze_data_dict, input_cache, flat_data_required) = (
            p | self._OptimizeRun(input_cache_dir, output_cache_dir,
                                  analyze_data_list, feature_spec,
                                  preprocessing_fn, self._GetCacheSource()))
        # Removing unneeded datasets if they won't be needed for
        # materialization. This means that these datasets won't be included in
        # the statistics computation or profiling either.
        if not materialize_output_paths:
          analyze_data_list = [
              d for d in new_analyze_data_dict.values() if d is not None
          ]

        analyze_decode_fn = (
            self._GetDecodeFunction(raw_examples_data_format,
                                    analyze_input_dataset_metadata.schema))

        for (idx, dataset) in enumerate(analyze_data_list):
          dataset.encoded = (
              p | 'ReadAnalysisDataset[{}]'.format(idx) >>
              self._ReadExamples(dataset))
          dataset.decoded = (
              dataset.encoded
              | 'DecodeAnalysisDataset[{}]'.format(idx) >>
              self._DecodeInputs(analyze_decode_fn))

        input_analysis_data = {}
        for key, dataset in six.iteritems(new_analyze_data_dict):
          if dataset is None:
            input_analysis_data[key] = None
          else:
            input_analysis_data[key] = dataset.decoded

        if flat_data_required:
          flat_input_analysis_data = (
              [dataset.decoded for dataset in analyze_data_list]
              | 'FlattenAnalysisDatasets' >> beam.Flatten(pipeline=p))
        else:
          flat_input_analysis_data = None
        if input_cache:
          tf.logging.info('Analyzing data with cache.')
        transform_fn, cache_output = (
            (flat_input_analysis_data, input_analysis_data, input_cache,
             input_dataset_metadata)
            | 'AnalyzeDataset' >> tft_beam.AnalyzeDatasetWithCache(
                preprocessing_fn, pipeline=p))

        # Write the raw/input metadata.
        (input_dataset_metadata
         | 'WriteMetadata' >> tft_beam.WriteMetadata(
             os.path.join(transform_output_path,
                          tft.TFTransformOutput.RAW_METADATA_DIR), p))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        (transform_fn |
         'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_output_path))

        if output_cache_dir is not None and cache_output is not None:
          # TODO(b/37788560): Possibly make this part of the beam graph.
          tf.io.gfile.makedirs(output_cache_dir)
          tf.logging.info('Using existing cache in: %s', input_cache_dir)
          if input_cache_dir is not None:
            # Only copy cache that is relevant to this iteration. This is
            # assuming that this pipeline operates on rolling ranges, so those
            # cache entries may also be relevant for future iterations.
            for span_cache_dir in input_analysis_data:
              full_span_cache_dir = os.path.join(input_cache_dir,
                                                 span_cache_dir)
              if tf.io.gfile.isdir(full_span_cache_dir):
                self._CopyCache(full_span_cache_dir,
                                os.path.join(output_cache_dir, span_cache_dir))

          (cache_output
           | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
               p, output_cache_dir, sink=self._GetCacheSink()))

        if compute_statistics or materialize_output_paths:
          # Do not compute pre-transform stats if the input format is raw proto,
          # as StatsGen would treat any input as tf.Example.
          if (compute_statistics and
              not self._IsDataFormatProto(raw_examples_data_format)):
            # Aggregated feature stats before transformation.
            pre_transform_feature_stats_path = os.path.join(
                transform_output_path,
                tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH)

            schema_proto = _GetSchemaProto(analyze_input_dataset_metadata)
            ([
                dataset.decoded if stats_use_tfdv else dataset.encoded
                for dataset in analyze_data_list
            ]
             | 'FlattenPreTransformAnalysisDatasets' >> beam.Flatten(pipeline=p)
             | 'GenerateAggregatePreTransformAnalysisStats' >>
             self._GenerateStats(
                 pre_transform_feature_stats_path,
                 schema_proto,
                 use_deep_copy_optimization=True,
                 use_tfdv=stats_use_tfdv))

          transform_decode_fn = (
              self._GetDecodeFunction(raw_examples_data_format,
                                      transform_input_dataset_metadata.schema))
          # transform_data_list is a superset of analyze_data_list, we pay the
          # cost to read the same dataset (analyze_data_list) again here to
          # prevent certain Beam runners from doing large temp materialization.
          for (idx, dataset) in enumerate(transform_data_list):
            dataset.encoded = (
                p
                | 'ReadTransformDataset[{}]'.format(idx) >>
                self._ReadExamples(dataset))
            dataset.decoded = (
                dataset.encoded
                | 'DecodeTransformDataset[{}]'.format(idx) >>
                self._DecodeInputs(transform_decode_fn))
            (dataset.transformed,
             metadata) = (((dataset.decoded, transform_input_dataset_metadata),
                           transform_fn)
                          | 'TransformDataset[{}]'.format(idx) >>
                          tft_beam.TransformDataset())

            if materialize_output_paths or not stats_use_tfdv:
              dataset.transformed_and_encoded = (
                  dataset.transformed
                  | 'EncodeTransformedDataset[{}]'.format(idx) >> beam.ParDo(
                      self._EncodeAsExamples(), metadata))

          if compute_statistics:
            # Aggregated feature stats after transformation.
            _, metadata = transform_fn
            post_transform_feature_stats_path = os.path.join(
                transform_output_path,
                tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH)

            # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
            # schema. Currently input dataset schema only contains dtypes,
            # and other metadata is dropped due to roundtrip to tensors.
            transformed_schema_proto = _GetSchemaProto(metadata)

            ([(dataset.transformed
               if stats_use_tfdv else dataset.transformed_and_encoded)
              for dataset in transform_data_list]
             | 'FlattenPostTransformAnalysisDatasets' >> beam.Flatten()
             | 'GenerateAggregatePostTransformAnalysisStats' >>
             self._GenerateStats(
                 post_transform_feature_stats_path,
                 transformed_schema_proto,
                 use_tfdv=stats_use_tfdv))

            if per_set_stats_output_paths:
              assert len(transform_data_list) == len(per_set_stats_output_paths)
              # TODO(b/67632871): Remove duplicate stats gen compute that is
              # done both on a flattened view of the data, and on each span
              # below.
              bundles = zip(transform_data_list, per_set_stats_output_paths)
              for (idx, (dataset, output_path)) in enumerate(bundles):
                if stats_use_tfdv:
                  data = dataset.transformed
                else:
                  data = dataset.transformed_and_encoded
                (data
                 | 'GeneratePostTransformStats[{}]'.format(idx) >>
                 self._GenerateStats(
                     output_path,
                     transformed_schema_proto,
                     use_tfdv=stats_use_tfdv))

          if materialize_output_paths:
            assert len(transform_data_list) == len(materialize_output_paths)
            bundles = zip(transform_data_list, materialize_output_paths)
            for (idx, (dataset, output_path)) in enumerate(bundles):
              (dataset.transformed_and_encoded
               | 'Materialize[{}]'.format(idx) >> self._WriteExamples(
                   raw_examples_file_format, output_path))

    return _Status.OK()
Exemplo n.º 22
0
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Since we are modifying some features and leaving others unchanged, we
        # start by setting `outputs` to a copy of `inputs`.
        outputs = inputs.copy()

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
            # This is a SparseTensor because it is optional. Here we fill in a default
            # value when it is missing.
            sparse = tf.sparse.SparseTensor(inputs[key].indices,
                                            inputs[key].values,
                                            [inputs[key].dense_shape[0], 1])
            dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
            # Reshaping from a batch of vectors of size 1 to a batch of scalars.
            dense = tf.squeeze(dense, axis=1)
            outputs[key] = tft.scale_to_0_1(dense)

        # For all categorical columns except the label column, we generate a
        # vocabulary but do not modify the feature.  This vocabulary is instead
        # used in the trainer, by means of a feature column, to convert the feature
        # from a string to an integer id.
        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tft.compute_and_apply_vocabulary(tf.strings.strip(
                inputs[key]),
                                                            num_oov_buckets=1,
                                                            vocab_filename=key)

        # For the label column we provide the mapping from string to index.
        table_keys = ['>50K', '<=50K']
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=table_keys,
            values=tf.cast(tf.range(len(table_keys)), tf.int64),
            key_dtype=tf.string,
            value_dtype=tf.int64)
        table = tf.lookup.StaticHashTable(initializer, default_value=-1)
        # Remove trailing periods for test data when the data is read with tf.data.
        label_str = tf.strings.regex_replace(inputs[LABEL_KEY], r'\.', '')
        label_str = tf.strings.strip(label_str)
        data_labels = table.lookup(label_str)
        transformed_label = tf.one_hot(indices=data_labels,
                                       depth=len(table_keys),
                                       on_value=1.0,
                                       off_value=0.0)
        outputs[LABEL_KEY] = tf.reshape(transformed_label,
                                        [-1, len(table_keys)])
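        # For example, '>50K' looks up index 0 and becomes [1.0, 0.0], '<=50K'
        # becomes [0.0, 1.0], and any unexpected label string gets the table's
        # default index -1, which one-hot encodes to an all-zero row.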

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Create a TFXIO to read the census data with the schema. To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            # We first read CSV files and use BeamRecordCsvTFXIO whose .BeamSource()
            # accepts a PCollection[bytes] because we need to patch the records first
            # (see "FixCommasTrainData" below). Otherwise, tfxio.CsvTFXIO can be used
            # to both read the CSV files and parse them to TFT inputs:
            # csv_tfxio = tfxio.CsvTFXIO(...)
            # raw_data = (pipeline | 'ToRecordBatches' >> csv_tfxio.BeamSource())
            csv_tfxio = tfxio.BeamRecordCsvTFXIO(
                physical_format='text',
                column_names=ORDERED_CSV_COLUMNS,
                schema=SCHEMA)

            # Read in raw data and convert using CSV TFXIO.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV TFXIO can read, in particular
            # removing spaces after commas.
            raw_data = (pipeline
                        | 'ReadTrainData' >> beam.io.ReadFromText(
                            train_data_file, coder=beam.coders.BytesCoder())
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(b', ', b','))
                        | 'DecodeTrainData' >> csv_tfxio.BeamSource())

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, csv_tfxio.TensorAdapterConfig())
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (transformed_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            # Now apply transform function to test data.  In this case we remove the
            # trailing period at the end of each line, and also ignore the header line
            # that is present in the test data file.
            raw_test_data = (pipeline
                             | 'ReadTestData' >> beam.io.ReadFromText(
                                 test_data_file,
                                 skip_header_lines=1,
                                 coder=beam.coders.BytesCoder())
                             | 'FixCommasTestData' >>
                             beam.Map(lambda line: line.replace(b', ', b','))
                             | 'RemoveTrailingPeriodsTestData' >>
                             beam.Map(lambda line: line[:-1])
                             | 'DecodeTestData' >> csv_tfxio.BeamSource())

            raw_test_dataset = (raw_test_data, csv_tfxio.TensorAdapterConfig())

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
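
# A sketch of how the materialized TFRecords above are typically consumed for
# training (an assumption; the batch size is illustrative):
#
#   tft_output = tft.TFTransformOutput(working_dir)
#   dataset = tf.data.experimental.make_batched_features_dataset(
#       file_pattern=os.path.join(
#           working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE + '*'),
#       batch_size=128,
#       features=tft_output.transformed_feature_spec(),
#       reader=tf.data.TFRecordDataset,
#       label_key=LABEL_KEY,
#       shuffle=True)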
Exemplo n.º 23
0
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Since we are modifying some features and leaving others unchanged, we
        # start by setting `outputs` to a copy of `inputs`.
        outputs = inputs.copy()

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(outputs[key])

        for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
            # This is a SparseTensor because it is optional. Here we fill in a default
            # value when it is missing.
            dense = tf.compat.v1.sparse_to_dense(
                outputs[key].indices, [outputs[key].dense_shape[0], 1],
                outputs[key].values,
                default_value=0.)
            # Reshaping from a batch of vectors of size 1 to a batch of scalars.
            dense = tf.squeeze(dense, axis=1)
            outputs[key] = tft.scale_to_0_1(dense)

        # For all categorical columns except the label column, we generate a
        # vocabulary but do not modify the feature.  This vocabulary is instead
        # used in the trainer, by means of a feature column, to convert the feature
        # from a string to an integer id.
        for key in CATEGORICAL_FEATURE_KEYS:
            tft.vocabulary(inputs[key], vocab_filename=key)

        # For the label column we provide the mapping from string to index.
        table_keys = ['>50K', '<=50K']
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=table_keys,
            values=tf.cast(tf.range(len(table_keys)), tf.int64),
            key_dtype=tf.string,
            value_dtype=tf.int64)
        table = tf.lookup.StaticHashTable(initializer, default_value=-1)
        outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = tft.coders.CsvCoder(ordered_columns,
                                            RAW_DATA_METADATA.schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing spaces after commas.
            #
            # We use MapAndFilterErrors instead of Map to filter out decode errors in
            # converter.decode which should only occur for the trailing blank line.
            raw_data = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromText(train_data_file)
                | 'FixCommasTrainData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (transformed_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            # Now apply transform function to test data.  In this case we remove the
            # trailing period at the end of each line, and also ignore the header line
            # that is present in the test data file.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> beam.io.ReadFromText(test_data_file,
                                                         skip_header_lines=1)
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
Exemplo n.º 24
0
def transform_data(input_features, preprocessing_fn, pipeline_args,
                   train_data_file, cv_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.
    Read in the data using the parquet io, and transform it using a
    preprocessing pipeline that scales numeric data and converts categorical data
    from strings to int64 indices by creating a vocabulary for each
    category.
    Args:
      input_features: Feature definitions (e.g. a named tuple with feature types)
        used to build the raw data metadata.
      preprocessing_fn: The tf.Transform preprocessing_fn to apply.
      pipeline_args: Additional args passed to the Beam pipeline.
      train_data_file: File containing training data
      cv_data_file: File containing cross-validation data
      test_data_file: File containing test data
      working_dir: Directory to write transformed data and metadata to
    """

    # Input schema definition
    RAW_DATA_METADATA = _get_raw_metadata(input_features)

    # Pipeline args for reading from GCS; currently unused because a local file is read.
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # create a beam pipeline
    with beam.Pipeline(options=pipeline_options) as pipeline:
        # Needs to be GCS location if the process is running on Dataflow, otherwise it can't share model files
        temp_dir = pipeline_options.get_all_options().get(
            'temp_location') or tempfile.mkdtemp()
        with tft_beam.Context(temp_dir=temp_dir):
            raw_data = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromParquet(train_data_file))

            # Combine data and schema into a dataset tuple.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            # write to tf record
            _ = (transformed_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, "train_tfrecord")))

            def encode_data(data_path, prefix, output_filename):
                # Apply transform function to test data.
                raw_data = (
                    pipeline
                    |
                    'ReadData' + prefix >> beam.io.ReadFromParquet(data_path))

                raw_dataset = (raw_data, RAW_DATA_METADATA)

                transformed_dataset = (
                    (raw_dataset, transform_fn)
                    | 'Transform' + prefix >> tft_beam.TransformDataset())

                # Don't need transformed data schema, it's the same as before.
                transformed_data, _ = transformed_dataset

                _ = (transformed_data
                     | 'EncodeData' + prefix >> beam.Map(
                         transformed_data_coder.encode)
                     | 'WriteData' + prefix >> beam.io.WriteToTFRecord(
                         os.path.join(working_dir, output_filename)))

            encode_data(cv_data_file, "-cv", "cv_tfrecord")
            encode_data(test_data_file, "-test", "test_tfrecord")

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
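
# `_get_raw_metadata` is not shown here; it is assumed to build the raw-data
# metadata from the feature definitions, along the lines of this hypothetical
# helper (assuming `input_features` maps feature names to tf dtypes):
#
#   def _get_raw_metadata(input_features):
#     feature_spec = {name: tf.io.FixedLenFeature([], dtype)
#                     for name, dtype in input_features.items()}
#     return dataset_metadata.DatasetMetadata(
#         dataset_schema.from_feature_spec(feature_spec))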
Exemplo n.º 25
0
    def test_single_phase_mixed_analyzer_run_once(self):
        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'integerized_s':
                integerized_s,
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.io.FixedLenFeature([], tf.float32),
                'y':
                tf.io.FixedLenFeature([], tf.float32),
                's':
                tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'b',
            }, {
                'x': 4,
                'y': -4,
                's': 'b',
            }],
            span_1_key:
            input_data,
        }

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # TODO(b/37788560): Get these names programmatically.
                cache_dict = {
                    span_0_key: {
                        '__v0__CacheableCombineAccumulate--x_1-mean_and_var--':
                        p
                        | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0, 0.0]']),
                        '__v0__CacheableCombineAccumulate--x-x--':
                        p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
                        '__v0__CacheableCombineAccumulate--y_1-mean_and_var--':
                        p |
                        'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25, 0.0]']),
                        '__v0__CacheableCombineAccumulate--y-y--':
                        p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
                    },
                    span_1_key: {},
                }

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, cache_dict, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed))

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn')
                _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
Exemplo n.º 26
0
    def test_single_phase_mixed_analyzer_run_once(self):
        cache_location = self._make_cache_location()

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        _write_cache('__v0__CacheableCombineAccumulate--x_1-mean_and_var--',
                     span_0_key, [2.0, 1.0, 9.0],
                     cache_location.input_cache_dir)
        _write_cache('__v0__CacheableCombineAccumulate--x-x--', span_0_key,
                     [2.0, 4.0], cache_location.input_cache_dir)
        _write_cache('__v0__CacheableCombineAccumulate--y_1-mean_and_var--',
                     span_0_key, [2.0, -1.5, 6.25],
                     cache_location.input_cache_dir)
        _write_cache('__v0__CacheableCombineAccumulate--y-y--', span_0_key,
                     [4.0, 1.0], cache_location.input_cache_dir)

        def preprocessing_fn(inputs):

            integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'integerized_s':
                integerized_s,
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.FixedLenFeature([], tf.float32),
                'y':
                tf.FixedLenFeature([], tf.float32),
                's':
                tf.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'b',
            }, {
                'x': 4,
                'y': -4,
                's': 'b',
            }],
            span_1_key:
            input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):

            flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

            transform_fn = ((flat_data, input_data_dict, input_metadata) |
                            (beam_impl.AnalyzeDatasetWithCache(
                                preprocessing_fn, cache_location)))

        transformed_dataset = ((
            (input_data_dict[span_1_key], input_metadata), transform_fn)
                               | beam_impl.TransformDataset())

        transformed_data, unused_transformed_metadata = transformed_dataset

        expected_transformed_data = [
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
                'integerized_s': 0,
            },
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
                'integerized_s': 0,
            },
        ]
        self.assertDataCloseOrEqual(transformed_data,
                                    expected_transformed_data)

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
        _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
Exemplo n.º 27
0
    def test_non_frequency_vocabulary_merge(self):
        """This test compares vocabularies produced with and without cache."""

        mi_vocab_name = 'mutual_information_vocab'
        adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
        weighted_frequency_vocab_name = 'weighted_frequency_vocab'

        def preprocessing_fn(inputs):
            _ = tft.vocabulary(inputs['s'],
                               labels=inputs['label'],
                               store_frequency=True,
                               vocab_filename=mi_vocab_name,
                               min_diff_from_avg=0.1,
                               use_adjusted_mutual_info=False)

            _ = tft.vocabulary(inputs['s'],
                               labels=inputs['label'],
                               store_frequency=True,
                               vocab_filename=adjusted_mi_vocab_name,
                               min_diff_from_avg=1.0,
                               use_adjusted_mutual_info=True)

            _ = tft.vocabulary(inputs['s'],
                               weights=inputs['weight'],
                               store_frequency=True,
                               vocab_filename=weighted_frequency_vocab_name,
                               use_adjusted_mutual_info=False)
            return inputs

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        input_data = [
            dict(s='a', weight=1, label=1),
            dict(s='a', weight=0.5, label=1),
            dict(s='b', weight=0.75, label=1),
            dict(s='b', weight=1, label=0),
        ]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                's':
                tf.io.FixedLenFeature([], tf.string),
                'label':
                tf.io.FixedLenFeature([], tf.int64),
                'weight':
                tf.io.FixedLenFeature([], tf.float32),
            }))
        input_data_dict = {
            span_0_key: input_data,
            span_1_key: input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):

            flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

            transform_fn_with_cache, output_cache = (
                (flat_data, input_data_dict, {}, input_metadata) |
                (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

            expected_accumulators = {
                '__v0__VocabularyAccumulate--vocabulary--':
                [b'["a", [2, 1.0, 0.0, 1.0]]', b'["b", [2, 0.5, 0.0, 1.0]]'],
                '__v0__VocabularyAccumulate--vocabulary_1--':
                [b'["a", [2, 1.0, 0.0, 1.0]]', b'["b", [2, 0.5, 0.0, 1.0]]'],
                '__v0__VocabularyAccumulate--vocabulary_2--':
                [b'["a", 1.5]', b'["b", 1.75]'],
            }
            spans = [span_0_key, span_1_key]
            self.assertCountEqual(output_cache.keys(), spans)
            for span in spans:
                self.assertCountEqual(output_cache[span].keys(),
                                      expected_accumulators.keys())
                for key, value in six.iteritems(expected_accumulators):
                    self.assertCountEqual(output_cache[span][key], value)

            transform_fn_no_cache = (
                (input_data * 2, input_metadata) |
                (beam_impl.AnalyzeDataset(preprocessing_fn)))

        transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                                   'transform_fn_with_cache')
        _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
            transform_fn_with_cache_dir)

        transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                                 'transform_fn_no_cache')
        _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
            transform_fn_no_cache_dir)

        tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
        tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

        for vocab_filename in (mi_vocab_name, adjusted_mi_vocab_name,
                               weighted_frequency_vocab_name):
            cache_path = tft_output_cache.vocabulary_file_by_name(
                vocab_filename)
            no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
                vocab_filename)
            with tf.io.gfile.GFile(cache_path, 'rb') as f1, tf.io.gfile.GFile(
                    no_cache_path, 'rb') as f2:
                self.assertEqual(
                    f1.readlines(), f2.readlines(),
                    'vocab with cache != vocab without cache for: {}'.format(
                        vocab_filename))
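A hedged sketch, not from the original test: with store_frequency=True each vocabulary line is expected to hold a frequency followed by the token, so a file obtained via vocabulary_file_by_name(...) above could be parsed roughly as follows (assumed line format; tf as imported in the examples).

def read_frequency_vocab(vocab_path):
    # Parse assumed "<frequency> <token>" lines into (token, frequency) pairs.
    entries = []
    with tf.io.gfile.GFile(vocab_path, 'r') as f:
        for line in f:
            frequency, token = line.rstrip('\n').split(' ', 1)
            entries.append((token, float(frequency)))
    return entries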
Example No. 28
    def test_single_phase_run_twice(self):

        cache_location = self._make_cache_location('input_cache_1',
                                                   'output_cache_1')

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            _ = tft.vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.FixedLenFeature([], tf.float32),
                'y':
                tf.FixedLenFeature([], tf.float32),
                's':
                tf.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'a',
            }, {
                'x': 4,
                'y': -4,
                's': 'a',
            }],
            span_1_key:
            input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):

            flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

            transform_fn = ((flat_data, input_data_dict, input_metadata) |
                            (beam_impl.AnalyzeDatasetWithCache(
                                preprocessing_fn, cache_location)))

        transformed_dataset = ((
            (input_data_dict[span_1_key], input_metadata), transform_fn)
                               | beam_impl.TransformDataset())

        transformed_data, unused_transformed_metadata = transformed_dataset

        expected_transformed_data = [
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
            },
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
            },
        ]
        self.assertDataCloseOrEqual(transformed_data,
                                    expected_transformed_data)

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
        _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

        for key in input_data_dict:
            key_cache_dir = os.path.join(cache_location.output_cache_dir, key)
            self.assertTrue(tf.gfile.IsDirectory(key_cache_dir))
            self.assertEqual(len(tf.gfile.ListDirectory(key_cache_dir)), 6)

        cache_location = self._make_cache_location('output_cache_1',
                                                   'output_cache_2')
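        # Second run: the cache written to 'output_cache_1' by the first run is
        # now supplied as the input cache, so analysis can be served from cache
        # and (as asserted below) no new cache output is produced.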

        with beam_impl.Context(temp_dir=self.get_temp_dir()):

            flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

            transform_fn = ((flat_data, input_data_dict, input_metadata) |
                            (beam_impl.AnalyzeDatasetWithCache(
                                preprocessing_fn, cache_location)))

        transformed_dataset = ((
            (input_data_dict[span_1_key], input_metadata), transform_fn)
                               | beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset
        self.assertDataCloseOrEqual(transformed_data,
                                    expected_transformed_data)

        self.assertFalse(tf.gfile.IsDirectory(cache_location.output_cache_dir))
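The _make_cache_location helper used above is not shown in this snippet; a purely hypothetical stand-in could look like the following, with attribute names chosen only to match how cache_location is used in the test.

import collections
import os

# Hypothetical stand-in for the test's _make_cache_location helper.
CacheLocation = collections.namedtuple(
    'CacheLocation', ['input_cache_dir', 'output_cache_dir'])

def make_cache_location(base_dir, input_name, output_name):
    # Bundle the per-run input and output cache directories together.
    return CacheLocation(
        input_cache_dir=os.path.join(base_dir, input_name),
        output_cache_dir=os.path.join(base_dir, output_name))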
Example No. 29
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(
                temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
            coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
            train_data = (pipeline
                          | 'ReadTrain' >> beam.io.ReadFromTFRecord(
                              os.path.join(working_dir,
                                           SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
                          | 'DecodeTrain' >> beam.Map(coder.decode))

            test_data = (pipeline
                         | 'ReadTest' >> beam.io.ReadFromTFRecord(
                             os.path.join(working_dir,
                                          SHUFFLED_TEST_DATA_FILEBASE + '*'))
                         | 'DecodeTest' >> beam.Map(coder.decode))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                # Here tf.compat.v1.string_split behaves differently from
                # tf.strings.split.
                review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
                review_indices = tft.compute_and_apply_vocabulary(
                    review_tokens, top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by compute_and_apply_vocabulary.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, RAW_DATA_METADATA)
                | 'AnalyzeAndTransform' >>
                tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            transformed_test_data, _ = (
                ((test_data, RAW_DATA_METADATA), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            _ = (transformed_train_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by tft.TRANSFORM_FN_DIR and
            # tft.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
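A minimal sketch, not part of the original example, of reading the transformed records back for training; it assumes transform_data() above has already run and reuses the TRANSFORMED_TRAIN_DATA_FILEBASE constant and the tf/tft/os imports from that example.

def make_transformed_train_dataset(working_dir, batch_size=32):
    # WriteTransformFn above wrote the transform output under working_dir.
    tft_output = tft.TFTransformOutput(working_dir)
    feature_spec = tft_output.transformed_feature_spec()
    files = tf.io.gfile.glob(
        os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE + '*'))
    dataset = tf.data.TFRecordDataset(files)
    # Parse batches of serialized tf.Example protos with the transformed spec.
    return dataset.batch(batch_size).map(
        lambda serialized: tf.io.parse_example(serialized, feature_spec))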
Example No. 30
    def _RunBeamImpl(self, inputs, outputs, preprocessing_fn,
                     input_dataset_metadata, raw_examples_data_format,
                     transform_output_path, compute_statistics,
                     materialize_output_paths):
        """Perform data preprocessing with FlumeC++ runner.

    Args:
      inputs: A dictionary of labelled input values.
      outputs: A dictionary of labelled output values.
      preprocessing_fn: The tf.Transform preprocessing_fn.
      input_dataset_metadata: A DatasetMetadata object for the input data.
      raw_examples_data_format: A string describing the raw data format.
      transform_output_path: An absolute path to write the output to.
      compute_statistics: A bool indicating whether or not compute statistics.
      materialize_output_paths: Paths to materialized outputs.

    Raises:
      RuntimeError: If reset() is not being invoked between two run().
      ValueError: If the schema is empty.

    Returns:
      Status of the execution.
    """
        raw_examples_file_format = common.GetSoleValue(
            inputs, labels.EXAMPLES_FILE_FORMAT_LABEL, strict=False)
        analyze_and_transform_data_paths = common.GetValues(
            inputs, labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL)
        transform_only_data_paths = common.GetValues(
            inputs, labels.TRANSFORM_ONLY_DATA_PATHS_LABEL)
        stats_use_tfdv = common.GetSoleValue(
            inputs, labels.TFT_STATISTICS_USE_TFDV_LABEL)
        per_set_stats_output_paths = common.GetValues(
            outputs, labels.PER_SET_STATS_OUTPUT_PATHS_LABEL)
        temp_path = common.GetSoleValue(outputs, labels.TEMP_OUTPUT_LABEL)

        tf.logging.info('Analyze and transform data patterns: %s',
                        list(enumerate(analyze_and_transform_data_paths)))
        tf.logging.info('Transform data patterns: %s',
                        list(enumerate(transform_only_data_paths)))
        tf.logging.info('Transform materialization output paths: %s',
                        list(enumerate(materialize_output_paths)))
        tf.logging.info('Transform output path: %s', transform_output_path)

        feature_spec = input_dataset_metadata.schema.as_feature_spec()
        try:
            analyze_input_columns = tft.get_analyze_input_columns(
                preprocessing_fn, feature_spec)
            transform_input_columns = (tft.get_transform_input_columns(
                preprocessing_fn, feature_spec))
        except AttributeError:
            # If using TFT 1.12, fall back to assuming all features are used.
            analyze_input_columns = feature_spec.keys()
            transform_input_columns = feature_spec.keys()
        # Use the same dataset (same columns) for AnalyzeDataset and computing
        # pre-transform stats so that the data will only be read once for these
        # two operations.
        if compute_statistics:
            analyze_input_columns = list(
                set(
                    list(analyze_input_columns) +
                    list(transform_input_columns)))
        analyze_input_dataset_metadata = copy.deepcopy(input_dataset_metadata)
        transform_input_dataset_metadata = copy.deepcopy(
            input_dataset_metadata)
        if input_dataset_metadata.schema is not _RAW_EXAMPLE_SCHEMA:
            analyze_input_dataset_metadata.schema = dataset_schema.from_feature_spec(
                {
                    feature: feature_spec[feature]
                    for feature in analyze_input_columns
                })
            transform_input_dataset_metadata.schema = (
                dataset_schema.from_feature_spec({
                    feature: feature_spec[feature]
                    for feature in transform_input_columns
                }))

        can_process_jointly = not bool(per_set_stats_output_paths
                                       or materialize_output_paths)
        analyze_data_list = self._MakeDatasetList(
            analyze_and_transform_data_paths, raw_examples_file_format,
            raw_examples_data_format, analyze_input_dataset_metadata,
            can_process_jointly)
        transform_data_list = self._MakeDatasetList(
            list(analyze_and_transform_data_paths) +
            list(transform_only_data_paths), raw_examples_file_format,
            raw_examples_data_format, transform_input_dataset_metadata,
            can_process_jointly)

        desired_batch_size = self._GetDesiredBatchSize(
            raw_examples_data_format)

        with self._CreatePipeline(outputs) as p:
            with tft_beam.Context(
                    temp_dir=temp_path,
                    desired_batch_size=desired_batch_size,
                    passthrough_keys={_TRANSFORM_INTERNAL_FEATURE_FOR_KEY},
                    use_deep_copy_optimization=True):
                # pylint: disable=expression-not-assigned
                # pylint: disable=no-value-for-parameter

                analyze_decode_fn = (self._GetDecodeFunction(
                    raw_examples_data_format,
                    analyze_input_dataset_metadata.schema))

                for (idx, dataset) in enumerate(analyze_data_list):
                    dataset.encoded = (p
                                       | 'ReadAnalysisDataset[{}]'.format(idx)
                                       >> self._ReadExamples(dataset))
                    dataset.decoded = (
                        dataset.encoded
                        | 'DecodeAnalysisDataset[{}]'.format(idx) >>
                        self._DecodeInputs(analyze_decode_fn))

                input_analysis_data = (
                    [dataset.decoded for dataset in analyze_data_list]
                    | 'FlattenAnalysisDatasets' >> beam.Flatten())
                transform_fn = ((input_analysis_data, input_dataset_metadata)
                                | 'AnalyzeDataset' >>
                                tft_beam.AnalyzeDataset(preprocessing_fn))
                # Write the raw/input metadata.
                (input_dataset_metadata
                 | 'WriteMetadata' >> tft_beam.WriteMetadata(
                     os.path.join(transform_output_path,
                                  tft.TFTransformOutput.RAW_METADATA_DIR), p))

                # WriteTransformFn writes transform_fn and metadata to subdirectories
                # tensorflow_transform.SAVED_MODEL_DIR and
                # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
                (transform_fn | 'WriteTransformFn' >>
                 tft_beam.WriteTransformFn(transform_output_path))

                if compute_statistics or materialize_output_paths:
                    # Do not compute pre-transform stats if the input format is raw proto,
                    # as StatsGen would treat any input as tf.Example.
                    if (compute_statistics and not self._IsDataFormatProto(
                            raw_examples_data_format)):
                        # Aggregated feature stats before transformation.
                        pre_transform_feature_stats_path = os.path.join(
                            transform_output_path, tft.TFTransformOutput.
                            PRE_TRANSFORM_FEATURE_STATS_PATH)

                        # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
                        # schema. Currently input dataset schema only contains dtypes,
                        # and other metadata is dropped due to roundtrip to tensors.
                        schema_proto = schema_utils.schema_from_feature_spec(
                            analyze_input_dataset_metadata.schema.
                            as_feature_spec())
                        ([
                            dataset.decoded
                            if stats_use_tfdv else dataset.encoded
                            for dataset in analyze_data_list
                        ]
                         | 'FlattenPreTransformAnalysisDatasets' >>
                         beam.Flatten()
                         | 'GenerateAggregatePreTransformAnalysisStats' >>
                         self._GenerateStats(pre_transform_feature_stats_path,
                                             schema_proto,
                                             use_deep_copy_optimization=True,
                                             use_tfdv=stats_use_tfdv))

                    transform_decode_fn = (self._GetDecodeFunction(
                        raw_examples_data_format,
                        transform_input_dataset_metadata.schema))
                    # transform_data_list is a superset of analyze_data_list; we
                    # pay the cost of reading the same dataset (analyze_data_list)
                    # again here to prevent certain Beam runners from doing large
                    # temp materialization.
                    for (idx, dataset) in enumerate(transform_data_list):
                        dataset.encoded = (
                            p
                            | 'ReadTransformDataset[{}]'.format(idx) >>
                            self._ReadExamples(dataset))
                        dataset.decoded = (
                            dataset.encoded
                            | 'DecodeTransformDataset[{}]'.format(idx) >>
                            self._DecodeInputs(transform_decode_fn))
                        (dataset.transformed, metadata) = (
                            ((dataset.decoded,
                              transform_input_dataset_metadata), transform_fn)
                            | 'TransformDataset[{}]'.format(idx) >>
                            tft_beam.TransformDataset())

                        if materialize_output_paths or not stats_use_tfdv:
                            dataset.transformed_and_encoded = (
                                dataset.transformed
                                | 'EncodeTransformedDataset[{}]'.format(idx) >>
                                beam.ParDo(self._EncodeAsExamples(), metadata))

                    if compute_statistics:
                        # Aggregated feature stats after transformation.
                        _, metadata = transform_fn
                        post_transform_feature_stats_path = os.path.join(
                            transform_output_path, tft.TFTransformOutput.
                            POST_TRANSFORM_FEATURE_STATS_PATH)

                        # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
                        # schema. Currently input dataset schema only contains dtypes,
                        # and other metadata is dropped due to roundtrip to tensors.
                        transformed_schema_proto = schema_utils.schema_from_feature_spec(
                            metadata.schema.as_feature_spec())

                        ([(dataset.transformed if stats_use_tfdv else
                           dataset.transformed_and_encoded)
                          for dataset in transform_data_list]
                         | 'FlattenPostTransformAnalysisDatasets' >>
                         beam.Flatten()
                         | 'GenerateAggregatePostTransformAnalysisStats' >>
                         self._GenerateStats(post_transform_feature_stats_path,
                                             transformed_schema_proto,
                                             use_tfdv=stats_use_tfdv))

                        if per_set_stats_output_paths:
                            assert len(transform_data_list) == len(
                                per_set_stats_output_paths)
                            # TODO(b/67632871): Remove duplicate stats gen compute that is
                            # done both on a flattened view of the data, and on each span
                            # below.
                            bundles = zip(transform_data_list,
                                          per_set_stats_output_paths)
                            for (idx, (dataset,
                                       output_path)) in enumerate(bundles):
                                if stats_use_tfdv:
                                    data = dataset.transformed
                                else:
                                    data = dataset.transformed_and_encoded
                                (data
                                 | 'GeneratePostTransformStats[{}]'.format(idx)
                                 >> self._GenerateStats(
                                     output_path,
                                     transformed_schema_proto,
                                     use_tfdv=stats_use_tfdv))

                    if materialize_output_paths:
                        assert len(transform_data_list) == len(
                            materialize_output_paths)
                        bundles = zip(transform_data_list,
                                      materialize_output_paths)
                        for (idx, (dataset,
                                   output_path)) in enumerate(bundles):
                            (dataset.transformed_and_encoded
                             | 'Materialize[{}]'.format(idx) >>
                             self._WriteExamples(raw_examples_file_format,
                                                 output_path))

        return _Status.OK()
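A hedged, standalone sketch of the column-inspection helpers used in _RunBeamImpl above (tft.get_analyze_input_columns / tft.get_transform_input_columns); the small preprocessing_fn and feature spec below are made up purely for illustration.

def _demo_preprocessing_fn(inputs):
    # 'x' needs a full-pass analyzer (mean/variance for z-score scaling),
    # while 'y' is only needed at transform time and 'unused' is never read.
    return {
        'x_scaled': tft.scale_to_z_score(inputs['x']),
        'y_copy': inputs['y'],
    }

_demo_feature_spec = {
    'x': tf.io.FixedLenFeature([], tf.float32),
    'y': tf.io.FixedLenFeature([], tf.float32),
    'unused': tf.io.FixedLenFeature([], tf.string),
}

# Expected to report ['x'] for analysis and ['x', 'y'] for transformation.
print(tft.get_analyze_input_columns(_demo_preprocessing_fn, _demo_feature_spec))
print(tft.get_transform_input_columns(_demo_preprocessing_fn, _demo_feature_spec))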