Example #1
    def pipeline(root):
        """Pipeline instantiation function.

    Args:
      root: Source pipeline from which to extend.
    """

        # This pipeline is concerned only with searching the sparse features.

        with tft_beam.Context(temp_dir=FLAGS.temp_dir):
            processed_lines = (
                root
                # Read in TSV data.
                | "ReadData" >> beam.io.ReadFromText(data_path)
                # For categorical features, search for the given values, as integers.
                | "HexSearchFilter" >> beam.ParDo(HexSearchFilter(), 1, [
                    14198776, 26023586, 21084594
                ]).with_outputs("malformed_entries", main="filtered_outputs"))

            malformed_lines = processed_lines.malformed_entries
            processed_lines = processed_lines.filtered_outputs

            _ = (processed_lines
                 | "WriteData" >> beam.io.WriteToText(output_path))

            _ = (malformed_lines
                 | "WriteDataMalformed" >>
                 beam.io.WriteToText(output_path + "_malformed"))
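The `HexSearchFilter` DoFn used above is defined elsewhere in the original script. Purely as a rough sketch, a DoFn with a tagged side output for malformed rows might look like the following (the column-index handling and hex parsing are assumptions, not the original implementation):

import apache_beam as beam


class HexSearchFilter(beam.DoFn):
    """Hypothetical sketch: keep TSV lines whose given column matches a search value."""

    def process(self, line, col_index, search_values):
        fields = line.split("\t")
        try:
            value = int(fields[col_index], 16)
        except (IndexError, ValueError):
            # Route unparseable rows to the "malformed_entries" side output.
            yield beam.pvalue.TaggedOutput("malformed_entries", line)
            return
        if value in search_values:
            yield line  # Main output, tagged "filtered_outputs" at the call site.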
Example #2
  def testPreprocessingFn(self):
    schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    feature_spec = taxi_utils._get_raw_feature_spec(schema)
    working_dir = self.get_temp_dir()
    transform_graph_path = os.path.join(working_dir, 'transform_graph')
    transformed_examples_path = os.path.join(
        working_dir, 'transformed_examples')

    # Run very simplified version of executor logic.
    # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
    # Generate legacy `DatasetMetadata` object.  Future version of Transform
    # will accept the `Schema` proto directly.
    legacy_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))
    tfxio = tf_example_record.TFExampleRecord(
        file_pattern=os.path.join(self._testdata_path,
                                  'csv_example_gen/Split-train/*'),
        telemetry_descriptors=['Tests'],
        schema=legacy_metadata.schema)
    with beam.Pipeline() as p:
      with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
        examples = p | 'ReadTrainData' >> tfxio.BeamSource()
        (transformed_examples, transformed_metadata), transform_fn = (
            (examples, tfxio.TensorAdapterConfig())
            | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                taxi_utils.preprocessing_fn))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        # pylint: disable=expression-not-assigned
        (transform_fn
         |
         'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_graph_path))

        encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        (transformed_examples
         | 'EncodeTrainData' >> beam.Map(encoder.encode)
         | 'WriteTrainData' >> beam.io.WriteToTFRecord(
             os.path.join(transformed_examples_path,
                          'Split-train/transformed_examples.gz'),
             coder=beam.coders.BytesCoder()))
        # pylint: enable=expression-not-assigned

    # Verify the output matches golden output.
    # NOTE: we don't verify that transformed examples match golden output.
    expected_transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(
            self._testdata_path,
            'transform/transform_graph/transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(transform_graph_path, 'transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    # Clear annotations so we only have to test main schema.
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(transformed_schema, expected_transformed_schema)
Example #3
    def test_train(self):
        """Tests case where training data is passed."""

        with self.pipeline as p:
            with tft_beam.Context(temp_dir=os.path.join(self.test_dir, 'tmp')):
                df = self.pre_tft_df[self.pre_tft_df.split == 'TRAIN']
                dataset = self._get_dataset(p, df)
                preprocessing_fn = functools.partial(
                    beam_pipeline._preprocessing_fn,
                    schema_map=self.schema.pre_tft_schema_map)
                transform_fn = (beam_pipeline._transform_and_write_tfr(
                    dataset,
                    self.tfr_writer,
                    preprocessing_fn=preprocessing_fn,
                    metadata=self.pre_tft_metadata,
                    label='Train'))
                _ = transform_fn | tft_beam.WriteTransformFn(self.test_dir)

        self.assertTrue(
            os.path.isdir(os.path.join(self.test_dir, 'transform_fn')))
        self.assertTrue(
            os.path.isdir(os.path.join(self.test_dir, 'transformed_metadata')))
        self.assertTrue(glob.glob(os.path.join(self.test_dir, 'train*.gz')))
        self.assertFalse(
            glob.glob(os.path.join(self.test_dir, 'validation*.gz')))
        self.assertFalse(glob.glob(os.path.join(self.test_dir, 'test*.gz')))
Example #4
def _main(argv=None):
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_examples_path', required=True)
    parser.add_argument('--raw_examples_schema_path', required=True)
    parser.add_argument('--preprocessing_module_path', required=True)
    parser.add_argument('--transform_fn_dir', required=True)
    known_args, pipeline_args = parser.parse_known_args(argv)

    raw_examples_schema = load_schema(known_args.raw_examples_schema_path)
    raw_examples_coder = tft.coders.ExampleProtoCoder(raw_examples_schema)
    raw_examples_metadata = dataset_metadata.DatasetMetadata(
        raw_examples_schema)

    tft_preprocessing = load_module_from_file_path(
        'tft_preprocessing', known_args.preprocessing_module_path)
    preprocessing_fn = tft_preprocessing.preprocessing_fn

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=get_beam_temp_dir(pipeline_options)):
            raw_examples = pipeline | 'ReadRawExamples' >> beam.io.ReadFromTFRecord(
                known_args.raw_examples_path, coder=raw_examples_coder)
            raw_examples_dataset = (raw_examples, raw_examples_metadata)
            transform_fn = raw_examples_dataset | tft_beam.AnalyzeDataset(
                preprocessing_fn)
            transform_fn | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
                known_args.transform_fn_dir)
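`load_schema`, `load_module_from_file_path`, and `get_beam_temp_dir` are helpers from the original project and are not shown here. A plausible sketch of `load_schema`, assuming the schema is stored as a text-format `schema_pb2.Schema` proto:

import tensorflow as tf
from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import schema_pb2


def load_schema(schema_path):
    # Hypothetical helper: parse a text-format Schema proto from disk.
    schema = schema_pb2.Schema()
    with tf.io.gfile.GFile(schema_path, 'r') as f:
        text_format.Parse(f.read(), schema)
    return schema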
Example #5
  def run_metrics():
    """Creates a pipeline to measure wordpiece vocab metrics over a corpus."""

    metrics_pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Read raw data and convert to TF Transform encoded dict.
      raw_data = (
          metrics_pipeline
          | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
              data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
          | 'DecodeInputData' >> beam.Map(example_converter.decode))

      # Apply transform to wordpiece-tokenize input.
      (metrics_transformed_data, _), _ = (
          (raw_data, raw_metadata)
          | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
              utils.metrics_preprocessing_fn(FLAGS.vocab_file,
                                             FLAGS.text_key,
                                             FLAGS.language_code_key)))

      # Initialize CSV coder. Aggregate values for each lang, calculate metrics,
      # and write the output to a CSV file.
      csv_converter = tft.coders.CsvCoder(columns, csv_schema)
      _ = (
          metrics_transformed_data
          | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
          | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
          | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
          | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
          | 'WriteMetrics' >> beam.io.WriteToText(
              metrics_file, shard_name_template='', header=','.join(columns)))
    return metrics_pipeline
Example #6
  def expand(self, pipeline):
    # TODO(b/147620802): Consider making this (and other parameters)
    # configurable to test more variants (e.g. with and without deep-copy
    # optimisation, with and without cache, etc).
    with tft_beam.Context(
        temp_dir=tempfile.mkdtemp(),
        force_tf_compat_v1=self._force_tf_compat_v1):
      raw_data = (
          pipeline
          | "ReadDataset" >> beam.Create(
              self._dataset.read_raw_dataset(
                  deserialize=False, limit=self._max_num_examples))
          | "Decode" >> self._tfxio.BeamSource())
      transform_fn, output_metadata = (
          (raw_data, self._tfxio.TensorAdapterConfig())
          | "AnalyzeDataset" >> tft_beam.AnalyzeDataset(self._preprocessing_fn))

      if self._generate_dataset:
        _ = transform_fn | "CopySavedModel" >> _CopySavedModel(
            dest_path=self._dataset.tft_saved_model_path(
                self._force_tf_compat_v1))

      (transformed_dataset, transformed_metadata) = (
          ((raw_data, self._tfxio.TensorAdapterConfig()),
           (transform_fn, output_metadata))
          | "TransformDataset" >> tft_beam.TransformDataset())
      return transformed_dataset, transformed_metadata
Example #7
def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
    """Returns a pipeline counting words and writing the output.

  Args:
    input_path: recordio file to read
    output_path: path in which to write the output
    raw_metadata: metadata of input tf.Examples
    min_token_frequency: the min frequency for a token to be included
  """

    lang_set = set(FLAGS.lang_set.split(','))

    # Create pipeline.
    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        converter = tft.coders.ExampleProtoCoder(raw_metadata.schema,
                                                 serialized=False)

        # Read raw data and convert to TF Transform encoded dict.
        raw_data = (
            pipeline
            | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
            | 'DecodeInputData' >> beam.Map(converter.decode))

        # Apply TF Transform.
        (transformed_data, _), _ = (
            (raw_data, raw_metadata)
            |
            'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
                utils.count_preprocessing_fn(FLAGS.text_key,
                                             FLAGS.language_code_key)))

        # Filter by languages.
        tokens = (
            transformed_data
            | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

        # Calculate smoothing coefficients.
        coeffs = (tokens
                  | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                      utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

        # Apply smoothing, aggregate counts, and sort words by count.
        _ = (tokens
             | 'ApplyExponentialSmoothing' >> beam.ParDo(
                 utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
             | 'SumCounts' >> beam.CombinePerKey(sum)
             | 'FilterLowCounts' >> beam.ParDo(
                 utils.FilterByCount(FLAGS.max_word_length,
                                     min_token_frequency))
             |
             'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
             | 'Flatten' >> beam.FlatMap(lambda x: x)
             | 'FormatCounts' >> beam.Map(lambda tc: '%s\t%s' % (tc[0], tc[1]))
             | 'WriteSortedCount' >> beam.io.WriteToText(
                 output_path, shard_name_template=''))

    return pipeline
Example #8
    def expand(self, pipeline):
        # TODO(b/147620802): Consider making this (and other parameters)
        # configurable to test more variants (e.g. with and without deep-copy
        # optimisation, with and without cache, etc).
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            converter = tft.coders.ExampleProtoCoder(self._tf_metadata_schema,
                                                     serialized=False)
            raw_data = (
                pipeline
                |
                "ReadDataset" >> beam.Create(self._dataset.read_raw_dataset())
                | "Decode" >> beam.Map(converter.decode))
            transform_fn, output_metadata = (
                (raw_data, self._transform_input_dataset_metadata)
                | "AnalyzeDataset" >> tft_beam.AnalyzeDataset(
                    self._preprocessing_fn))

            if self._generate_dataset:
                _ = transform_fn | "CopySavedModel" >> _CopySavedModel(
                    dest_path=self._dataset.tft_saved_model_path())

            (transformed_dataset, transformed_metadata) = (
                ((raw_data, self._transform_input_dataset_metadata),
                 (transform_fn, output_metadata))
                | "TransformDataset" >> tft_beam.TransformDataset())
            return transformed_dataset, transformed_metadata
Example #9
def run():
    pipeline_options = PipelineOptions(['--runner=DirectRunner'])

    def preprocessing_fn(inputs):
        word = inputs['word']
        count = inputs['count']
        return {
            'word': word,
            'count': count,
            'count_normalized': tft.scale_to_0_1(count)
        }

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            counts_data = (pipeline
                           | "Load" >> ReadFromText(INPUT_FILE)
                           | "CountWords" >> CountWordsTransform())

            (transformed_data, transformed_metadata), _ = (
                (counts_data, COUNTS_METADATA)
                | "AnalyzeAndTransform" >>
                tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

            output_column_names = ['word', 'count', 'count_normalized']
            transformed_data_coder = tft.coders.CsvCoder(
                output_column_names, transformed_metadata.schema)

            _ = (transformed_data
                 | "EncodeToCsv" >> beam.Map(transformed_data_coder.encode)
                 | "Save" >> WriteToText(OUTPUT_FILE))
Example #10
def encode():
    """
    Creates a Beam pipeline that generates data, transforms it and encodes it in ELWC
    """
    output_path = "./output"
    options = PipelineOptions()
    options.view_as(StandardOptions).runner = "DirectRunner"

    with beam.Pipeline(options=options) as pipeline:
        with tft_beam.Context(temp_dir="./tmp"):
            raw_data = generate_data(100)
            input_data = (pipeline | beam.Create(raw_data))

            transformed_data, transform_fn = (
                (input_data, raw_metadata)
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

            elwc_coder = ELWCProtoCoder(context_specs, examples_specs)
            data, metadata = transformed_data

            _ = (data | beam.Map(elwc_coder.encode) | beam.io.WriteToTFRecord(
                file_path_prefix="{}/data".format(output_path),
                file_name_suffix=".tfrecords"))

            _ = (transform_fn | tft_beam.WriteTransformFn(output_path))
Example #11
    def testNestedContextCreateBaseTempDir(self):

        level_1_dir = self.get_temp_dir()
        with tft_beam.Context(temp_dir=level_1_dir):
            self.assertEqual(
                os.path.join(level_1_dir, tft_beam.Context._TEMP_SUBDIR),
                tft_beam.Context.create_base_temp_dir())
            level_2_dir = self.get_temp_dir()
            with tft_beam.Context(temp_dir=level_2_dir):
                self.assertEqual(
                    os.path.join(level_2_dir, tft_beam.Context._TEMP_SUBDIR),
                    tft_beam.Context.create_base_temp_dir())
            self.assertEqual(
                os.path.join(level_1_dir, tft_beam.Context._TEMP_SUBDIR),
                tft_beam.Context.create_base_temp_dir())
        with self.assertRaises(ValueError):
            tft_beam.Context.create_base_temp_dir()
Example #12
def run(pipeline_options, known_args):
    global force_tf_compat_v1
    argv = None  # if None, uses sys.argv
    pipeline_options = PipelineOptions(argv)
    pipeline = beam.Pipeline(options=pipeline_options)

    if "universal-sentence-encoder" in MODEL_URL and int(
            MODEL_URL.split("/")[-1]) <= 2:
        # https://github.com/tensorflow/transform/issues/160
        force_tf_compat_v1 = True

    with tft_beam.Context(temp_dir=tempfile.mkdtemp(),
                          force_tf_compat_v1=force_tf_compat_v1):
        print("Context force_tf_compat_v1: {}".format(
            tft_beam.Context.get_use_tf_compat_v1()))
        articles = (
            pipeline
            | beam.Create([
                {
                    "id": "01",
                    "text": "To be, or not to be: that is the question: "
                },
                {
                    "id": "02",
                    "text": "Whether 'tis nobler in the mind to suffer "
                },
                {
                    "id": "03",
                    "text": "The slings and arrows of outrageous fortune, "
                },
                {
                    "id": "04",
                    "text": "Or to take arms against a sea of troubles, "
                },
            ]))

        articles_dataset = (articles, get_metadata())

        transformed_dataset, transform_fn = (
            articles_dataset
            | "Extract embeddings" >>
            tft_beam.AnalyzeAndTransformDataset(preprocess_fn))

        transformed_data, transformed_metadata = transformed_dataset

        _ = (transformed_data
             | "Print embeddings" >> beam.Map(print_pass)
             | "Write embeddings to TFRecords" >>
             beam.io.tfrecordio.WriteToTFRecord(
                 file_path_prefix="{0}".format(known_args.output_dir),
                 file_name_suffix=".tfrecords",
                 coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                     transformed_metadata.schema),
                 num_shards=1))

    job = pipeline.run()
    if pipeline_options.get_all_options()["runner"] == "DirectRunner":
        job.wait_until_finish()
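`print_pass` is not shown above; because it sits inside a `beam.Map` that feeds the TFRecord write, it presumably prints each element and returns it unchanged. A minimal sketch (the name is taken from the call site, the body is an assumption):

def print_pass(element):
    # Debug pass-through: print the transformed element and hand it on unchanged
    # so the downstream WriteToTFRecord still receives every record.
    print(element)
    return element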
Example #13
    def setUp(self):
        super(CachedImplTest, self).setUp()
        self.base_test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        self._cache_dir = os.path.join(self.base_test_dir, 'cache')

        self._context = tft_beam.Context(temp_dir=self.get_temp_dir())
        self._context.__enter__()
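Since `setUp` enters the `tft_beam.Context` manually via `__enter__`, a matching `tearDown` would normally exit it. A sketch of what that could look like, assuming no other per-test cleanup is needed:

    def tearDown(self):
        # Exit the context entered in setUp before the base-class teardown runs.
        self._context.__exit__(None, None, None)
        super(CachedImplTest, self).tearDown()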
Example #14
def main():

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (_RAW_DATA, _RAW_DATA_METADATA)
            | tft_beam.AnalyzeAndTransformDataset(_preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

    pprint.pprint(transformed_data)
Example #15
def data_transform():
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (
            (dict_features, data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset

    for i in range(len(transformed_data)):
        print("Initial: ", dict_features[i])
        print("Transformed: ", transformed_data[i])
Example #16
def main():
    with beam.Pipeline() as p:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            converter = tft.coders.CsvCoder(['f1', 'y'],
                                            raw_data_metadata.schema)
            coder = tft.coders.ExampleProtoCoder(raw_data_metadata.schema)
            raw_data = (p
                        | beam.io.ReadFromText('./train.csv')
                        | beam.Map(lambda line: line.replace(', ', ','))
                        | beam.Map(converter.decode)
                        | beam.io.WriteToTFRecord('./train_tx', coder))
Example #17
def generate_skipgrams(data_uri,
                       feature_names,
                       vocabulary_size=10,
                       window_size=2,
                       negative_samples=0.,
                       save_path="temp"):
    def parse_tensor_f(x):
        xp = tf.io.parse_tensor(x, tf.int64)
        xp.set_shape([None])
        return {fname: xp[i] for i, fname in enumerate(feature_names)}

    raw_data = tf.data.TFRecordDataset(data_uri).map(
        parse_tensor_f).as_numpy_iterator()
    raw_data_schema = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec({
            fname: tf.io.FixedLenFeature([], tf.int64)
            for fname in feature_names
        }))
    dataset = (raw_data, raw_data_schema)

    # Make the preprocessing_fn
    preprocessing_fn = make_preproc_func(vocabulary_size, window_size,
                                         negative_samples, feature_names)

    # Run the beam pipeline
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp(),
                              desired_batch_size=2):
            transformed_dataset, transform_fn = (
                dataset | "Make Skipgrams" >>
                tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            print('Transformed dataset:\n{}'.format(
                pprint.pformat(transformed_dataset)))

            # pylint: disable=unused-variable
            transformed_data, transformed_metadata = transformed_dataset
            saved_results = (
                transformed_data
                | "Write to TFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
                    file_path_prefix=save_path,
                    file_name_suffix=".tfrecords",
                    coder=tft.coders.example_proto_coder.ExampleProtoCoder(
                        transformed_metadata.schema)))
            print('\nRaw data:\n{}\n'.format(pprint.pformat(raw_data)))
            print('Transformed data:\n{}'.format(
                pprint.pformat(transformed_data)))
            # Return the list of paths of tfrecords
            num_rows_saved = len(transformed_data)

    return saved_results, num_rows_saved
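A call to this function might look like the following; the input path and feature names are placeholders, not values from the original code:

write_result, num_rows = generate_skipgrams(
    "tokens.tfrecord",                  # hypothetical TFRecord of serialized int64 tensors
    feature_names=["target", "context"],
    vocabulary_size=100,
    window_size=2,
    save_path="skipgrams/part")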
Example #18
    def run_vocab():
        """Creates a pipeline to generate wordpiece vocab over a corpus."""

        vocab_pipeline = beam.Pipeline()

        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Read raw data and convert to TF Transform encoded dict.
            raw_data = (
                vocab_pipeline
                | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                    data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
                | 'DecodeInputData' >> beam.Map(example_converter.decode))

            # Apply TF Transform.
            (transformed_data,
             _), _ = ((raw_data, raw_metadata)
                      | 'FilterLangAndExtractToken' >>
                      tft_beam.AnalyzeAndTransformDataset(
                          utils.count_preprocessing_fn(
                              FLAGS.text_key, FLAGS.language_code_key)))

            # Filter by languages.
            tokens = (transformed_data
                      | 'FilterByLang' >> beam.ParDo(
                          utils.FilterTokensByLang(lang_set)))

            # Calculate smoothing coefficients.
            coeffs = (
                tokens
                | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                    utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

            # Apply smoothing, aggregate counts, and sort words by count.
            _ = (tokens
                 | 'ApplyExponentialSmoothing' >> beam.ParDo(
                     utils.ExponentialSmoothing(),
                     beam.pvalue.AsSingleton(coeffs))
                 | 'SumCounts' >> beam.CombinePerKey(sum)
                 | 'FilterLowCounts' >> beam.ParDo(
                     utils.FilterByCount(FLAGS.max_word_length,
                                         min_token_frequency))
                 | 'MergeAndSortCounts' >> beam.CombineGlobally(
                     utils.SortByCount())
                 | 'LearnVocab' >> beam.ParDo(utils.LearnVocab(params))
                 | 'Flatten' >> beam.FlatMap(lambda x: x + '\n')
                 | 'WriteVocab' >> beam.io.WriteToText(
                     vocab_file,
                     shard_name_template='',
                     append_trailing_newlines=False))
        return vocab_pipeline
Example #19
def transform_tft(train_data, test_data, working_dir):
    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'
    with beam.Pipeline(options=options) as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            data_shape = train_data[0][0].shape
            raw_data = (
                pipeline | 'ReadTrainData' >> beam.Create(train_data)
                | 'CreateTrainData' >> beam.Map(lambda data: format(data)))
            raw_data_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec({
                    IMAGE_KEY:
                    tf.FixedLenFeature(list(data_shape), tf.float32),
                    LABEL_KEY:
                    tf.FixedLenFeature([], tf.int64)
                }))
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (
                transformed_data
                | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
                    file_name_suffix='.tfrecords'))

            raw_test_data = (
                pipeline | 'ReadTestData' >> beam.Create(test_data)
                | 'CreateTestData' >> beam.Map(lambda data: format(data)))
            raw_test_dataset = (raw_test_data, raw_data_metadata)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (transformed_test_data
                 | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTestData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
                     file_name_suffix='.tfrecords'))

            _ = (transform_fn |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
Example #20
def transformed_data(working_dir):
    """数据处理与生成transform_fn"""
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        xi, yi = inputs["x"], inputs["y"]
        x_integerized = tft.compute_and_apply_vocabulary(xi, default_value=0, name="vocab")  # , top_k=VOCAB_SIZE)
        y_integerized = tft.compute_and_apply_vocabulary(yi, default_value=0, name="label")  # ,top_k=LABEL_SIZE
        return {"x": x_integerized, "y": y_integerized}

    # path_transform
    with tft_beam.Context(temp_dir=path_transform):
        transformed_dataset, transform_fn = (
            (xys, DATA_STRING_FEATURE_SPEC)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_train_data, transformed_metadata = transformed_dataset
        _ = (transform_fn | tft_beam.WriteTransformFn(working_dir))
    return transformed_train_data
Example #21
 def test_preprocessing_fn(self, with_deep_copy, features_config):
     # Fake features_config global variables to test their transformed values.
     features_config.TARGET_FEATURE = 't'
     features_config.ID_FEATURE = 'i'
     features_config.NUMERIC_FEATURES = ['n1', 'n2']
     features_config.CATEGORICAL_FEATURES = ['c1', 'c2']
     features_config.OOV_SIZE = 5
     features_config.VOCAB_SIZE = 10
     input_metadata = _create_input_metadata(features_config)
     input_data = [{
         't': [0.0],
         'i': [0],
         'n1': [1.0],
         'n2': [2.0],
         'c1': ['test1'],
         'c2': ['test2']
     }, {
         't': [1.0],
         'i': [1],
         'n1': [3.0],
         'n2': [4.0],
         'c1': ['test2'],
         'c2': ['test1']
     }]
     expected_data = [{
         't': 0.0,
         'i': [0],
         'tr_n1': -1.0,
         'tr_n2': -1.0,
         'tr_c1': 1,
         'tr_c2': 0
     }, {
         't': 1.0,
         'i': [1],
         'tr_n1': 1.0,
         'tr_n2': 1.0,
         'tr_c1': 0,
         'tr_c2': 1
     }]
     expected_metadata = _create_output_metadata(features_config, 0, 6)
     # Assert that transformed result matches expected_data & expected_metadata.
     with tft_beam.Context(use_deep_copy_optimization=with_deep_copy):
         self.assertAnalyzeAndTransformResults(
             input_data=input_data,
             input_metadata=input_metadata,
             preprocessing_fn=transformer.preprocessing_fn,
             expected_data=expected_data,
             expected_metadata=expected_metadata)
Example #22
  def test_non_training(self):
    """Tests case where dataset contains non-training (e.g. test) data."""

    with self.pipeline as p:
      with tft_beam.Context(temp_dir=os.path.join(self.test_dir, 'tmp')):

        df = self.raw_df[self.raw_df.split == 'TEST']
        dataset = self._get_dataset(p, df)
        transform_fn = p | tft_beam.ReadTransformFn(self.transform_fn_path)
        beam_pipeline._transform_and_write_tfr(
            dataset, self.tfr_writer, transform_fn=transform_fn,
            raw_metadata=self.raw_metadata, label='Test')

    self.assertFalse(glob.glob(os.path.join(self.test_dir, 'train*.gz')))
    self.assertFalse(glob.glob(os.path.join(self.test_dir, 'validation*.gz')))
    self.assertTrue(glob.glob(os.path.join(self.test_dir, 'test*.gz')))
Example #23
    def pipeline(root):
        """Pipeline instantiation function.

    Args:
      root: Source pipeline from which to extend.
    """

        preprocessing_fn = compute_vocab_fn if FLAGS.vocab_gen_mode else apply_vocab_fn

        with tft_beam.Context(temp_dir=FLAGS.temp_dir):
            processed_lines = (
                root
                # Read in TSV data.
                | beam.io.ReadFromText(data_path)
                # Fill in missing elements with the defaults (zeros).
                | "FillMissing" >> beam.ParDo(FillMissing())
                # For numerical features, set negatives to zero. Then take log(x+1).
                | "NegsToZeroLog" >> beam.ParDo(NegsToZeroLog())
                # For categorical features, mod the values with vocab size.
                | "HexToIntModRange" >> beam.ParDo(HexToIntModRange()))

            # CSV reader: List the cols in order, as dataset schema is not ordered.
            ordered_columns = [
                LABEL_KEY
            ] + NUMERIC_FEATURE_KEYS + CATEGORICAL_FEATURE_KEYS
            converter = tft.coders.CsvCoder(ordered_columns,
                                            INPUT_METADATA.schema,
                                            delimiter=FLAGS.csv_delimeter)

            converted_data = (processed_lines
                              | "DecodeData" >> beam.Map(converter.decode))

            transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
                (converted_data, INPUT_METADATA)
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            if not FLAGS.vocab_gen_mode:
                # Write to CSV.
                transformed_csv_coder = tft.coders.CsvCoder(
                    ordered_columns,
                    transformed_metadata.schema,
                    delimiter=FLAGS.csv_delimeter)
                _ = (transformed_data
                     |
                     "EncodeDataCsv" >> beam.Map(transformed_csv_coder.encode)
                     | "WriteDataCsv" >> beam.io.WriteToText(output_path))
Example #24
def transform_data(train_data_file, working_dir):
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            ordered_columns = ['C' + str(i) for i in range(10)]
            print(ordered_columns)
            converter = tft.coders.CsvCoder(ordered_columns,
                                            RAW_DATA_METADATA.schema)

            raw_data = (
                pipeline
                | 'Read Train Data' >> beam.io.ReadFromText(
                    train_data_file, skip_header_lines=1)
                | 'Fix Commas in Train Data' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'Decode Train Data' >> MapAndFilterErrors(converter.decode))
            print("\n\n\n", raw_data.__dict__)
            print("\n\n\n", raw_data.producer)
            print("\n\n\n", raw_data.producer.__dict__)
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft.beam.AnalyzeAndTransformDataset(preprocessing_fn))

            transformed_data, transformed_metadata = transformed_dataset

            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (transformed_data
                 |
                 'Encode Train Data' >> beam.Map(transformed_data_coder.encode)
                 | 'Write Train Data' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (transform_fn
                 |
                 'Write TransformFn' >> tft.beam.WriteTransformFn(working_dir))
            print("YOOHOO\n")
Example #25
def main():
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'}
  ]

  raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'x': tf.io.FixedLenFeature([], tf.float32),
      }))

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
            preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  pprint.pprint(transformed_data)
Example #26
def _main(argv=None):
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_examples_path', required=True)
    parser.add_argument('--raw_examples_schema_path', required=True)
    parser.add_argument('--transform_fn_dir', required=True)
    parser.add_argument('--transformed_examples_path_prefix', required=True)
    known_args, pipeline_args = parser.parse_known_args(argv)

    raw_examples_schema = load_schema(known_args.raw_examples_schema_path)
    raw_examples_coder = tft.coders.ExampleProtoCoder(raw_examples_schema)
    raw_examples_metadata = dataset_metadata.DatasetMetadata(
        raw_examples_schema)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=get_beam_temp_dir(pipeline_options)):
            transform_fn = pipeline | tft_beam.ReadTransformFn(
                known_args.transform_fn_dir)
            raw_examples = (
                pipeline
                | 'ReadRawExamples' >> beam.io.ReadFromTFRecord(
                    known_args.raw_examples_path, coder=raw_examples_coder))
            raw_examples_dataset = (raw_examples, raw_examples_metadata)
            transformed_examples, transform_examples_metadata = (
                (raw_examples_dataset, transform_fn)
                | tft_beam.TransformDataset())
            transformed_examples_coder = tft.coders.ExampleProtoCoder(
                transform_examples_metadata.schema)
            transformed_examples | 'WriteTransformedExamples' >> beam.io.WriteToTFRecord(
                known_args.transformed_examples_path_prefix,
                file_name_suffix='.tfrecord.gz',
                coder=transformed_examples_coder)
Example #27
def run_hub2emb(args):
    '''Runs the embedding generation pipeline'''

    options = beam.options.pipeline_options.PipelineOptions(**args)
    args = namedtuple("options", args.keys())(*args.values())

    raw_metadata = create_metadata()
    converter = tft.coders.CsvCoder(column_names=['text'],
                                    schema=raw_metadata.schema)

    with beam.Pipeline(args.runner, options=options) as pipeline:
        with tft_beam.Context(args.temporary_dir):
            # Read the sentences from the input file
            sentences = (
                pipeline
                | 'Read sentences from files' >>
                beam.io.ReadFromText(file_pattern='corpus/text.txt')
                # | 'Convert to dictionary' >> beam.Map(converter.decode)
            )

            sentences_dataset = (sentences, raw_metadata)
            preprocess_fn = make_preprocess_fn(args.module_url,
                                               args.random_projection_matrix)
            # Generate the embeddings for the sentence using the TF-Hub module
            embeddings_dataset, _ = (
                sentences_dataset
                | 'Extract embeddings' >>
                tft_beam.AnalyzeAndTransformDataset(preprocess_fn))

            embeddings, transformed_metadata = embeddings_dataset
            # Write the embeddings to TFRecords files
            embeddings | 'Write embeddings to TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix='{}/emb'.format(args.output_dir),
                file_name_suffix='.tfrecords',
                coder=tft.coders.ExampleProtoCoder(
                    transformed_metadata.schema))
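`create_metadata` and `make_preprocess_fn` come from the original notebook and are not reproduced here. A rough sketch of the preprocessing function, assuming a TF2-style TF-Hub text encoder and an optional random projection (parameter names are taken from the call site; the body is an assumption, not the original implementation):

import tensorflow as tf
import tensorflow_hub as hub


def make_preprocess_fn(module_url, random_projection_matrix=None):
    """Hypothetical sketch: embed the 'text' column with a TF-Hub encoder."""

    def preprocess_fn(inputs):
        # Loading the module inside preprocess_fn keeps it part of the traced graph.
        encoder = hub.load(module_url)
        embedding = encoder(inputs['text'])
        if random_projection_matrix is not None:
            embedding = tf.matmul(
                embedding, tf.cast(random_projection_matrix, tf.float32))
        return {'text': inputs['text'], 'embedding': embedding}

    return preprocess_fn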
Example #28
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(
                temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
            coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
            train_data = (pipeline
                          | 'ReadTrain' >> beam.io.ReadFromTFRecord(
                              os.path.join(working_dir,
                                           SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
                          | 'DecodeTrain' >> beam.Map(coder.decode))

            test_data = (pipeline
                         | 'ReadTest' >> beam.io.ReadFromTFRecord(
                             os.path.join(working_dir,
                                          SHUFFLED_TEST_DATA_FILEBASE + '*'))
                         | 'DecodeTest' >> beam.Map(coder.decode))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                # Here tf.compat.v1.string_split behaves differently from
                # tf.strings.split.
                review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
                review_indices = tft.compute_and_apply_vocabulary(
                    review_tokens, top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by compute_and_apply_vocabulary.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, RAW_DATA_METADATA)
                | 'AnalyzeAndTransform' >>
                tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            transformed_test_data, _ = (
                ((test_data, RAW_DATA_METADATA), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            _ = (transformed_train_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by tft.TRANSFORM_FN_DIR and
            # tft.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
Example #29
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: An file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with tft_beam.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder(schema)
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1))
                decode_transform = beam.Map(csv_coder.decode)
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (pipeline
                            | 'ReadBigQuery' >> beam.io.Read(
                                beam.io.BigQuerySource(query=query,
                                                       use_standard_sql=True)))
                decode_transform = beam.Map(taxi.clean_raw_data_dict,
                                            raw_feature_spec=raw_feature_spec)

            if transform_dir is None:
                decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
                transform_fn = (
                    (decoded_data, raw_data_metadata) |
                    ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

                _ = (transform_fn
                     | ('WriteTransformFn' >>
                        tft_beam.WriteTransformFn(working_dir)))
            else:
                transform_fn = pipeline | tft_beam.ReadTransformFn(
                    transform_dir)

            # Shuffling the data before materialization will improve Training
            # effectiveness downstream. Here we shuffle the raw_data (as opposed to
            # decoded data) since it has a compact representation.
            shuffled_data = (
                raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())

            decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
            (transformed_data, transformed_metadata) = (
                ((decoded_data, raw_data_metadata), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (transformed_data
                 | 'SerializeExamples' >> beam.Map(coder.encode)
                 | 'WriteExamples' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, outfile_prefix),
                     file_name_suffix='.gz'))
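`_fill_in_missing` is the small helper used throughout the TFX taxi examples; its definition is not part of this snippet, but a typical version densifies a rank-2 `SparseTensor` and fills gaps with '' or 0 (shown here as a sketch):

import tensorflow as tf


def _fill_in_missing(x):
    """Replaces missing values in a SparseTensor and converts it to a dense tensor."""
    default_value = '' if x.dtype == tf.string else 0
    return tf.squeeze(
        tf.sparse.to_dense(
            tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
            default_value),
        axis=1)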
Example #30
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Get human review result on a model through Slack channel.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - slack_blessing: model blessing result.
      exec_properties: A dict of execution properties, including:
        - slack_token: Token used to setup connection with slack server.
        - slack_channel_id: The id of the Slack channel to send and receive
          messages.
        - timeout_sec: How long do we wait for response, in seconds.

    Returns:
      None

    Raises:
      TimeoutError:
        When there is no decision made within timeout_sec.
      ConnectionError:
        When connection to slack server cannot be established.

    """
    self._log_startup(input_dict, output_dict, exec_properties)
    transform_graph_uri = artifact_utils.get_single_uri(
        input_dict[TRANSFORM_GRAPH_KEY])
    temp_path = os.path.join(transform_graph_uri, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
    # transformed_schema_file = os.path.join(
    #   transform_graph_uri,
    #   tft.TFTransformOutput.TRANSFORMED_METADATA_DIR,
    #   'schema.pbtxt'
    # )
    # transformed_schema_proto = io_utils.parse_pbtxt_file(
    #   transformed_schema_file,
    #   schema_pb2.Schema()
    # )
    transformed_train_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'train')
    transformed_eval_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'eval')

    tf_transform_output = tft.TFTransformOutput(transform_graph_uri)
    # transform_output_dataset_metadata = dataset_metadata.DatasetMetadata(
    #   schema=transformed_schema_proto
    # )

    # transform_fn = (tf_transform_output.transform_raw_features, transform_output_dataset_metadata)
    # feature_spec = schema_utils.schema_as_feature_spec(schema_proto).feature_spec
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY]))
    schema_proto = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_proto
    )

    train_data_uri = artifact_utils.get_split_uri(
      input_dict[EXAMPLES_KEY],
      'train'
    )
    eval_data_uri = artifact_utils.get_split_uri(
      input_dict[EXAMPLES_KEY],
      'eval'
    )
    analyze_data_paths = [io_utils.all_files_pattern(train_data_uri)]
    transform_data_paths = [
      io_utils.all_files_pattern(train_data_uri),
      io_utils.all_files_pattern(eval_data_uri),
    ]
    materialize_output_paths = [
      os.path.join(transformed_train_output, _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      os.path.join(transformed_eval_output, _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
    ]
    transform_data_list = self._MakeDatasetList(
      transform_data_paths,
      materialize_output_paths
    )
    analyze_data_list = self._MakeDatasetList(
      analyze_data_paths,
    )

    with self._make_beam_pipeline() as pipeline:
      with tft_beam.Context(temp_dir=temp_path):
        # NOTE: Unclear if there is a difference between input_dataset_metadata
        # and transform_input_dataset_metadata. Look at Transform executor.
        decode_fn = tft.coders.ExampleProtoCoder(schema_proto, serialized=True).decode

        input_analysis_data = {}
        for dataset in analyze_data_list:
          infix = 'AnalysisIndex{}'.format(dataset.index)
          dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))
          dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix)
            >> self._DecodeInputs(decode_fn))
          input_analysis_data[dataset.dataset_key] = dataset.decoded

        if not hasattr(tft_beam.analyzer_cache, 'DatasetKey'):
          input_analysis_data = (
              [
                  dataset for dataset in input_analysis_data.values()
                  if dataset is not None
              ]
              | 'FlattenAnalysisDatasetsBecauseItIsRequired' >>
              beam.Flatten(pipeline=pipeline))

        transform_fn = (
            (input_analysis_data, transform_input_dataset_metadata)
            | 'Analyze' >> tft_beam.AnalyzeDataset(
                tf_transform_output.transform_raw_features, pipeline=pipeline))

        for dataset in transform_data_list:
          infix = 'TransformIndex{}'.format(dataset.index)
          dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))

          dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix)
            >> self._DecodeInputs(decode_fn))

          dataset.transformed, metadata = (
              ((dataset.decoded, transform_input_dataset_metadata), transform_fn)
              | 'Transform[{}]'.format(infix) >> tft_beam.TransformDataset())

          dataset.transformed_and_serialized = (
              dataset.transformed
              | 'EncodeAndSerialize[{}]'.format(infix)
              >> beam.ParDo(self._EncodeAsSerializedExamples(), _GetSchemaProto(metadata)))

          _ = (
            dataset.transformed_and_serialized
            | 'Materialize[{}]'.format(infix) >> self._WriteExamples(dataset.materialize_output_path))