Example #1
    def testWriteMetadataIsRetryable(self):
        tft_test_case.skip_if_external_environment(
            'Retries are currently not available on this environment.')
        original_write_metadata = beam_metadata_io.metadata_io.write_metadata
        write_metadata_called_list = []

        def mock_write_metadata(metadata, path):
            """Mocks metadata_io.write_metadata to fail the first time it is called by this test, thus forcing a retry which should succeed."""
            if not write_metadata_called_list:
                write_metadata_called_list.append(True)
                original_write_metadata(metadata, path)
                raise ArithmeticError('Some error')
            return original_write_metadata(metadata, path)

        # Write metadata to disk using WriteMetadata PTransform.
        with mock.patch(
                'tensorflow_transform.tf_metadata.metadata_io.write_metadata',
                mock_write_metadata):
            with self._makeTestPipeline() as pipeline:
                path = self.get_temp_dir()
                _ = (test_metadata.COMPLETE_METADATA
                     | beam_metadata_io.WriteMetadata(path, pipeline))

            # Load from disk and check that it is as expected.
            metadata = metadata_io.read_metadata(path)
            self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
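The retry test above uses a generic "fail once, then succeed" mocking pattern: a module-level function is patched with a wrapper that raises on its first invocation and behaves normally afterwards. A minimal, self-contained sketch of that pattern using only the standard library (the patched target, time.sleep, and all names below are illustrative choices, not taken from the example):

import time
from unittest import mock

calls = []

def flaky_sleep(seconds):
    # Fail only on the first call so that retry logic gets exercised.
    if not calls:
        calls.append(True)
        raise ArithmeticError('Some error')

with mock.patch('time.sleep', flaky_sleep):
    try:
        time.sleep(0)  # First call raises.
    except ArithmeticError:
        pass
    time.sleep(0)  # A retried call succeeds.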
Example #2
 def testWriteMetadataNonDeferred(self):
     # Write properties as metadata to disk.
     with beam.Pipeline() as pipeline:
         path = self.get_temp_dir()
         _ = (_TEST_METADATA_COMPLETE
              | beam_metadata_io.WriteMetadata(path, pipeline))
     # Load from disk and check that it is as expected.
     metadata = metadata_io.read_metadata(path)
     self.assertMetadataEqual(metadata, _TEST_METADATA_COMPLETE)
Example #3
 def testWriteMetadataDeferredProperties(self):
     # Write deferred properties as metadata to disk.
     with beam.Pipeline() as pipeline:
         path = self.get_temp_dir()
         deferred_metadata = pipeline | beam.Create([_FUTURES_DICT])
         _ = ((_TEST_METADATA_WITH_FUTURES, deferred_metadata)
              | beam_metadata_io.WriteMetadata(path, pipeline))
     # Load from disk and check that it is as expected.
     metadata = metadata_io.read_metadata(path)
     self.assertMetadataEqual(metadata, _TEST_METADATA)
Example #4
 def testWriteMetadataNonDeferredEmptyDict(self):
     # Write properties as metadata to disk.
     with beam.Pipeline() as pipeline:
         path = self.get_temp_dir()
         property_pcoll = pipeline | beam.Create([{}])
         _ = ((_TEST_METADATA, property_pcoll)
              | beam_metadata_io.WriteMetadata(path, pipeline))
     # Load from disk and check that it is as expected.
     metadata = metadata_io.read_metadata(path)
     self.assertMetadataEqual(metadata, _TEST_METADATA)
Example #5
    def testWriteMetadataNonDeferred(self):
        # Write metadata to disk using WriteMetadata PTransform.
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()
            _ = (test_metadata.COMPLETE_METADATA
                 | beam_metadata_io.WriteMetadata(path, pipeline))

        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
Example #6
 def expand(self, transform_fn):
     saved_model_dir_pcoll, metadata = transform_fn
     # Write metadata in a non-deferred manner.  Once metadata contains deferred
     # components, the deferred components will be written in a deferred manner
     # while the non-deferred components will be written in a non-deferred
     # manner.
     _ = metadata | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
         os.path.join(self._path, 'transformed_metadata'),
         pipeline=saved_model_dir_pcoll.pipeline)
     return saved_model_dir_pcoll | 'WriteTransformFn' >> beam.Map(
         _copy_tree, os.path.join(self._path, 'transform_fn'))
Example #7
    def testWriteMetadataDeferred(self):
        # Write metadata to disk using WriteMetadata PTransform, combining
        # incomplete metadata with (deferred) complete metadata.
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()
            deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
                [test_metadata.COMPLETE_METADATA])
            metadata = beam_metadata_io.BeamDatasetMetadata(
                test_metadata.INCOMPLETE_METADATA, deferred_metadata)
            _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
Example #8
    def expand(self, transform_fn):
        saved_model_dir, metadata = transform_fn
        pipeline = saved_model_dir.pipeline

        # Using a temp dir within `path` ensures that the source and destination
        # paths for the rename below are in the same file system.
        base_temp_dir = os.path.join(self._path, 'transform_tmp')
        temp_metadata_path = (
            metadata
            | 'WriteMetadataToTemp' >> beam_metadata_io.WriteMetadata(
                base_temp_dir, pipeline, write_to_unique_subdirectory=True))

        temp_transform_fn_path = (
            saved_model_dir
            | 'WriteTransformFnToTemp' >> beam.Map(
                _copy_tree_to_unique_temp_dir, base_temp_dir))

        metadata_path = os.path.join(
            self._path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
        transform_fn_path = os.path.join(
            self._path, tft.TFTransformOutput.TRANSFORM_FN_DIR)

        def publish_outputs(unused_element, metadata_source_path,
                            transform_fn_source_path):
            import tensorflow as tf  # pylint: disable=g-import-not-at-top
            if not tf.io.gfile.exists(self._path):
                tf.io.gfile.makedirs(self._path)

            tf.io.gfile.rename(metadata_source_path,
                               metadata_path,
                               overwrite=True)
            tf.io.gfile.rename(transform_fn_source_path,
                               transform_fn_path,
                               overwrite=True)
            # TODO(b/211615643): Remove the exists check once importing TFIO in S3
            # addresses NotFoundError.
            if tf.io.gfile.exists(base_temp_dir):
                tf.io.gfile.rmtree(base_temp_dir)

        # TODO(KesterTong): Move this "must follows" logic into a tfx_bsl helper
        # function or into Beam.
        return (pipeline
                | 'CreateSole' >> beam.Create([None])
                | 'PublishMetadataAndTransformFn' >> beam.Map(
                    publish_outputs,
                    metadata_source_path=beam.pvalue.AsSingleton(
                        temp_metadata_path),
                    transform_fn_source_path=beam.pvalue.AsSingleton(
                        temp_transform_fn_path)))
Example #9
    def testWriteMetadataDeferredProperties(self):
        # Write deferred properties as metadata to disk.
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()

            # Combine _TEST_METADATA with the complete (deferred) metadata.
            deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
                [_TEST_METADATA_COMPLETE])
            metadata = beam_metadata_io.BeamDatasetMetadata(
                _TEST_METADATA, deferred_metadata)

            _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)
        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertMetadataEqual(metadata, _TEST_METADATA_COMPLETE)
Example #10
    def testWriteMetadataDeferredProperties(self):
        # Write deferred properties as metadata to disk.
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()

            # Combine test metadata with a dict of PCollections resolving futures.
            metadata = beam_metadata_io.BeamDatasetMetadata(
                _TEST_METADATA_WITH_FUTURES, {
                    'a': pipeline | 'CreateA' >> beam.Create([3]),
                    'b': pipeline | 'CreateB' >> beam.Create([5])
                })

            _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)
        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertMetadataEqual(metadata, _TEST_METADATA)
Example #11
    def expand(self, transform_fn):
        saved_model_dir, properties = transform_fn

        metadata_path = os.path.join(self._path, 'transformed_metadata')
        pipeline = saved_model_dir.pipeline
        write_metadata_done = (
            properties
            | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                metadata_path, pipeline))

        transform_fn_path = os.path.join(self._path, 'transform_fn')
        write_transform_fn_done = (
            saved_model_dir
            | 'WriteTransformFn' >> beam.Map(_copy_tree, transform_fn_path))

        return (write_transform_fn_done
                | 'WaitOnWriteMetadataDone' >> beam.Map(
                    lambda x, dummy: x,
                    dummy=beam.pvalue.AsSingleton(write_metadata_done)))
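The final step in the expand above is a general Beam ordering trick: the output of the metadata write is consumed as an AsSingleton side input by a Map over the transform_fn write, so that step cannot run until the metadata write has finished. A minimal, self-contained sketch of the same pattern in plain Apache Beam (step names and element values below are hypothetical):

import apache_beam as beam

with beam.Pipeline() as p:
    # Stand-in for the write that must happen first; its output becomes a side input.
    first_done = p | 'FirstStep' >> beam.Create(['first-result'])
    second = p | 'SecondStep' >> beam.Create(['second-result'])
    # The lambda ignores the side input; requiring it merely delays this step
    # until first_done has been fully computed.
    _ = (second
         | 'WaitOnFirst' >> beam.Map(
             lambda x, dummy: x,
             dummy=beam.pvalue.AsSingleton(first_done)))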
Example #12
    def expand(self, transform_fn):
        saved_model_dir, metadata = transform_fn

        metadata_path = os.path.join(self._path, TRANSFORMED_METADATA_DIR)
        pipeline = saved_model_dir.pipeline
        write_metadata_done = (
            metadata
            | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                metadata_path, pipeline))

        transform_fn_path = os.path.join(self._path, TRANSFORM_FN_DIR)
        write_transform_fn_done = (
            saved_model_dir
            | 'WriteTransformFn' >> beam.Map(_copy_tree, transform_fn_path))

        return (write_transform_fn_done
                | 'WaitOnWriteMetadataDone' >> beam.Map(
                    lambda x, dummy: x,
                    dummy=beam.pvalue.AsSingleton(write_metadata_done)))
Example #13
    def testTransformFnExportAndImportRoundtrip(self):
        transform_fn_dir = os.path.join(self.get_temp_dir(),
                                        'export_transform_fn')
        metadata_dir = os.path.join(self.get_temp_dir(), 'export_metadata')

        with beam.Pipeline() as p:

            def preprocessing_fn(inputs):
                return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

            metadata = self.toMetadata(
                {'x': tf.FixedLenFeature((), tf.float32, 0)})
            columns = p | 'CreateTrainingData' >> beam.Create([{
                'x': v
            } for v in [4, 1, 5, 2]])
            with beam_impl.Context(temp_dir=self.get_temp_dir()):
                _, transform_fn = (
                    (columns, metadata)
                    | 'Analyze and Transform' >>
                    beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _ = transform_fn | transform_fn_io.WriteTransformFn(
                transform_fn_dir)
            _ = metadata | beam_metadata_io.WriteMetadata(metadata_dir,
                                                          pipeline=p)

        with beam.Pipeline() as p:
            transform_fn = p | transform_fn_io.ReadTransformFn(transform_fn_dir)
            metadata = p | beam_metadata_io.ReadMetadata(metadata_dir)
            # Run transform_columns on some eval dataset.
            eval_data = p | 'CreateEvalData' >> beam.Create([{
                'x': v
            } for v in [6, 3]])
            transformed_eval_data, _ = (
                ((eval_data, metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())
            expected_transformed_eval_data = [{
                'x_scaled': v
            } for v in [1.25, 0.5]]
            beam_test_util.assert_that(
                transformed_eval_data,
                beam_test_util.equal_to(expected_transformed_eval_data))
Example #14
    def testWriteMetadataDeferred(self):
        # Write metadata to disk using WriteMetadata PTransform, combining
        # incomplete metadata with (deferred) complete metadata.
        expected_asset_map = {'key': 'value'}
        with beam.Pipeline() as pipeline:
            path = self.get_temp_dir()
            deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
                [test_metadata.COMPLETE_METADATA])
            metadata = beam_metadata_io.BeamDatasetMetadata(
                test_metadata.INCOMPLETE_METADATA, deferred_metadata,
                expected_asset_map)
            _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

        # Load from disk and check that it is as expected.
        metadata = metadata_io.read_metadata(path)
        self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

        with tf.io.gfile.GFile(
                os.path.join(path,
                             output_wrapper.TFTransformOutput.ASSET_MAP)) as f:
            asset_map = json.loads(f.read())
            self.assertDictEqual(asset_map, expected_asset_map)
Example #15
    def expand(self, transform_fn):
        saved_model_dir, metadata = transform_fn

        metadata_path = os.path.join(
            self._path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
        pipeline = saved_model_dir.pipeline
        write_metadata_done = (
            metadata
            | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                metadata_path, pipeline))

        transform_fn_path = os.path.join(
            self._path, tft.TFTransformOutput.TRANSFORM_FN_DIR)
        write_transform_fn_done = (
            saved_model_dir
            | 'WriteTransformFn' >> beam.Map(_copy_tree, transform_fn_path))

        # TODO(KesterTong): Move this "must follows" logic into a TFT wide helper
        # function or into Beam.
        return (write_transform_fn_done
                | 'WaitOnWriteMetadataDone' >> beam.Map(
                    lambda x, dummy: x,
                    dummy=beam.pvalue.AsSingleton(write_metadata_done)))
Example #16
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes, and maps tokens to int64 indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """

  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))

      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        def remove_character(s, char):
          """Remove a character from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string
            char: A string of length 1

          Returns:
            The string `s` with the given character removed (i.e. replaced by
            '')
          """
          # Hacky implementation where we split and rejoin.
          split = tf.string_split(s, char)
          rejoined = tf.reduce_join(
              tf.sparse_to_dense(
                  split.indices, split.dense_shape, split.values, ''),
              1)
          return rejoined

        def remove_punctuation(s):
          """Remove puncuation from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string

          Returns:
            The string `s` with punctuation removed.
          """
          for char in PUNCTUATION_CHARACTERS:
            s = remove_character(s, char)
          return s

        cleaned_review = tft.map(remove_punctuation, review)
        review_tokens = tft.map(tf.string_split, cleaned_review)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        return {
            REVIEW_COLUMN: review_indices,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              transformed_train_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_test_data
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              transformed_test_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_metadata
          | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
              transformed_metadata_dir, pipeline=pipeline))
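A usage sketch for the function above; every path and file pattern below is a hypothetical placeholder, not part of the example:

# Hypothetical invocation; all arguments are placeholder paths.
transform_data(
    train_neg_filepattern='/tmp/reviews/train/neg/*.txt',
    train_pos_filepattern='/tmp/reviews/train/pos/*.txt',
    test_neg_filepattern='/tmp/reviews/test/neg/*.txt',
    test_pos_filepattern='/tmp/reviews/test/pos/*.txt',
    transformed_train_filebase='/tmp/reviews_out/train_transformed',
    transformed_test_filebase='/tmp/reviews_out/test_transformed',
    transformed_metadata_dir='/tmp/reviews_out/transformed_metadata')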
Example #17
def preprocess_data(train_neg_file_pattern,
                    train_pos_file_pattern,
                    test_neg_file_pattern,
                    test_pos_file_pattern,
                    transformed_train_file_pattern,
                    transformed_test_file_pattern,
                    transformed_metadata_dir,
                    raw_metadata_dir,
                    transform_func_dir,
                    temp_dir,
                    vocab_size,
                    delimiters):
    """Transform the data and write out as a TFRecord of Example protos.
    Read in the data from the positive and negative examples on disk, and
    transform it using a preprocessing pipeline that removes punctuation,
    tokenizes and maps tokens to int64 values indices.

    Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data should be written


    raw_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
        REVIEW_COLUMN: dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation()),
        LABEL_COLUMN: dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation()),
    }))
    """
    pipeline_name = 'DataflowRunner'
    options = {
        'job_name': ('cloud-ml-hazmat-preprocess-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
        'temp_location': temp_dir,
        'project': "stone-outpost-636",
        'max_num_workers': 8
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    #with beam.Pipeline(pipeline_name, options=pipeline_options) as pipeline:
    #    with beam_impl.Context(temp_dir=temp_dir):
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):

            train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData((train_neg_file_pattern, train_pos_file_pattern))
            test_data = pipeline | 'ReadTest' >> ReadAndShuffleData((test_neg_file_pattern, test_pos_file_pattern))
            preprocessing_fn = generate_preprocessing_fn(vocab_size, delimiters)

            (transformed_train_data, transformed_metadata), transform_fn = ((train_data, const.RAW_METADATA)
              | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _ = (transform_fn | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(transform_func_dir))

            transformed_test_data, _ = (((test_data, const.RAW_METADATA), transform_fn)
              | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
              | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(transformed_train_file_pattern,
                  coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))

            _ = (transformed_test_data
              | 'WriteTestData' >> tfrecordio.WriteToTFRecord(transformed_test_file_pattern,
                  coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))

            _ = (transformed_metadata
              | 'WriteTransformedMetadata' >> beam_metadata_io.WriteMetadata(transformed_metadata_dir, pipeline=pipeline))

            _ = (const.RAW_METADATA
              | 'WriteRawMetadata' >> beam_metadata_io.WriteMetadata(raw_metadata_dir, pipeline=pipeline))
Example #18
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
    """Transform the data and write out as a TFRecord of Example protos.

    Read in the data using the CSV reader, and transform it using a
    preprocessing pipeline that scales numeric data and converts categorical
    data from strings to int64 indices, by creating a vocabulary for each
    category.

    Args:
      train_data_file: File containing training data
      test_data_file: File containing test data
      transformed_train_filebase: Base filename for transformed training data
          shards
      transformed_test_filebase: Base filename for transformed test data shards
      transformed_metadata_dir: Directory where metadata for transformed data
          should be written
    """
    raw_data_schema = {
        key:
        dataset_schema.ColumnSchema(tf.string, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in CATEGORICAL_COLUMNS
    }
    raw_data_schema.update({
        key:
        dataset_schema.ColumnSchema(tf.float32, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in NUMERIC_COLUMNS
    })
    raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation())
    raw_data_schema = dataset_schema.Schema(raw_data_schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_COLUMNS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_COLUMNS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing empty lines and removing spaces after commas.
            raw_data = (pipeline
                        |
                        'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(', ', ','))
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                transformed_train_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            # Now apply transform function to test data.  In this case we also remove
            # the header line from the CSV file and the trailing period at the end of
            # each line.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> textio.ReadFromText(test_data_file)
                | 'FilterTestData' >> beam.Filter(
                    lambda line: line and line != '|1x3 Cross validator')
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, raw_data_metadata)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                transformed_test_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))
Example #19
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
    """Transform the data and write out as a TFRecord of Example protos.

    Read in the data from the positive and negative examples on disk, and
    transform it using a preprocessing pipeline that removes punctuation,
    tokenizes, and maps tokens to int64 indices.

    Args:
      train_neg_filepattern: Filepattern for training data negative examples
      train_pos_filepattern: Filepattern for training data positive examples
      test_neg_filepattern: Filepattern for test data negative examples
      test_pos_filepattern: Filepattern for test data positive examples
      transformed_train_filebase: Base filename for transformed training data
          shards
      transformed_test_filebase: Base filename for transformed test data shards
      transformed_metadata_dir: Directory where metadata for transformed data
          should be written
    """

    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # pylint: disable=no-value-for-parameter
            train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
                (train_neg_filepattern, train_pos_filepattern))
            # pylint: disable=no-value-for-parameter
            test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
                (test_neg_filepattern, test_pos_filepattern))

            metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.Schema({
                    REVIEW_COLUMN:
                    dataset_schema.ColumnSchema(
                        tf.string, [],
                        dataset_schema.FixedColumnRepresentation()),
                    LABEL_COLUMN:
                    dataset_schema.ColumnSchema(
                        tf.int64, [],
                        dataset_schema.FixedColumnRepresentation()),
                }))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_COLUMN]

                review_tokens = tf.string_split(review, DELIMITERS)
                review_indices = tft.string_to_int(review_tokens,
                                                   top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by string_to_int.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_COLUMN: review_bow_indices,
                    REVIEW_WEIGHT: review_weight,
                    LABEL_COLUMN: inputs[LABEL_COLUMN]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, metadata)
                | 'AnalyzeAndTransform' >>
                beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            transformed_test_data, _ = (
                ((test_data, metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     transformed_train_filebase,
                     coder=example_proto_coder.ExampleProtoCoder(
                         transformed_metadata.schema)))

            _ = (transformed_test_data
                 | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                     transformed_test_filebase,
                     coder=example_proto_coder.ExampleProtoCoder(
                         transformed_metadata.schema)))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))