Example #1
    def expand(self, transform_fn):
        saved_model_dir_pcoll, metadata = transform_fn

        # Write metadata in a non-deferred manner.  Once metadata contains deferred
        # components, the deferred components will be written in a deferred manner
        # while the non-deferred components will be written in a non-deferred
        # manner.
        def safe_copy_tree(source, dest):
            if source == dest:
                raise ValueError(
                    'Cannot write a TransformFn to its current location.')
            fileio.ChannelFactory.copytree(source, dest)

        _ = metadata | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
            os.path.join(self._path, 'transformed_metadata'),
            pipeline=saved_model_dir_pcoll.pipeline)
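        # Copy the SavedModel directory produced by the analysis phase into
        # self._path/transform_fn; the copy happens when the pipeline runs,
        # once per element of saved_model_dir_pcoll.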
        return saved_model_dir_pcoll | 'WriteTransformFn' >> beam.Map(
            safe_copy_tree, os.path.join(self._path, 'transform_fn'))
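
The source == dest guard in safe_copy_tree is independent of the copy mechanism. As a rough local-filesystem analogue (a sketch using only the standard library instead of Beam's fileio, purely for illustration):

import shutil

def safe_copy_tree_local(source, dest):
    # Refuse to copy a directory onto itself, mirroring the guard above.
    if source == dest:
        raise ValueError(
            'Cannot write a TransformFn to its current location.')
    shutil.copytree(source, dest)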
Example #2
    def testTransformFnExportAndImportRoundtrip(self):
        transform_fn_dir = os.path.join(self.get_temp_dir(),
                                        'export_transform_fn')
        metadata_dir = os.path.join(self.get_temp_dir(), 'export_metadata')

        with beam.Pipeline() as p:

            def preprocessing_fn(inputs):
                return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

            metadata = self.toMetadata(
                {'x': tf.FixedLenFeature((), tf.float32, 0)})
            columns = p | 'CreateTrainingData' >> beam.Create([{
                'x': v
            } for v in [4, 1, 5, 2]])
            _, transform_fn = (
                (columns, metadata)
                | 'Analyze and Transform' >>
                beam_impl.AnalyzeAndTransformDataset(
                    preprocessing_fn,
                    os.path.join(self.get_temp_dir(), 'no_automaterialize')))

            _ = transform_fn | transform_fn_io.WriteTransformFn(
                transform_fn_dir)
            _ = metadata | beam_metadata_io.WriteMetadata(metadata_dir,
                                                          pipeline=p)

        with beam.Pipeline() as p:
            transform_fn = p | transform_fn_io.ReadTransformFn(transform_fn_dir)
            metadata = p | beam_metadata_io.ReadMetadata(metadata_dir)
            # Apply the transform function read above to some eval data.
            eval_data = p | 'CreateEvalData' >> beam.Create([{
                'x': v
            } for v in [6, 3]])
            transformed_eval_data, _ = (
                ((eval_data, metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())
            expected_transformed_eval_data = [{
                'x_scaled': v
            } for v in [1.25, 0.5]]
            beam_test_util.assert_that(
                transformed_eval_data,
                beam_test_util.equal_to(expected_transformed_eval_data))
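
The expected eval values follow from tft.scale_to_0_1: the min and max are computed over the training data [4, 1, 5, 2] (min=1, max=5) and each value x is mapped to (x - min) / (max - min), so eval values outside the training range can land outside [0, 1]. A quick hand-check of that arithmetic:

# Hand-check of the expected eval outputs, assuming scale_to_0_1 applies
# (x - min) / (max - min) with min/max learned from the training data.
train = [4.0, 1.0, 5.0, 2.0]
lo, hi = min(train), max(train)
print([(v - lo) / (hi - lo) for v in [6, 3]])  # [1.25, 0.5]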
Example #3
def transform_data(train_data_file, test_data_file,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
  """Transform the cleaned data and write out as a TFRecord of Example protos.

  Read in the cleaned data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written.
  """
  raw_data_schema = {
      key: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_COLUMNS
  }
  raw_data_schema.update({
      key: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_COLUMNS
  })
  raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())
  raw_data_schema = dataset_schema.Schema(raw_data_schema)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses this
    # to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # Update outputs of both kinds to convert from shape (batch,), i.e. a batch
    # of scalars, to shape (batch, 1), i.e. a batch of vectors of length 1.
    # This is needed so the output can be easily wrapped in `FeatureColumn`s.
    for key in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS:
      outputs[key] = tft.map(lambda x: tf.expand_dims(x, -1), outputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)
    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as p:
    # Create a coder to read the census data with the schema.  To do this we
    # need to list all columns in order since the schema doesn't specify the
    # order of columns in the csv.
    ordered_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'label'
    ]
    converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

    # Read in raw data and convert using CSV converter.  Note that we apply some
    # Beam transformations here, which will not be encoded in the TF graph since
    # we don't do them from within tf.Transform's methods (AnalyzeDataset,
    # TransformDataset etc.).  These transformations are just to get data into
    # a format that the CSV converter can read, in particular removing empty
    # lines and removing spaces after commas.
    raw_data = (
        p
        | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
        | 'FilterTrainData' >> beam.Filter(lambda line: line)
        | 'FixCommasTrainData' >> beam.Map(lambda line: line.replace(', ', ','))
        | 'DecodeTrainData' >> beam.Map(converter.decode))

    # Combine data and schema into a dataset tuple.  Note that we already used
    # the schema to read the CSV data, but we also need it to interpret
    # raw_data.
    raw_dataset = (raw_data, raw_data_metadata)
    transformed_dataset, transform_fn = (
        raw_dataset | beam_impl.AnalyzeAndTransformDataset(
            preprocessing_fn, output_dir=tempfile.mkdtemp()))
    transformed_data, transformed_metadata = transformed_dataset

    _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
        transformed_train_filebase,
        coder=example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema))

    # Now apply transform function to test data.  In this case we also remove
    # the header line from the CSV file and the trailing period at the end of
    # each line.
    raw_test_data = (
        p
        | 'ReadTestData' >> textio.ReadFromText(test_data_file)
        | 'FilterTestData' >> beam.Filter(
            lambda line: line and line != '|1x3 Cross validator')
        | 'FixCommasTestData' >> beam.Map(lambda line: line.replace(', ', ','))
        | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
        | 'DecodeTestData' >> beam.Map(converter.decode))

    raw_test_dataset = (raw_test_data, raw_data_metadata)

    transformed_test_dataset = (
        (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
    # Don't need transformed data schema, it's the same as before.
    transformed_test_data, _ = transformed_test_dataset

    _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
        transformed_test_filebase,
        coder=example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema))

    _ = (
        transformed_metadata
        | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
            transformed_metadata_dir, pipeline=p))
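
For context, transform_data would be invoked with paths to the raw census CSV files and output locations; the paths below are placeholders, not taken from the source:

# Hypothetical invocation; adjust the paths to wherever the census CSV
# files live and where the transformed outputs should be written.
transform_data(
    train_data_file='/tmp/census/adult.data.csv',
    test_data_file='/tmp/census/adult.test.csv',
    transformed_train_filebase='/tmp/census/transformed/train',
    transformed_test_filebase='/tmp/census/transformed/test',
    transformed_metadata_dir='/tmp/census/transformed_metadata')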