Example #1
def read_and_shuffle_data(
    train_neg_filepattern, train_pos_filepattern, test_neg_filepattern,
    test_pos_filepattern, working_dir):
  """Read and shuffle the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, shuffle it
  and write it out in TFRecord format.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    working_dir: Directory to write shuffled data to
  """
  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'ReadAndShuffleTrain' >> ReadAndShuffleData(
            (train_neg_filepattern, train_pos_filepattern))
        | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
            os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE),
            coder=example_proto_coder.ExampleProtoCoder(
                RAW_DATA_METADATA.schema)))
    _ = (
        pipeline
        | 'ReadAndShuffleTest' >> ReadAndShuffleData(
            (test_neg_filepattern, test_pos_filepattern))
        | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
            os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE),
            coder=example_proto_coder.ExampleProtoCoder(
                RAW_DATA_METADATA.schema)))
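The coder handed to WriteToTFRecord above simply maps between in-memory instance dicts and serialized tf.train.Example protos. As a point of reference, a minimal round-trip outside any Beam pipeline could look like the sketch below; the feature names and values are illustrative, and it assumes a tensorflow_transform version whose ExampleProtoCoder still exposes decode().

import tensorflow as tf
from tensorflow_transform.coders import example_proto_coder
from tensorflow_transform.tf_metadata import schema_utils

# Illustrative schema: one string column and one int64 label.
schema = schema_utils.schema_from_feature_spec({
    'review': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64),
})
coder = example_proto_coder.ExampleProtoCoder(schema)

# encode() turns an instance dict into a serialized tf.train.Example...
serialized = coder.encode({'review': b'great movie', 'label': 1})
# ...and decode() parses it back into an instance dict.
decoded = coder.decode(serialized)
# decoded == {'review': b'great movie', 'label': 1}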
Example #2
  def test_example_proto_coder_bad_default_value(self):
    input_schema = dataset_schema.from_feature_spec({
        'scalar_feature_2': tf.FixedLenFeature(shape=[2], dtype=tf.float32,
                                               default_value=[1.0]),
    })
    with self.assertRaisesRegexp(ValueError,
                                 'got default value with incorrect shape'):
      example_proto_coder.ExampleProtoCoder(input_schema)

    input_schema = dataset_schema.from_feature_spec({
        'scalar_feature_2': tf.FixedLenFeature(shape=[], dtype=tf.float32,
                                               default_value=[0.0]),
    })
    with self.assertRaisesRegexp(ValueError,
                                 'got default value with incorrect shape'):
      example_proto_coder.ExampleProtoCoder(input_schema)

    input_schema = dataset_schema.from_feature_spec({
        '2d_vector_feature':
            tf.FixedLenFeature(
                shape=[2, 3],
                dtype=tf.float32,
                default_value=[[1.0, 1.0], [1.0]]),
    })
    with self.assertRaisesRegexp(ValueError,
                                 'got default value with incorrect shape'):
      example_proto_coder.ExampleProtoCoder(input_schema)
Example #3
def read_and_shuffle_data(train_neg_filepattern, train_pos_filepattern,
                          test_neg_filepattern, test_pos_filepattern,
                          shuffled_train_filebase, shuffled_test_filebase):
    """Read and shuffle the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, shuffle it
  and write it out in TFRecord format.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    shuffled_train_filebase: Base filename for shuffled training data shards
    shuffled_test_filebase: Base filename for shuffled test data shards
  """
    with beam.Pipeline() as pipeline:
        # pylint: disable=no-value-for-parameter
        _ = (pipeline
             | 'ReadAndShuffleTrain' >> ReadAndShuffleData(
                 (train_neg_filepattern, train_pos_filepattern))
             | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                 shuffled_train_filebase,
                 coder=example_proto_coder.ExampleProtoCoder(
                     RAW_DATA_METADATA.schema)))
        _ = (pipeline
             | 'ReadAndShuffleTest' >> ReadAndShuffleData(
                 (test_neg_filepattern, test_pos_filepattern))
             | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                 shuffled_test_filebase,
                 coder=example_proto_coder.ExampleProtoCoder(
                     RAW_DATA_METADATA.schema)))
Example #4
  def test_example_proto_coder_error(self):
    input_schema = dataset_schema.from_feature_spec({
        '2d_vector_feature': tf.FixedLenFeature(shape=[2, 2], dtype=tf.int64),
    })
    coder = example_proto_coder.ExampleProtoCoder(input_schema)

    example_decoded_value = {
        '2d_vector_feature': [1, 2, 3]
    }
    example_proto_text = """
    features {
      feature { key: "1d_vector_feature"
                value { int64_list { value: [ 1, 2, 3 ] } } }
    }
    """
    example = tf.train.Example()
    text_format.Merge(example_proto_text, example)

    # Ensure that we raise an exception for trying to encode invalid data.
    with self.assertRaisesRegexp(ValueError, 'got wrong number of values'):
      _ = coder.encode(example_decoded_value)

    # Ensure that we raise an exception for trying to parse invalid data.
    with self.assertRaisesRegexp(ValueError, 'got wrong number of values'):
      _ = coder.decode(example.SerializeToString())
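For contrast, a proto carrying the expected four values for the [2, 2] feature decodes without error. A hedged continuation of the test above, reusing its coder and module-level imports (the values are illustrative):

    valid_proto_text = """
    features {
      feature { key: "2d_vector_feature"
                value { int64_list { value: [ 1, 2, 3, 4 ] } } }
    }
    """
    valid_example = tf.train.Example()
    text_format.Merge(valid_proto_text, valid_example)
    # Four values match shape [2, 2], so decoding succeeds.
    decoded = coder.decode(valid_example.SerializeToString())
    # decoded['2d_vector_feature'] == [[1, 2], [3, 4]]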
Example #5
  def test_example_proto_coder_default_value(self):
    input_schema = dataset_schema.from_feature_spec({
        'scalar_feature_3':
            tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=1.0),
        'scalar_feature_4':
            tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=0.0),
        '1d_vector_feature':
            tf.FixedLenFeature(
                shape=[1], dtype=tf.float32, default_value=[2.0]),
        '2d_vector_feature':
            tf.FixedLenFeature(
                shape=[2, 2],
                dtype=tf.float32,
                default_value=[[1.0, 2.0], [3.0, 4.0]]),
    })
    coder = example_proto_coder.ExampleProtoCoder(input_schema)

    # Python types.
    example_proto_text = """
    features {
    }
    """
    example = tf.train.Example()
    text_format.Merge(example_proto_text, example)
    data = example.SerializeToString()

    # Assert the data is decoded into the expected format.
    expected_decoded = {
        'scalar_feature_3': 1.0,
        'scalar_feature_4': 0.0,
        '1d_vector_feature': [2.0],
        '2d_vector_feature': [[1.0, 2.0], [3.0, 4.0]],
    }
    decoded = coder.decode(data)
    np.testing.assert_equal(expected_decoded, decoded)
Example #6
 def test_encode_non_serialized(self, feature_spec, ascii_proto, instance,
                                **kwargs):
   schema = schema_utils.schema_from_feature_spec(feature_spec)
   coder = example_proto_coder.ExampleProtoCoder(
       schema, serialized=False, **kwargs)
   proto = _ascii_to_example(ascii_proto)
   np.testing.assert_equal(coder.encode(instance), proto)
Example #7
 def test_decode_error(self, feature_spec, ascii_proto, error_msg,
                       error_type=ValueError, **kwargs):
   schema = dataset_schema.from_feature_spec(feature_spec)
   coder = example_proto_coder.ExampleProtoCoder(schema, **kwargs)
   serialized_proto = _ascii_to_binary(ascii_proto)
   with self.assertRaisesRegexp(error_type, error_msg):
     coder.decode(serialized_proto)
Example #8
 def test_decode_non_serialized(self, feature_spec, ascii_proto, instance,
                                **kwargs):
   schema = dataset_schema.from_feature_spec(feature_spec)
   coder = example_proto_coder.ExampleProtoCoder(
       schema, serialized=False, **kwargs)
   proto = _ascii_to_example(ascii_proto)
   np.testing.assert_equal(coder.decode(proto), instance)
Example #9
    def test_example_proto_coder_bad_default_value(self):
        input_schema = dataset_schema.from_feature_spec({
            'scalar_feature_2':
            tf.FixedLenFeature(shape=[2],
                               dtype=tf.float32,
                               default_value=[1.0, 2.0]),
        })
        with self.assertRaisesRegexp(
                ValueError, 'only scalar default values are supported'):
            example_proto_coder.ExampleProtoCoder(input_schema)

        input_schema = dataset_schema.from_feature_spec({
            'scalar_feature_2':
            tf.FixedLenFeature(shape=[], dtype=tf.float32,
                               default_value=[1.0]),
        })
        with self.assertRaisesRegexp(
                ValueError, 'only scalar default values are supported'):
            example_proto_coder.ExampleProtoCoder(input_schema)
Example #10
 def test_encode_error(self,
                       feature_spec,
                       instance,
                       error_msg,
                       error_type=ValueError,
                       **kwargs):
     schema = schema_utils.schema_from_feature_spec(feature_spec)
     with self.assertRaisesRegexp(error_type, error_msg):
         coder = example_proto_coder.ExampleProtoCoder(schema, **kwargs)
         coder.encode(instance)
Example #11
  def test_example_proto_coder_unicode(self):
    coder = example_proto_coder.ExampleProtoCoder(
        dataset_schema.from_feature_spec({
            'unicode_feature': tf.FixedLenFeature(shape=[], dtype=tf.string)
        }))

    encoded_example = coder.encode({'unicode_feature': u'Hello κόσμε'})
    example = tf.train.Example()
    example.ParseFromString(encoded_example)
    self.assertEqual(
        example.features.feature['unicode_feature'].bytes_list.value[0],
        u'Hello κόσμε'.encode('utf-8'))
Example #12
 def test_example_proto_coder_picklable(self):
     encode_case = _maybe_extend_encode_case_with_ragged(
         _ENCODE_CASES['multiple_columns'])
     schema = schema_utils.schema_from_feature_spec(
         encode_case['feature_spec'])
     coder = example_proto_coder.ExampleProtoCoder(schema)
     ascii_proto = encode_case['ascii_proto']
     instance = encode_case['instance']
     serialized_proto = _ascii_to_binary(ascii_proto)
     for _ in range(2):
         coder = pickle.loads(pickle.dumps(coder))
         self.assertSerializedProtosEqual(coder.encode(instance),
                                          serialized_proto)
Example #13
 def test_example_proto_coder_cache(self):
   """Test that the cache remains valid after reading/writing None."""
   schema = schema_utils.schema_from_feature_spec({
       'varlen': tf.io.VarLenFeature(tf.int64),
   })
   coder = example_proto_coder.ExampleProtoCoder(schema)
   ascii_protos = [
       'features {feature {key: "varlen" value {int64_list {value: [5] }}}}',
       'features {feature {key: "varlen" value {}}}',
       'features {feature {key: "varlen" value {int64_list {value: [6] }}}}',
   ]
   instances = [{'varlen': [5]}, {'varlen': None}, {'varlen': [6]}]
   serialized_protos = map(_ascii_to_binary, ascii_protos)
   for instance, serialized_proto in zip(instances, serialized_protos):
     self.assertSerializedProtosEqual(coder.encode(instance), serialized_proto)
Example #14
def write_tfrecords(data, schema, filename, name):
    """
    Converts input pcollection into a file of tfrecords following schema.

    Args
    ----
      data: pcollection.
      schema: dataset_schema from tensorflow transform.
      filename: str, path prefix for the output TFRecord files.
      name: str to identify operations.
    """
    _ = (data
         | '{} tfrecords write'.format(name) >>
         beam.io.tfrecordio.WriteToTFRecord(
             filename,
             coder=example_proto_coder.ExampleProtoCoder(
                 dataset_schema.Schema(schema))))
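A hypothetical call site for write_tfrecords; the feature name, values, and output path are placeholders, and it assumes the older dataset_schema.ColumnSchema metadata API that several later examples on this page also use.

import apache_beam as beam
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_schema

# Placeholder schema dict; write_tfrecords wraps it in dataset_schema.Schema.
feature_columns = {
    'feature_a': dataset_schema.ColumnSchema(
        tf.float32, [], dataset_schema.FixedColumnRepresentation()),
}

with beam.Pipeline() as pipeline:
    examples = pipeline | 'CreateExamples' >> beam.Create(
        [{'feature_a': 1.0}, {'feature_a': 2.0}])
    write_tfrecords(examples, feature_columns, '/tmp/examples/train', 'train')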
Example #15
def store_transformed_data(data, schema, path, name=''):
    """Stores data from input pipeline into TFRecord in the specified path.

  Args:
    data: `PCollection`, input pipeline.
    schema: `DatasetMetadata` object, describes schema of the input pipeline.
    path: string, where to write output.
    name: string, name describing pipeline to be written.

  Returns:
    PCollection
  """

    p = (data
         | 'WriteData{}'.format(name) >> tfrecordio.WriteToTFRecord(
             path, coder=example_proto_coder.ExampleProtoCoder(schema.schema)))
    return p
Example #16
 def test_example_proto_coder_picklable(self):
   schema = schema_utils.schema_from_feature_spec(_FEATURE_SPEC)
   coder = example_proto_coder.ExampleProtoCoder(schema)
   ascii_proto = """
   features {
     feature { key: "scalar_feature_1" value { int64_list { value: [ 12 ] } } }
     feature { key: "varlen_feature_1"
               value { float_list { value: [ 89.0 ] } } }
     feature { key: "scalar_feature_2" value { int64_list { value: [ 12 ] } } }
     feature { key: "scalar_feature_3"
               value { float_list { value: [ 2.0 ] } } }
     feature { key: "1d_vector_feature"
               value { bytes_list { value: [ 'this is a ,text' ] } } }
     feature { key: "2d_vector_feature"
               value { float_list { value: [ 1.0, 2.0, 3.0, 4.0 ] } } }
     feature { key: "varlen_feature_2"
               value { bytes_list { value: [ 'female' ] } } }
     feature { key: "value" value { float_list { value: [ 12.0, 20.0 ] } } }
     feature { key: "idx" value { int64_list { value: [ 1, 4 ] } } }
     feature { key: "idx0" value { int64_list { value: [ 1, 1 ]} } }
     feature { key: "idx1" value { int64_list { value: [ 3, 7 ]} } }
     feature { key: "2d_val" value { float_list { value: [ 13.0, 23.0 ] } } }
   }
   """
   instance = {
       'scalar_feature_1': 12,
       'scalar_feature_2': 12,
       'scalar_feature_3': 2.0,
       'varlen_feature_1': [89.0],
       '1d_vector_feature': [b'this is a ,text'],
       '2d_vector_feature': [[1.0, 2.0], [3.0, 4.0]],
       'varlen_feature_2': [b'female'],
       'idx': [1, 4],
       'value': [12.0, 20.0],
       'idx0': [1, 1],
       'idx1': [3, 7],
       '2d_val': [13.0, 23.0],
   }
   serialized_proto = _ascii_to_binary(ascii_proto)
   for _ in range(2):
     coder = pickle.loads(pickle.dumps(coder))
     self.assertSerializedProtosEqual(coder.encode(instance), serialized_proto)
Example #17
    def test_example_proto_coder_picklable(self):
        coder = example_proto_coder.ExampleProtoCoder(self._INPUT_SCHEMA)

        example_proto_text = """
    features {
      feature { key: "scalar_feature_1" value { int64_list { value: [ 12 ] } } }
      feature { key: "varlen_feature_1"
                value { float_list { value: [ 89.0 ] } } }
      feature { key: "scalar_feature_2" value { int64_list { value: [ 12 ] } } }
      feature { key: "scalar_feature_3"
                value { float_list { value: [ 2.0 ] } } }
      feature { key: "1d_vector_feature"
                value { bytes_list { value: [ 'this is a ,text' ] } } }
      feature { key: "2d_vector_feature"
                value { float_list { value: [ 1.0, 2.0, 3.0, 4.0 ] } } }
      feature { key: "varlen_feature_2"
                value { bytes_list { value: [ 'female' ] } } }
      feature { key: "value" value { float_list { value: [ 12.0, 20.0 ] } } }
      feature { key: "idx" value { int64_list { value: [ 1, 4 ] } } }
    }
    """
        expected_decoded = {
            'scalar_feature_1': 12,
            'scalar_feature_2': 12,
            'scalar_feature_3': 2.0,
            'varlen_feature_1': [89.0],
            '1d_vector_feature': ['this is a ,text'],
            '2d_vector_feature': [[1.0, 2.0], [3.0, 4.0]],
            'varlen_feature_2': ['female'],
            'sparse_feature': ([1, 4], [12.0, 20.0])
        }

        # Ensure we can pickle right away.
        coder = pickle.loads(pickle.dumps(coder))
        self._assert_encode_decode(coder, example_proto_text, expected_decoded)
        self._assert_decode_encode(coder, example_proto_text, expected_decoded)

        #  And after use.
        coder = pickle.loads(pickle.dumps(coder))
        self._assert_encode_decode(coder, example_proto_text, expected_decoded)
        self._assert_decode_encode(coder, example_proto_text, expected_decoded)
Example #18
def transform_and_write(pcollection, input_metadata, output_dir, transform_fn,
                        file_prefix):
  """Transforms data and writes results to local disc or Cloud Storage bucket.

  Args:
    pcollection: Pipeline data.
    input_metadata: DatasetMetadata object for given input data.
    output_dir: Directory to write transformed output.
    transform_fn: TensorFlow transform function.
    file_prefix: File prefix to add to output file.
  """
  shuffled_data = (pcollection | 'RandomizeData' >> beam.transforms.Reshuffle())
  (transformed_data,
   transformed_metadata) = (((shuffled_data, input_metadata), transform_fn)
                            | 'Transform' >> tft_beam.TransformDataset())
  coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
  (transformed_data
   | 'SerializeExamples' >> beam.Map(coder.encode)
   | 'WriteExamples' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir, file_prefix),
       file_name_suffix=_FILE_NAME_SUFFIX))
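This example serializes explicitly with beam.Map(coder.encode) and then writes raw bytes; many other examples on this page instead pass the coder to WriteToTFRecord. The two spellings are interchangeable, as in the sketch below, which reuses the names from transform_and_write above.

  # Equivalent to the SerializeExamples + WriteExamples steps above:
  # WriteToTFRecord applies coder.encode to each element itself.
  _ = (transformed_data
       | 'WriteExamplesWithCoder' >> beam.io.WriteToTFRecord(
           os.path.join(output_dir, file_prefix),
           coder=coder,
           file_name_suffix=_FILE_NAME_SUFFIX))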
Example #19
def transform_predict(pipeline, predict_data, data_source, output_dir, schema):
  """Transforms prediction input data.

  Args:
    pipeline: Beam Pipeline instance.
    predict_data: Prediction csv data.
    data_source: Input data source - path to CSV file or BigQuery table. Expects
      either `csv` or `bigquery`.
    output_dir: Directory to write transformed output.
    schema: A text-serialized TensorFlow metadata schema for the input data.
  """
  data_schema = utils.make_dataset_schema(
      schema, mode=tf.estimator.ModeKeys.PREDICT)
  coder = example_proto_coder.ExampleProtoCoder(data_schema)

  raw_data = (
      pipeline
      | 'ReadPredictData' >> ReadData(predict_data, data_source, schema,
                                      tf.estimator.ModeKeys.PREDICT))
  (raw_data
   | 'EncodePredictData' >> beam.Map(coder.encode)
   | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir, _PREDICT_PREFIX), file_name_suffix='.tfrecord'))
Example #20
def run(p, params):
    """Defines Beam preprocessing pipeline.

  Performs the following:
    - Reads text files from pattern.
    - Splits text files into train and validation sets.

  Args:
    p: PCollection, initial pipeline.
    params: Object holding a set of parameters as name-value pairs.
  """

    path_pattern = os.path.join(params.input_dir, '*',
                                '*{}'.format(constants.FILE_EXTENSION))
    data = (p
            | 'ListFiles' >> beam.Create(gfile.Glob(path_pattern))
            | 'ReadFiles' >> beam.ParDo(ReadFile())
            | 'SplitData' >> beam.ParDo(
                _SplitData(),
                train_size=params.train_size,
                val_label=_DatasetType.VAL.name).with_outputs(
                    _DatasetType.VAL.name, main=_DatasetType.TRAIN.name))

    schema = dataset_schema.from_feature_spec(
        utils.get_processed_data_schema())
    for dataset in _DatasetType:
        if not dataset.value:
            continue
        _ = (
            data[dataset.name]
            | 'Shuffle{}'.format(dataset.name) >> shuffle()  # pylint: disable=no-value-for-parameter
            |
            'WriteFiles{}'.format(dataset.name) >> tfrecordio.WriteToTFRecord(
                os.path.join(params.output_dir,
                             dataset.name + constants.TFRECORD),
                coder=example_proto_coder.ExampleProtoCoder(schema)))
Example #21
            train_data = (pipeline
                          | "Create train list" >> beam.Create(list_train) |
                          "Read Images - Train" >> beam.ParDo(ReadImageDoFn()))

            transformed_train, transform_fn = (
                (train_data, schema) | "Analyze and Transform - Train" >>
                impl.AnalyzeAndTransformDataset(lambda t: _preprocess_fn(
                    t, new_shape=(known_args.image_dim, known_args.image_dim)))
            )
            transformed_train_data, transformed_train_metadata = transformed_train

            _ = transformed_train_data | 'Write TFrecords - train' >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix=train_tfrecord_path,
                file_name_suffix=".tfrecords",
                num_shards=known_args.n_shards,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_train_metadata.schema))

            # Process evaluation data
            eval_data = (pipeline
                         | "Create eval list" >> beam.Create(list_eval)
                         | "Read Images - Eval" >> beam.ParDo(ReadImageDoFn()))

            transformed_eval = (((eval_data, schema), transform_fn) |
                                "Transform - Eval" >> impl.TransformDataset())

            transformed_eval_data, transformed_eval_metadata = transformed_eval

            _ = transformed_eval_data | 'Write TFrecords - eval' >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix=eval_tfrecord_path,
                file_name_suffix=".tfrecords",
                num_shards=known_args.n_shards,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_eval_metadata.schema))
Example #22
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """

  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))

      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        def remove_character(s, char):
          """Remove a character from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string
            char: A string of length 1

          Returns:
            The string `s` with the given character removed (i.e. replaced by
            '')
          """
          # Hacky implementation where we split and rejoin.
          split = tf.string_split(s, char)
          rejoined = tf.reduce_join(
              tf.sparse_to_dense(
                  split.indices, split.dense_shape, split.values, ''),
              1)
          return rejoined

        def remove_punctuation(s):
          """Remove puncuation from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string

          Returns:
            The string `s` with punctuation removed.
          """
          for char in PUNCTUATION_CHARACTERS:
            s = remove_character(s, char)
          return s

        cleaned_review = tft.map(remove_punctuation, review)
        review_tokens = tft.map(tf.string_split, cleaned_review)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        return {
            REVIEW_COLUMN: review_indices,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              transformed_train_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_test_data
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              transformed_test_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_metadata
          | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
              transformed_metadata_dir, pipeline=pipeline))
Example #23
 def test_encode(self, feature_spec, ascii_proto, instance, **kwargs):
     schema = schema_utils.schema_from_feature_spec(feature_spec)
     coder = example_proto_coder.ExampleProtoCoder(schema, **kwargs)
     serialized_proto = _ascii_to_binary(ascii_proto)
     self.assertSerializedProtosEqual(coder.encode(instance),
                                      serialized_proto)
Example #24
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with tft_beam.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder(schema)
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1))
                decode_transform = beam.Map(csv_coder.decode)
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (pipeline
                            | 'ReadBigQuery' >> beam.io.Read(
                                beam.io.BigQuerySource(query=query,
                                                       use_standard_sql=True)))
                decode_transform = beam.Map(taxi.clean_raw_data_dict,
                                            raw_feature_spec=raw_feature_spec)

            if transform_dir is None:
                decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
                transform_fn = (
                    (decoded_data, raw_data_metadata) |
                    ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

                _ = (transform_fn
                     | ('WriteTransformFn' >>
                        tft_beam.WriteTransformFn(working_dir)))
            else:
                transform_fn = pipeline | tft_beam.ReadTransformFn(
                    transform_dir)

            # Shuffling the data before materialization will improve Training
            # effectiveness downstream. Here we shuffle the raw_data (as opposed to
            # decoded data) since it has a compact representation.
            shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle(
            )

            decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
            (transformed_data, transformed_metadata) = (
                ((decoded_data, raw_data_metadata), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (transformed_data
                 | 'SerializeExamples' >> beam.Map(coder.encode)
                 | 'WriteExamples' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, outfile_prefix),
                     file_name_suffix='.gz'))
Example #25
def run(input_feature_spec,
        labels,
        feature_extraction,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
    """Runs the whole preprocessing step.

  This runs the feature extraction PTransform, validates that the data conforms
  to the schema provided, normalizes the features, and splits the dataset into
  a training and evaluation dataset.
  """

    # Populate optional arguments
    if not feature_scaling:
        feature_scaling = lambda inputs: inputs

    # Type checking
    if not isinstance(labels, list):
        raise ValueError('`labels` must be list(str). '
                         'Given: {} {}'.format(labels, type(labels)))

    if not isinstance(feature_extraction, beam.PTransform):
        raise ValueError('`feature_extraction` must be {}. '
                         'Given: {} {}'.format(beam.PTransform,
                                               feature_extraction,
                                               type(feature_extraction)))

    if not callable(feature_scaling):
        raise ValueError('`feature_scaling` must be callable. '
                         'Given: {} {}'.format(feature_scaling,
                                               type(feature_scaling)))

    if beam_options and not isinstance(beam_options, PipelineOptions):
        raise ValueError('`beam_options` must be {}. '
                         'Given: {} {}'.format(PipelineOptions, beam_options,
                                               type(beam_options)))

    if not work_dir:
        work_dir = tempfile.mkdtemp(prefix='tensorflow-preprocessing')

    tft_temp_dir = os.path.join(work_dir, 'tft-temp')
    train_dataset_dir = os.path.join(work_dir, 'train-dataset')
    eval_dataset_dir = os.path.join(work_dir, 'eval-dataset')

    transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
    #  if tf.gfile.Exists(transform_fn_dir):
    if tf.io.gfile.exists(transform_fn_dir):
        tf.gfile.DeleteRecursively(transform_fn_dir)

    # [START dataflow_molecules_create_pipeline]
    # Build and run a Beam Pipeline
    with beam.Pipeline(options=beam_options) as p, \
         beam_impl.Context(temp_dir=tft_temp_dir):
        # [END dataflow_molecules_create_pipeline]

        # [START dataflow_molecules_feature_extraction]
        # Transform and validate the input data matches the input schema
        dataset = (
            p
            | 'Feature extraction' >> feature_extraction
            # [END dataflow_molecules_feature_extraction]
            # [START dataflow_molecules_validate_inputs]
            | 'Validate inputs' >> beam.ParDo(
                ValidateInputData(input_feature_spec)))
        # [END dataflow_molecules_validate_inputs]

        # [START dataflow_molecules_analyze_and_transform_dataset]
        # Apply the tf.Transform preprocessing_fn
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(input_feature_spec))

        dataset_and_metadata, transform_fn = (
            (dataset, input_metadata)
            | 'Feature scaling' >>
            beam_impl.AnalyzeAndTransformDataset(feature_scaling))
        dataset, metadata = dataset_and_metadata
        # [END dataflow_molecules_analyze_and_transform_dataset]

        # [START dataflow_molecules_split_to_train_and_eval_datasets]
        # Split the dataset into a training set and an evaluation set
        assert 0 < eval_percent < 100, 'eval_percent must be in the range (0-100)'
        train_dataset, eval_dataset = (
            dataset
            | 'Split dataset' >> beam.Partition(
                lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
        # [END dataflow_molecules_split_to_train_and_eval_datasets]

        # [START dataflow_molecules_write_tfrecords]
        # Write the datasets as TFRecords
        coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

        train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
        _ = (train_dataset
             | 'Write train dataset' >> tfrecordio.WriteToTFRecord(
                 train_dataset_prefix, coder))

        eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
        _ = (eval_dataset
             | 'Write eval dataset' >> tfrecordio.WriteToTFRecord(
                 eval_dataset_prefix, coder))

        # Write the transform_fn
        _ = (transform_fn
             |
             'Write transformFn' >> transform_fn_io.WriteTransformFn(work_dir))
        # [END dataflow_molecules_write_tfrecords]

    return PreprocessData(input_feature_spec, labels,
                          train_dataset_prefix + '*',
                          eval_dataset_prefix + '*')
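Once the train and eval shards are written, they would typically be read back for training with tf.data and the transformed feature spec. A hedged sketch; feature_spec is a placeholder the caller would derive from the transform output written above, and the file pattern would be the train_dataset_prefix from run() plus '*'.

import tensorflow as tf

def make_dataset(file_pattern, feature_spec, batch_size=32):
    """Reads TFRecord shards and parses batches with the given feature spec."""
    files = tf.data.Dataset.list_files(file_pattern)
    dataset = tf.data.TFRecordDataset(files)
    dataset = dataset.shuffle(buffer_size=1000).batch(batch_size)
    return dataset.map(
        lambda records: tf.io.parse_example(records, feature_spec))

# e.g. dataset = make_dataset(train_dataset_prefix + '*', feature_spec)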
Example #26
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as
      DATASET.TABLE or path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform
      function will be emitted.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = transform.scale_to_z_score(inputs[key])

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[key] = transform.string_to_int(
                inputs[key],
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[key] = transform.bucketize(inputs[key],
                                               taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[key] = inputs[key]

        # Was this passenger a big tipper?
        def convert_label(label):
            taxi_fare = inputs[taxi.FARE_KEY]
            return tf.where(
                tf.is_nan(taxi_fare),
                tf.cast(tf.zeros_like(taxi_fare), tf.int64),
                # Test if the tip was > 20% of the fare.
                tf.cast(
                    tf.greater(label, tf.multiply(taxi_fare,
                                                  tf.constant(0.2))),
                    tf.int64))

        outputs[taxi.LABEL_KEY] = transform.apply_function(
            convert_label, inputs[taxi.LABEL_KEY])

        return outputs

    raw_feature_spec = taxi.get_raw_feature_spec()
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with beam_impl.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder()
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1)
                            | 'ParseCSV' >> beam.Map(csv_coder.decode))
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (pipeline
                            | 'ReadBigQuery' >> beam.io.Read(
                                beam.io.BigQuerySource(query=query,
                                                       use_standard_sql=True)))

            raw_data |= 'CleanData' >> beam.Map(taxi.clean_raw_data_dict)

            transform_fn = (
                (raw_data, raw_data_metadata)
                | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))

            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))

            # Shuffling the data before materialization will improve Training
            # effectiveness downstream.
            shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle(
            )

            (transformed_data, transformed_metadata) = (
                ((shuffled_data, raw_data_metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (
                transformed_data
                | 'SerializeExamples' >> beam.Map(coder.encode)
                | 'WriteExamples' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, outfile_prefix),
                    compression_type=beam.io.filesystem.CompressionTypes.GZIP))
Example #27
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """
    raw_data_schema = {
        key:
        dataset_schema.ColumnSchema(tf.string, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in CATEGORICAL_COLUMNS
    }
    raw_data_schema.update({
        key:
        dataset_schema.ColumnSchema(tf.float32, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in NUMERIC_COLUMNS
    })
    raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation())
    raw_data_schema = dataset_schema.Schema(raw_data_schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_COLUMNS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_COLUMNS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing empty lines and removing spaces after commas.
            raw_data = (pipeline
                        |
                        'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(', ', ','))
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                transformed_train_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            # Now apply transform function to test data.  In this case we also remove
            # the header line from the CSV file and the trailing period at the end of
            # each line.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> textio.ReadFromText(test_data_file)
                | 'FilterTestData' >> beam.Filter(
                    lambda line: line and line != '|1x3 Cross validator')
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, raw_data_metadata)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                transformed_test_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))
Example #28
    def test_example_proto_coder(self):
        # We use a single coder and invoke multiple encodes and decodes on it to
        # make sure that cache consistency is implemented properly.
        coder = example_proto_coder.ExampleProtoCoder(self._INPUT_SCHEMA)

        # Python types.
        example_proto_text = """
    features {
      feature { key: "scalar_feature_1" value { int64_list { value: [ 12 ] } } }
      feature { key: "varlen_feature_1"
                value { float_list { value: [ 89.0 ] } } }
      feature { key: "scalar_feature_2" value { int64_list { value: [ 12 ] } } }
      feature { key: "scalar_feature_3"
                value { float_list { value: [ 1.0 ] } } }
      feature { key: "1d_vector_feature"
                value { bytes_list { value: [ 'this is a ,text' ] } } }
      feature { key: "varlen_feature_2"
                value { bytes_list { value: [ 'female' ] } } }
      feature { key: "value" value { float_list { value: [ 12.0, 20.0 ] } } }
      feature { key: "idx" value { int64_list { value: [ 1, 4 ] } } }
    }
    """
        expected_decoded = {
            'scalar_feature_1': 12,
            'scalar_feature_2': 12,
            'scalar_feature_3': 1.0,
            'varlen_feature_1': [89.0],
            '1d_vector_feature': ['this is a ,text'],
            'varlen_feature_2': ['female'],
            'sparse_feature': ([12.0, 20.0], [1, 4])
        }
        self._assert_encode_decode(coder, example_proto_text, expected_decoded)
        self._assert_decode_encode(coder, example_proto_text, expected_decoded)

        # Numpy types (with different values from above).
        example_proto_text = """
    features {
      feature { key: "scalar_feature_1" value { int64_list { value: [ 13 ] } } }
      feature { key: "varlen_feature_1" value { float_list { } } }
      feature { key: "scalar_feature_2" value { int64_list { value: [ 14 ] } } }
      feature { key: "scalar_feature_3"
                value { float_list { value: [ 2.0 ] } } }
      feature { key: "1d_vector_feature"
                value { bytes_list { value: [ 'this is another ,text' ] } } }
      feature { key: "varlen_feature_2"
                value { bytes_list { value: [ 'male' ] } } }
      feature { key: "value" value { float_list { value: [ 13.0, 21.0 ] } } }
      feature { key: "idx" value { int64_list { value: [ 2, 5 ] } } }
    }
    """
        expected_decoded = {
            'scalar_feature_1': np.array(13),
            'scalar_feature_2': np.int32(14),
            'scalar_feature_3': np.array(2.0),
            'varlen_feature_1': np.array([]),
            '1d_vector_feature': np.array(['this is another ,text']),
            'varlen_feature_2': np.array(['male']),
            'sparse_feature': (np.array([13.0, 21.0]), np.array([2, 5]))
        }
        self._assert_encode_decode(coder, example_proto_text, expected_decoded)
        self._assert_decode_encode(coder, example_proto_text, expected_decoded)
Example #29
                orders_by_date_train | "Extract timeseries windows - train" >>
                beam.ParDo(ExtractRawTimeseriesWindow(known_args.window_size))
                | "Fusion breaker train" >> beam.Reshuffle())

            ts_windows_schema = _get_feature_spec(known_args.window_size)
            norm_ts_windows_train, transform_fn = (
                (ts_windows_train, ts_windows_schema)
                | "Analyze and Transform - train" >>
                impl.AnalyzeAndTransformDataset(lambda t: _preprocess_fn(
                    t, known_args.window_size, znorm_stats)))
            norm_ts_windows_train_data, norm_ts_windows_train_metadata = norm_ts_windows_train

            _ = norm_ts_windows_train_data | 'Write TFrecords - train' >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix=train_tfrecord_path,
                file_name_suffix=".tfrecords",
                coder=example_proto_coder.ExampleProtoCoder(
                    norm_ts_windows_train_metadata.schema))

            # Process evaluation data
            raw_data_eval = _read_data_from_bq(pipeline, known_args.split_date,
                                               known_args.end_date)

            orders_by_date_eval = (
                raw_data_eval | "Merge SKUs - eval" >> beam.CombineGlobally(
                    GroupItemsByDate(community_area_list,
                                     (split_datetime, end_datetime))))

            ts_windows_eval = (
                orders_by_date_eval | "Extract timeseries windows - eval" >>
                beam.ParDo(ExtractRawTimeseriesWindow(known_args.window_size))
                | "Fusion breaker eval" >> beam.Reshuffle())
Example #30
 def test_decode(self, feature_spec, ascii_proto, instance, **kwargs):
     schema = schema_utils.schema_from_feature_spec(feature_spec)
     coder = example_proto_coder.ExampleProtoCoder(schema, **kwargs)
     serialized_proto = _ascii_to_binary(ascii_proto)
     np.testing.assert_equal(coder.decode(serialized_proto), instance)