Пример #1
0
  def test_schema_equality(self):
    """Check `Schema` equality and inequality.

    Two separately built schemas with the same columns must compare equal;
    a different dtype for one column, or a missing column, must make the
    schemas compare unequal.
    """

    def make_bool_column():
      return sch.ColumnSchema(
          tf.bool, [1], sch.FixedColumnRepresentation(False))

    def make_var_float_column(dtype):
      return sch.ColumnSchema(dtype, None, sch.ListColumnRepresentation())

    # Baseline schema and an independently constructed duplicate of it.
    baseline = sch.Schema(column_schemas={
        'fixed_bool_with_default': make_bool_column(),
        'var_float': make_var_float_column(tf.float32),
    })
    duplicate = sch.Schema(column_schemas={
        'fixed_bool_with_default': make_bool_column(),
        'var_float': make_var_float_column(tf.float32),
    })
    # Same column names, but 'var_float' is float64 instead of float32.
    other_dtype = sch.Schema(column_schemas={
        'fixed_bool_with_default': make_bool_column(),
        'var_float': make_var_float_column(tf.float64),
    })
    # The 'var_float' column is absent.
    missing_column = sch.Schema(column_schemas={
        'fixed_bool_with_default': make_bool_column(),
    })

    self.assertEqual(baseline, duplicate)
    self.assertNotEqual(baseline, other_dtype)
    self.assertNotEqual(baseline, missing_column)
Пример #2
0
    def test_column_representation_equality(self):
        """Check equality semantics of the column representation classes.

        Representations of the same kind built with the same arguments must
        be equal; different arguments or different kinds must be unequal.
        """

        def make_sparse(second_index_flag):
            # Only the flag of the second index field varies between cases.
            return sch.SparseColumnRepresentation('val', [
                sch.SparseIndexField('idx1', False),
                sch.SparseIndexField('idx2', second_index_flag),
            ])

        fixed_a = sch.FixedColumnRepresentation(1.1)
        fixed_b = sch.FixedColumnRepresentation(1.1)
        fixed_no_default = sch.FixedColumnRepresentation()

        list_a = sch.ListColumnRepresentation()
        list_b = sch.ListColumnRepresentation()

        sparse_a = make_sparse(True)
        sparse_b = make_sparse(True)
        sparse_other_flag = make_sparse(False)

        # Fixed representations.
        self.assertEqual(fixed_a, fixed_b)
        self.assertNotEqual(fixed_a, fixed_no_default)
        self.assertNotEqual(fixed_a, list_a)
        self.assertNotEqual(fixed_a, sparse_a)

        # List representations.
        self.assertEqual(list_a, list_b)
        self.assertNotEqual(list_a, sparse_a)

        # Sparse representations.
        self.assertEqual(sparse_a, sparse_b)
        self.assertNotEqual(sparse_a, sparse_other_flag)
Пример #3
0
def _from_feature_dict(feature_dict):
  """Convert one JSON feature dict into a `ColumnSchema`.

  Args:
    feature_dict: A dict with a 'domain' entry, a
      'parsingOptions' -> 'tfOptions' entry, and optionally 'fixedShape' or
      'valueCount'.

  Returns:
    A `sch.ColumnSchema` built from the domain, the axes and the
    representation described by `feature_dict`.

  Raises:
    ValueError: if 'tfOptions' holds neither a 'fixedLenFeature' nor a
      'varLenFeature' entry.
  """
  domain = _from_domain_dict(feature_dict['domain'])

  if 'fixedShape' in feature_dict:
    # int() is needed because protobuf JSON encodes int64 as string
    axes = [
        sch.Axis(int(dimension.get('size')))
        for dimension in feature_dict['fixedShape']['axis']
    ]
  elif 'valueCount' in feature_dict:
    # Value_count always means a 1-D feature of unknown size.
    # We don't support value_count.min and value_count.max yet.
    axes = [sch.Axis(None)]
  else:
    axes = []

  tf_options = feature_dict['parsingOptions']['tfOptions']
  fixed_len_options = tf_options.get('fixedLenFeature')
  if fixed_len_options is not None:
    # The first key present wins, in this order; only the int default is
    # converted, because protobuf JSON encodes int64 as string.
    default_candidates = (
        ('intDefaultValue', int),
        ('stringDefaultValue', None),
        ('floatDefaultValue', None),
    )
    default_value = None
    for key, convert in default_candidates:
      try:
        raw_default = fixed_len_options[key]
      except KeyError:
        continue
      default_value = raw_default if convert is None else convert(raw_default)
      break
    representation = sch.FixedColumnRepresentation(default_value)
  elif tf_options.get('varLenFeature') is not None:
    representation = sch.ListColumnRepresentation()
  else:
    raise ValueError('Could not interpret tfOptions: {}'.format(tf_options))

  return sch.ColumnSchema(domain, axes, representation)
Пример #4
0
  def test_schema_equality(self):
    """Check `Schema` equality and inequality.

    Two separately built schemas with the same columns must compare equal;
    a different dtype for one column, or a missing column, must make the
    schemas compare unequal.
    """

    def make_logical(dtype, axis_size):
      return sch.LogicalColumnSchema(
          sch.dtype_to_domain(dtype),
          sch.LogicalShape([sch.Axis(axis_size)]))

    def make_bool_column():
      return sch.ColumnSchema(
          make_logical(tf.bool, 1), sch.FixedColumnRepresentation(False))

    def make_var_float_column(dtype):
      return sch.ColumnSchema(
          make_logical(dtype, None), sch.ListColumnRepresentation())

    # Baseline schema and an independently constructed duplicate of it.
    baseline = sch.Schema(column_schemas={
        'fixed_bool_with_default': make_bool_column(),
        'var_float': make_var_float_column(tf.float32),
    })
    duplicate = sch.Schema(column_schemas={
        'fixed_bool_with_default': make_bool_column(),
        'var_float': make_var_float_column(tf.float32),
    })
    # Same column names, but 'var_float' is float64 instead of float32.
    other_dtype = sch.Schema(column_schemas={
        'fixed_bool_with_default': make_bool_column(),
        'var_float': make_var_float_column(tf.float64),
    })
    # The 'var_float' column is absent.
    missing_column = sch.Schema(column_schemas={
        'fixed_bool_with_default': make_bool_column(),
    })

    self.assertEqual(baseline, duplicate)
    self.assertNotEqual(baseline, other_dtype)
    self.assertNotEqual(baseline, missing_column)
Пример #5
0
def get_manually_created_schema():
    """Return a test schema assembled by hand from the Schema classes.

    The schema holds one ranged categorical int column, three fixed columns
    of shape [5] and three list columns whose shape is None.
    """
    column_schemas = {
        'fixed_categorical_int_with_range':
        sch.ColumnSchema(sch.IntDomain(tf.int64, -5, 10, True), [],
                         sch.FixedColumnRepresentation()),
    }

    # FixedLenFeatures
    fixed_columns = (
        ('fixed_int', tf.int64),
        ('fixed_float', tf.float32),
        ('fixed_string', tf.string),
    )
    for column_name, dtype in fixed_columns:
        column_schemas[column_name] = sch.ColumnSchema(
            dtype, [5], sch.FixedColumnRepresentation())

    # VarLenFeatures
    var_len_columns = (
        ('var_int', tf.int64),
        ('var_float', tf.float32),
        ('var_string', tf.string),
    )
    for column_name, dtype in var_len_columns:
        column_schemas[column_name] = sch.ColumnSchema(
            dtype, None, sch.ListColumnRepresentation())

    return sch.Schema(column_schemas)
Пример #6
0
  def test_infer_column_schema_from_tensor(self):
    """Check schema inference for a dense tensor and a sparse placeholder."""
    # A dense 2x2 float tensor is expected to yield a fixed column of shape [2].
    dense_tensor = tf.constant(
        [[1., 2.], [3., 4.]], dtype=tf.float32, shape=[2, 2])
    inferred = sch.infer_column_schema_from_tensor(dense_tensor)
    expected = sch.ColumnSchema(
        tf.float32, [2], sch.FixedColumnRepresentation())
    self.assertEqual(expected, inferred)

    # A string sparse placeholder is expected to yield a list column of
    # shape [None].
    sparse_tensor = tf.sparse_placeholder(tf.string)
    inferred = sch.infer_column_schema_from_tensor(sparse_tensor)
    expected = sch.ColumnSchema(
        tf.string, [None], sch.ListColumnRepresentation())
    self.assertEqual(expected, inferred)
Пример #7
0
def _make_transformed_schema(shape):
  """Build a schema with three int64 columns that all use `shape`.

  'transformed_a' and 'transformed_label' get a fixed representation, while
  'transformed_b' gets a list representation.

  Args:
    shape: The shape argument passed to every `ColumnSchema`.

  Returns:
    The populated `sch.Schema`.
  """
  schema = sch.Schema()

  representation_by_column = (
      ('transformed_a', sch.FixedColumnRepresentation),
      ('transformed_b', sch.ListColumnRepresentation),
      ('transformed_label', sch.FixedColumnRepresentation),
  )
  for column_name, make_representation in representation_by_column:
    schema.column_schemas[column_name] = sch.ColumnSchema(
        tf.int64, shape, make_representation())

  return schema
Пример #8
0
    def test_schema_equality(self):
        """Check `Schema` equality and inequality.

        Two separately built schemas with the same columns must compare
        equal; a different dtype for one column, or a missing column, must
        make the schemas compare unequal.
        """

        def make_fixed_int_column():
            return sch.ColumnSchema(tf.int64, [2],
                                    sch.FixedColumnRepresentation())

        def make_var_column(dtype):
            return sch.ColumnSchema(dtype, None,
                                    sch.ListColumnRepresentation())

        # Baseline schema and an independently constructed duplicate of it.
        baseline = sch.Schema(
            column_schemas={
                'fixed_int': make_fixed_int_column(),
                'var_float': make_var_column(tf.float32),
            })
        duplicate = sch.Schema(
            column_schemas={
                'fixed_int': make_fixed_int_column(),
                'var_float': make_var_column(tf.float32),
            })
        # Same column names, but 'var_float' is a string column.
        other_dtype = sch.Schema(
            column_schemas={
                'fixed_int': make_fixed_int_column(),
                'var_float': make_var_column(tf.string),
            })
        # The 'var_float' column is absent.
        missing_column = sch.Schema(
            column_schemas={
                'fixed_int': make_fixed_int_column(),
            })

        self.assertEqual(baseline, duplicate)
        self.assertNotEqual(baseline, other_dtype)
        self.assertNotEqual(baseline, missing_column)
Пример #9
0
  def test_infer_column_schema_from_tensor(self):
    """Check schema inference for a dense tensor and a sparse placeholder."""

    def make_logical(dtype, axis_size):
      return sch.LogicalColumnSchema(
          sch.dtype_to_domain(dtype),
          sch.LogicalShape([sch.Axis(axis_size)]))

    # A dense 2x2 float tensor is expected to yield a fixed column with a
    # single axis of size 2.
    dense_tensor = tf.constant(
        [[1., 2.], [3., 4.]], dtype=tf.float32, shape=[2, 2])
    inferred = sch.infer_column_schema_from_tensor(dense_tensor)
    expected = sch.ColumnSchema(
        make_logical(tf.float32, 2), sch.FixedColumnRepresentation())
    self.assertEqual(expected, inferred)

    # A string sparse placeholder is expected to yield a list column with a
    # single axis of unknown size.
    sparse_tensor = tf.sparse_placeholder(tf.string)
    inferred = sch.infer_column_schema_from_tensor(sparse_tensor)
    expected = sch.ColumnSchema(
        make_logical(tf.string, None), sch.ListColumnRepresentation())
    self.assertEqual(expected, inferred)
Пример #10
0
def _create_output_metadata(features_config, min_value, max_value):
    """Build a custom `DatasetMetadata` for the configured features.

    Args:
        features_config: Features configuration mock; its `TARGET_FEATURE`,
            `ID_FEATURE`, `NUMERIC_FEATURES` and `CATEGORICAL_FEATURES`
            attributes are read.
        min_value: Minimum value for IntDomain.
        max_value: Maximum value for IntDomain.

    Returns:
        A `tft.tf_metadata.dataset_metadata.DatasetMetadata` object.
    """

    def make_scalar_float_column():
        return dataset_schema.ColumnSchema(
            tf.float32, [], dataset_schema.FixedColumnRepresentation())

    column_schemas = {
        features_config.TARGET_FEATURE:
        make_scalar_float_column(),
        features_config.ID_FEATURE:
        dataset_schema.ColumnSchema(tf.int64, [None],
                                    dataset_schema.ListColumnRepresentation()),
    }

    # Each numeric feature gets its own scalar float column schema.
    for feature in features_config.NUMERIC_FEATURES:
        transformed_key = utils.make_transformed_key(feature)
        column_schemas[transformed_key] = make_scalar_float_column()

    # All categorical features share one and the same column schema object.
    categorical_domain = dataset_schema.IntDomain(tf.int64,
                                                  min_value,
                                                  max_value,
                                                  is_categorical=True)
    shared_categorical_column = dataset_schema.ColumnSchema(
        categorical_domain, [], dataset_schema.FixedColumnRepresentation())
    for feature in features_config.CATEGORICAL_FEATURES:
        transformed_key = utils.make_transformed_key(feature)
        column_schemas[transformed_key] = shared_categorical_column

    return dataset_metadata.DatasetMetadata(column_schemas)
Пример #11
0
import unittest
from tensorflow.python.framework import test_util
from tensorflow.python.lib.io import file_io

# Metadata fixture with three columns:
#   * 'fixed_column': string column of shape (1, 3, 2), fixed representation
#     without a default value.
#   * 'fixed_column_with_default': float32 column of shape (1, 3, 2), fixed
#     representation with default value 123.4.
#   * 'list_columm': float32 column of shape (None,), list representation.
# NOTE(review): the key 'list_columm' (sic) is a runtime string; confirm all
# of its users before changing the spelling.
_TEST_METADATA = dataset_metadata.DatasetMetadata({
    'fixed_column':
    dataset_schema.ColumnSchema(tf.string, (1, 3, 2),
                                dataset_schema.FixedColumnRepresentation()),
    'fixed_column_with_default':
    dataset_schema.ColumnSchema(
        tf.float32, (1, 3, 2),
        dataset_schema.FixedColumnRepresentation(123.4)),
    'list_columm':
    dataset_schema.ColumnSchema(tf.float32, (None, ),
                                dataset_schema.ListColumnRepresentation())
})

# Metadata fixture with a string column 'fixed_column' of shape (1, 3, 2), a
# float32 column 'fixed_column_with_default' (default value 123.4) and a
# float32 list column 'list_columm' of shape (None,).
# The middle axis of 'fixed_column_with_default' is `futures.Future('a')`
# rather than a concrete size.
# NOTE(review): presumably the future is resolved to a concrete size later;
# confirm against the code that consumes this fixture.
_TEST_METADATA_WITH_FUTURES = dataset_metadata.DatasetMetadata({
    'fixed_column':
    dataset_schema.ColumnSchema(tf.string, (1, 3, 2),
                                dataset_schema.FixedColumnRepresentation()),
    'fixed_column_with_default':
    dataset_schema.ColumnSchema(
        tf.float32, (1, futures.Future('a'), 2),
        dataset_schema.FixedColumnRepresentation(123.4)),
    'list_columm':
    dataset_schema.ColumnSchema(tf.float32, (None, ),
                                dataset_schema.ListColumnRepresentation())
})
Пример #12
0

def get_test_schema():
    """Return the schema that `sch.from_feature_spec` builds from `test_feature_spec`."""
    schema = sch.from_feature_spec(test_feature_spec)
    return schema


# Column schemas used to build the manually created test schema, keyed by
# column name: one ranged categorical int column with an empty shape, three
# fixed columns of shape [5], and three list columns whose shape is None.
_COLUMN_SCHEMAS = {
    # FixedLenFeatures
    'fixed_categorical_int_with_range':
    sch.ColumnSchema(sch.IntDomain(tf.int64, -5, 10, True), [],
                     sch.FixedColumnRepresentation()),
    'fixed_int':
    sch.ColumnSchema(tf.int64, [5], sch.FixedColumnRepresentation()),
    'fixed_float':
    sch.ColumnSchema(tf.float32, [5], sch.FixedColumnRepresentation()),
    'fixed_string':
    sch.ColumnSchema(tf.string, [5], sch.FixedColumnRepresentation()),
    # VarLenFeatures
    'var_int':
    sch.ColumnSchema(tf.int64, None, sch.ListColumnRepresentation()),
    'var_float':
    sch.ColumnSchema(tf.float32, None, sch.ListColumnRepresentation()),
    'var_string':
    sch.ColumnSchema(tf.string, None, sch.ListColumnRepresentation())
}


def get_manually_created_schema():
    """Return a test schema wrapping the hand-written `_COLUMN_SCHEMAS`."""
    manual_schema = sch.Schema(_COLUMN_SCHEMAS)
    return manual_schema
Пример #13
0
def get_manually_created_schema():
    """Return a test schema assembled by hand from the Schema classes.

    Columns are added to `schema.column_schemas` one by one, in the order
    listed below: fixed columns first, then list columns, then sparse ones.
    """
    fixed = sch.FixedColumnRepresentation
    var_len = sch.ListColumnRepresentation

    def sparse(value_field, index_field, index_flag):
        # Sparse representation with a single index field.
        return sch.SparseColumnRepresentation(
            value_field, [sch.SparseIndexField(index_field, index_flag)])

    schema = sch.Schema()

    # Each entry: (column name, dtype or domain, shape, representation).
    column_specs = (
        # FixedLenFeatures
        ('fixed_bool_with_default', tf.bool, [1],
         fixed(default_value=False)),
        ('fixed_bool_without_default', tf.bool, [5], fixed()),
        ('fixed_int_with_default', tf.int64, [1], fixed(default_value=0)),
        ('fixed_categorical_int_with_range',
         sch.IntDomain(tf.int64, -5, 10, True), [1], fixed(0)),
        ('fixed_categorical_int_with_vocab',
         sch.IntDomain(tf.int64, vocabulary_file='test_filename'), [1],
         fixed(0)),
        ('fixed_int_without_default', tf.int64, [5], fixed()),
        ('fixed_float_with_default', tf.float32, [1],
         fixed(default_value=0.0)),
        ('fixed_float_without_default', tf.float32, [5], fixed()),
        ('fixed_string_with_default', tf.string, [1],
         fixed(default_value='default')),
        ('fixed_string_without_default', tf.string, [5], fixed()),
        ('3d_fixed_int_without_default', tf.int64, [5, 6, 7], fixed()),
        # VarLenFeatures
        ('var_bool', tf.bool, None, var_len()),
        ('var_int', tf.int64, None, var_len()),
        ('var_float', tf.float32, None, var_len()),
        ('var_string', tf.string, None, var_len()),
        # SparseFeatures
        ('sparse_bool', tf.bool, [15],
         sparse('sparse_bool_value', 'sparse_bool_index', True)),
        ('sparse_int', tf.int64, [150],
         sparse('sparse_int_value', 'sparse_int_index', False)),
        ('sparse_float', tf.float32, [1500],
         sparse('sparse_float_value', 'sparse_float_index', False)),
        ('sparse_string', tf.string, [15000],
         sparse('sparse_string_value', 'sparse_string_index', True)),
    )

    for column_name, domain, shape, representation in column_specs:
        schema.column_schemas[column_name] = sch.ColumnSchema(
            domain, shape, representation)

    return schema
Пример #14
0
def get_manually_created_schema():
    """Return a test schema assembled by hand from the Schema classes.

    Columns are added to `schema.column_schemas` one by one, in the order
    listed below: fixed columns first, then list columns, then sparse ones.
    """
    fixed = sch.FixedColumnRepresentation
    var_len = sch.ListColumnRepresentation

    def logical(dtype, axis_sizes):
        # Logical schema with the dtype's domain and one Axis per given size.
        return sch.LogicalColumnSchema(
            sch.dtype_to_domain(dtype),
            sch.LogicalShape([sch.Axis(size) for size in axis_sizes]))

    def sparse(value_field, index_field, index_flag):
        # Sparse representation with a single index field.
        return sch.SparseColumnRepresentation(
            value_field, [sch.SparseIndexField(index_field, index_flag)])

    schema = sch.Schema()

    # Each entry: (column name, dtype, axis sizes, representation).
    column_specs = (
        # FixedLenFeatures
        ('fixed_bool_with_default', tf.bool, (1,), fixed(False)),
        ('fixed_bool_without_default', tf.bool, (5,), fixed()),
        ('fixed_int_with_default', tf.int64, (1,), fixed(0)),
        ('fixed_int_without_default', tf.int64, (5,), fixed()),
        ('fixed_float_with_default', tf.float32, (1,), fixed(0.0)),
        ('fixed_float_without_default', tf.float32, (5,), fixed()),
        ('fixed_string_with_default', tf.string, (1,), fixed('default')),
        ('fixed_string_without_default', tf.string, (5,), fixed()),
        ('3d_fixed_int_without_default', tf.int64, (5, 6, 7), fixed()),
        # VarLenFeatures
        ('var_bool', tf.bool, (None,), var_len()),
        ('var_int', tf.int64, (None,), var_len()),
        ('var_float', tf.float32, (None,), var_len()),
        ('var_string', tf.string, (None,), var_len()),
        # SparseFeatures
        ('sparse_bool', tf.bool, (15,),
         sparse('sparse_bool_value', 'sparse_bool_index', True)),
        ('sparse_int', tf.int64, (150,),
         sparse('sparse_int_value', 'sparse_int_index', False)),
        ('sparse_float', tf.float32, (1500,),
         sparse('sparse_float_value', 'sparse_float_index', False)),
        ('sparse_string', tf.string, (15000,),
         sparse('sparse_string_value', 'sparse_string_index', True)),
    )

    for column_name, dtype, axis_sizes, representation in column_specs:
        schema.column_schemas[column_name] = sch.ColumnSchema(
            logical(dtype, axis_sizes), representation)

    return schema
Пример #15
0
import tensorflow_transform as tft
from tensorflow_transform.beam.tft_beam_io import beam_metadata_io
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import metadata_io

import unittest
from tensorflow.python.framework import test_util
from tensorflow.python.lib.io import file_io

# Metadata fixture with two columns: a string column 'fixed_column' of shape
# (3,) with a fixed representation, and a float32 column 'list_columm' of
# shape (None,) with a list representation.
# NOTE(review): the key 'list_columm' (sic) is a runtime string; confirm all
# of its users before changing the spelling.
_TEST_METADATA_COMPLETE = dataset_metadata.DatasetMetadata({
    'fixed_column': dataset_schema.ColumnSchema(
        tf.string, (3,), dataset_schema.FixedColumnRepresentation()),
    'list_columm': dataset_schema.ColumnSchema(
        tf.float32, (None,), dataset_schema.ListColumnRepresentation())
})

# Metadata fixture with a string column 'fixed_column' of shape (3,) and a
# list column 'list_columm' whose domain is an int64 IntDomain with
# min_value=0 and max_value=0.
_TEST_METADATA = dataset_metadata.DatasetMetadata({
    'fixed_column': dataset_schema.ColumnSchema(
        tf.string, (3,), dataset_schema.FixedColumnRepresentation()),
    # The zero min/max values are placeholders that will be overridden.
    'list_columm': dataset_schema.ColumnSchema(
        dataset_schema.IntDomain(tf.int64, min_value=0, max_value=0),
        (None,), dataset_schema.ListColumnRepresentation())
})


class BeamMetadataIoTest(test_util.TensorFlowTestCase):

  def testReadTransformFn(self):