Example No. 1
 def testInferFeatureSchema(self):
     columns = {
         'a': api._InputColumn(tf.placeholder(tf.float32, (None, )), None),
         'b': api._InputColumn(tf.placeholder(tf.string, (1, 2, 3)), None),
         'c': api._InputColumn(tf.placeholder(tf.int64, None), None)
     }
     schema = impl_helper.infer_feature_schema(columns)
     expected_schema = sch.Schema(
         column_schemas={
             'a':
             sch.ColumnSchema(
                 sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                         sch.LogicalShape([])),
                 sch.FixedColumnRepresentation()),
             'b':
             sch.ColumnSchema(
                 sch.LogicalColumnSchema(
                     sch.dtype_to_domain(tf.string),
                     sch.LogicalShape([sch.Axis(2),
                                       sch.Axis(3)])),
                 sch.FixedColumnRepresentation()),
             'c':
             sch.ColumnSchema(
                 sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                                         sch.LogicalShape(None)),
                 sch.FixedColumnRepresentation())
         })
     self.assertEqual(schema, expected_schema)
Example No. 2
        def preprocessing_fn(inputs):
            sparse_sum = tft.map(lambda x: tf.sparse_reduce_sum(x, axis=1),
                                 inputs['sparse'])
            sparse_copy = tft.map(
                lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
                inputs['sparse'])
            varlen_copy = tft.map(
                lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
                inputs['varlen'])

            sparse_copy.schema = sch.ColumnSchema(
                sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                        sch.LogicalShape([sch.Axis(10)])),
                sch.SparseColumnRepresentation(
                    'val_copy', [sch.SparseIndexField('idx_copy', False)]))

            return {
                'fixed': sparse_sum,  # Schema should be inferred.
                'sparse': inputs['sparse'],  # Schema manually attached above.
                'varlen': inputs['varlen'],  # Schema should be inferred.
                'sparse_copy':
                sparse_copy,  # Schema should propagate from input.
                'varlen_copy':
                varlen_copy  # Schema should propagate from input.
            }
Example No. 3
def _from_sparse_feature_dict(feature_dict):
    """Translate a JSON sparse feature dict into a ColumnSchema."""
    # assume there is only one value column
    value_feature = feature_dict['valueFeature'][0]
    domain = _to_domain(value_feature['domain'])

    index_feature_dicts = feature_dict['indexFeature']

    # int() is needed because protobuf JSON encodes int64 as string
    axes = [
        sch.Axis(int(index_feature_dict['size']))
        for index_feature_dict in index_feature_dicts
    ]
    shape = sch.LogicalShape(axes)

    logical_column = sch.LogicalColumnSchema(domain, shape)

    value_field_name = value_feature['name']
    index_fields = [
        sch.SparseIndexField(index_feature_dict['name'],
                             index_feature_dict['isSorted'])
        for index_feature_dict in index_feature_dicts
    ]

    representation = sch.SparseColumnRepresentation(value_field_name,
                                                    index_fields)

    return sch.ColumnSchema(logical_column, representation)
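A minimal usage sketch for the helper above. The dict below is a hypothetical JSON-decoded sparse feature, shaped only after the keys the function reads ('valueFeature', 'indexFeature', 'size', 'isSorted'); the 'domain' payload is a guess, since _to_domain is defined elsewhere in the source module.

# Hypothetical input, for illustration only; the 'domain' payload is assumed.
sparse_feature_dict = {
    'valueFeature': [{'name': 'scores_value', 'domain': {'floats': {}}}],
    'indexFeature': [{'name': 'scores_index', 'size': '10',
                      'isSorted': False}],
}
column_schema = _from_sparse_feature_dict(sparse_feature_dict)
# Expected: a sparse column with a single axis of size 10, value field
# 'scores_value' and an unsorted index field 'scores_index'.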
Example No. 4
  def test_infer_column_schema_from_tensor(self):
    dense = tf.constant([[1., 2.], [3., 4.]], dtype=tf.float32, shape=[2, 2])
    column_schema = sch.infer_column_schema_from_tensor(dense)
    expected_column_schema = sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                sch.LogicalShape([sch.Axis(2)])),
        sch.FixedColumnRepresentation())
    self.assertEqual(expected_column_schema, column_schema)

    varlen = tf.sparse_placeholder(tf.string)
    column_schema = sch.infer_column_schema_from_tensor(varlen)
    expected_column_schema = sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                                sch.LogicalShape([sch.Axis(None)])),
        sch.ListColumnRepresentation())
    self.assertEqual(expected_column_schema, column_schema)
Example No. 5
  def test_logical_column_schema_equality(self):
    c1 = sch.LogicalColumnSchema(
        sch.dtype_to_domain(tf.int64),
        sch.LogicalShape([sch.Axis(5), sch.Axis(6), sch.Axis(7)]))
    c2 = sch.LogicalColumnSchema(
        sch.dtype_to_domain(tf.int64),
        sch.LogicalShape([sch.Axis(5), sch.Axis(6), sch.Axis(7)]))
    c3 = sch.LogicalColumnSchema(
        sch.dtype_to_domain(tf.int32),
        sch.LogicalShape([sch.Axis(5), sch.Axis(6), sch.Axis(7)]))
    c4 = sch.LogicalColumnSchema(
        sch.dtype_to_domain(tf.int64),
        sch.LogicalShape(None))

    self.assertEqual(c1, c2)
    self.assertNotEqual(c1, c3)
    self.assertNotEqual(c3, c4)
Example No. 6
def _make_transformed_schema():
    schema = sch.Schema()

    schema.column_schemas['transformed_a'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation()))

    schema.column_schemas['transformed_b'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation()))

    schema.column_schemas['transformed_label'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation()))

    return schema
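A brief usage sketch, assuming `dataset_metadata` is imported as in the other examples; wrapping the schema in DatasetMetadata mirrors how Examples No. 10 and 12 build their metadata.

# Assumes `dataset_metadata` is imported as in the other examples.
transformed_metadata = dataset_metadata.DatasetMetadata(
    _make_transformed_schema())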
Example No. 7
  def test_column_schema_equality(self):
    c1 = sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation(False))
    c2 = sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation(False))
    c3 = sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation())
    c4 = sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                sch.LogicalShape([sch.Axis(2)])),
        sch.FixedColumnRepresentation())

    self.assertEqual(c1, c2)
    self.assertNotEqual(c1, c3)
    self.assertNotEqual(c3, c4)
Example No. 8
def _from_feature_dict(feature_dict):
    """Translate a JSON feature dict into a `ColumnSchema`."""
    domain = _to_domain(feature_dict['domain'])

    axes = []
    if 'fixedShape' in feature_dict:
        for axis in feature_dict['fixedShape']['axis']:
            # int() is needed because protobuf JSON encodes int64 as string
            axes.append(sch.Axis(int(axis.get('size'))))
    elif 'valueCount' in feature_dict:
        # Value_count always means a 1-D feature of unknown size.
        # We don't support value_count.min and value_count.max yet.
        axes.append(sch.Axis(None))

    shape = sch.LogicalShape(axes)

    logical_column = sch.LogicalColumnSchema(domain, shape)

    tf_options = feature_dict['parsingOptions']['tfOptions']
    if tf_options.get('fixedLenFeature') is not None:
        fixed_len_feature = tf_options['fixedLenFeature']
        default_value = None
        if 'intDefaultValue' in fixed_len_feature:
            # int() is needed because protobuf JSON encodes int64 as string
            default_value = int(fixed_len_feature['intDefaultValue'])
        elif 'stringDefaultValue' in fixed_len_feature:
            default_value = fixed_len_feature['stringDefaultValue']
        elif 'floatDefaultValue' in fixed_len_feature:
            default_value = fixed_len_feature['floatDefaultValue']
        representation = sch.FixedColumnRepresentation(default_value)
    elif tf_options.get('varLenFeature') is not None:
        representation = sch.ListColumnRepresentation()
    else:
        raise ValueError(
            'Could not interpret tfOptions: {}'.format(tf_options))

    return sch.ColumnSchema(logical_column, representation)
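A matching usage sketch for the dense case. Again the dict is hypothetical and mirrors the keys read above ('domain', 'fixedShape', 'parsingOptions'), and the 'domain' payload is an assumption about what _to_domain accepts.

# Hypothetical input, for illustration only; the 'domain' payload is assumed.
feature_dict = {
    'domain': {'ints': {}},
    'fixedShape': {'axis': [{'size': '1'}]},
    'parsingOptions': {
        'tfOptions': {'fixedLenFeature': {'intDefaultValue': '0'}}},
}
column_schema = _from_feature_dict(feature_dict)
# Expected: a fixed-length int64 column of shape [1] with default value 0.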
Example No. 9
  def test_schema_equality(self):
    schema1 = sch.Schema(column_schemas={
        'fixed_bool_with_default': sch.ColumnSchema(
            sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                    sch.LogicalShape([sch.Axis(1)])),
            sch.FixedColumnRepresentation(False)),
        'var_float': sch.ColumnSchema(
            sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                    sch.LogicalShape([sch.Axis(None)])),
            sch.ListColumnRepresentation())
    })
    schema2 = sch.Schema(column_schemas={
        'fixed_bool_with_default': sch.ColumnSchema(
            sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                    sch.LogicalShape([sch.Axis(1)])),
            sch.FixedColumnRepresentation(False)),
        'var_float': sch.ColumnSchema(
            sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                    sch.LogicalShape([sch.Axis(None)])),
            sch.ListColumnRepresentation())
    })
    schema3 = sch.Schema(column_schemas={
        'fixed_bool_with_default': sch.ColumnSchema(
            sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                    sch.LogicalShape([sch.Axis(1)])),
            sch.FixedColumnRepresentation(False)),
        'var_float': sch.ColumnSchema(
            sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float64),
                                    sch.LogicalShape([sch.Axis(None)])),
            sch.ListColumnRepresentation())
    })
    schema4 = sch.Schema(column_schemas={
        'fixed_bool_with_default': sch.ColumnSchema(
            sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                    sch.LogicalShape([sch.Axis(1)])),
            sch.FixedColumnRepresentation(False))
    })

    self.assertEqual(schema1, schema2)
    self.assertNotEqual(schema1, schema3)
    self.assertNotEqual(schema1, schema4)
Example No. 10
raw_data = [{
    'x': 1,
    'y': 1,
    's': 'hello'
}, {
    'x': 2,
    'y': 2,
    's': 'world'
}, {
    'x': 3,
    'y': 3,
    's': 'hello'
}]

raw_data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.Schema({
        's':
        dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.string),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation()),
        'y':
        dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.float32),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation()),
        'x':
        dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.float32),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation())
    }))
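A minimal sketch of how the metadata above might be paired with the in-memory records and run through tf.Transform. The preprocessing_fn is illustrative only, and the beam, tft, beam_impl and tempfile aliases are assumed to be imported as in Example No. 12.

def preprocessing_fn(inputs):
    # Illustrative transforms, mirroring the ones used elsewhere in these
    # examples: scale the numeric columns and integerize the string column.
    return {
        'x_scaled': tft.scale_to_0_1(inputs['x']),
        'y_scaled': tft.scale_to_0_1(inputs['y']),
        's_integerized': tft.string_to_int(inputs['s']),
    }

with beam.Pipeline() as p:
    # Pair the raw records (as a PCollection) with the metadata built above.
    raw_dataset = (p | beam.Create(raw_data), raw_data_metadata)
    transformed_dataset, transform_fn = (
        raw_dataset | beam_impl.AnalyzeAndTransformDataset(
            preprocessing_fn, output_dir=tempfile.mkdtemp()))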
Example No. 11
def get_manually_created_schema():
    """Provide a test schema built from scratch using the Schema classes."""
    schema = sch.Schema()

    # This verbose construction may be replaced with convenience methods in the future.

    # FixedLenFeatures
    schema.column_schemas['fixed_bool_with_default'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation(False)))

    schema.column_schemas['fixed_bool_without_default'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                sch.LogicalShape([sch.Axis(5)])),
        sch.FixedColumnRepresentation()))

    schema.column_schemas['fixed_int_with_default'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation(0)))

    schema.column_schemas['fixed_int_without_default'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                                sch.LogicalShape([sch.Axis(5)])),
        sch.FixedColumnRepresentation()))

    schema.column_schemas['fixed_float_with_default'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation(0.0)))

    schema.column_schemas['fixed_float_without_default'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                sch.LogicalShape([sch.Axis(5)])),
        sch.FixedColumnRepresentation()))

    schema.column_schemas['fixed_string_with_default'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                                sch.LogicalShape([sch.Axis(1)])),
        sch.FixedColumnRepresentation('default')))

    schema.column_schemas['fixed_string_without_default'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                                sch.LogicalShape([sch.Axis(5)])),
        sch.FixedColumnRepresentation()))

    schema.column_schemas['3d_fixed_int_without_default'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(
            sch.dtype_to_domain(tf.int64),
            sch.LogicalShape([sch.Axis(5),
                              sch.Axis(6),
                              sch.Axis(7)])), sch.FixedColumnRepresentation()))

    # VarLenFeatures
    schema.column_schemas['var_bool'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                sch.LogicalShape([sch.Axis(None)])),
        sch.ListColumnRepresentation()))

    schema.column_schemas['var_int'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                                sch.LogicalShape([sch.Axis(None)])),
        sch.ListColumnRepresentation()))

    schema.column_schemas['var_float'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                sch.LogicalShape([sch.Axis(None)])),
        sch.ListColumnRepresentation()))

    schema.column_schemas['var_string'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                                sch.LogicalShape([sch.Axis(None)])),
        sch.ListColumnRepresentation()))

    # SparseFeatures
    schema.column_schemas['sparse_bool'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                sch.LogicalShape([sch.Axis(15)])),
        sch.SparseColumnRepresentation(
            'sparse_bool_value',
            [sch.SparseIndexField('sparse_bool_index', True)])))

    schema.column_schemas['sparse_int'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                                sch.LogicalShape([sch.Axis(150)])),
        sch.SparseColumnRepresentation(
            'sparse_int_value',
            [sch.SparseIndexField('sparse_int_index', False)])))

    schema.column_schemas['sparse_float'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                sch.LogicalShape([sch.Axis(1500)])),
        sch.SparseColumnRepresentation(
            'sparse_float_value',
            [sch.SparseIndexField('sparse_float_index', False)])))

    schema.column_schemas['sparse_string'] = (sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                                sch.LogicalShape([sch.Axis(15000)])),
        sch.SparseColumnRepresentation(
            'sparse_string_value',
            [sch.SparseIndexField('sparse_string_index', True)])))

    return schema
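A short usage sketch, assuming `sch` and `tf` are imported as in the surrounding examples: the manually built schema can be spot-checked against the same classes used above.

# Spot-check one column of the manually built schema.
schema = get_manually_created_schema()
assert schema.column_schemas['var_int'] == sch.ColumnSchema(
    sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                            sch.LogicalShape([sch.Axis(None)])),
    sch.ListColumnRepresentation())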
Example No. 12
def transform_data(train_data_file, eval_data_file,
                   transformed_train_data_base, transformed_eval_data_base,
                   transformed_metadata_dir):
    """Transform the cleaned data and write out as a TFRecord of Example protos.

  Read in the cleaned data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    eval_data_file: File containing evaluation data
    transformed_train_data_base: Base filename for transformed training data
        shards
    transformed_eval_data_base: Base filename for transformed evaluation data
        shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written.
  """
    raw_data_schema = {
        key: dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.string),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation())
        for key in CATEGORICAL_COLUMNS
    }
    raw_data_schema.update({
        key: dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.float32),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation())
        for key in NUMERIC_COLUMNS
    })
    raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
        dataset_schema.LogicalColumnSchema(dataset_schema.Domain(tf.string),
                                           dataset_schema.LogicalShape([])),
        dataset_schema.FixedColumnRepresentation())
    raw_data_schema = dataset_schema.Schema(raw_data_schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_COLUMNS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_COLUMNS:
            outputs[key] = tft.string_to_int(inputs[key])

        # Update outputs of both kinds to convert from shape (batch,), i.e. a batch
        # of scalars, to shape (batch, 1), i.e. a batch of vectors of length 1.
        # This is needed so the output can be easily wrapped in `FeatureColumn`s.
        for key in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS:
            outputs[key] = tft.map(lambda x: tf.expand_dims(x, -1),
                                   outputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as p:
        # Create a coder to read the census data with the schema.  To do this we
        # need to list all columns in order since the schema doesn't specify the
        # order of columns in the csv.
        ordered_columns = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
            'label'
        ]
        converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

        # Read in raw data and convert using CSV converter.  Note that we apply some
        # Beam transformations here, which will not be encoded in the TF graph since
        # we don't do them from within tf.Transform's methods (AnalyzeDataset,
        # TransformDataset etc.).  These transformations are just to get data into
        # a format that the CSV converter can read, in particular removing empty
        # lines and removing spaces after commas.
        raw_data = (p
                    | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
                    | 'FilterTrainData' >> beam.Filter(lambda line: line)
                    | 'FixCommasTrainData' >>
                    beam.Map(lambda line: line.replace(', ', ','))
                    | 'DecodeTrainData' >> beam.Map(converter.decode))

        # Combine data and schema into a dataset tuple.  Note that we already used
        # the schema to read the CSV data, but we also need it to interpret
        # raw_data.
        raw_dataset = (raw_data, raw_data_metadata)
        transformed_dataset, transform_fn = (
            raw_dataset | beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn, output_dir=os.path.join(tempfile.mkdtemp())))
        transformed_data, transformed_metadata = transformed_dataset

        _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
            transformed_train_data_base,
            coder=example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema))

        # Now apply transform function to eval data.  In this case we also remove
        # the header line from the CSV file and the trailing period at the end of
        # each line.
        raw_eval_data = (
            p
            | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
            | 'FilterEvalData' >>
            beam.Filter(lambda line: line and line != '|1x3 Cross validator')
            | 'FixCommasEvalData' >>
            beam.Map(lambda line: line.replace(', ', ','))
            |
            'RemoveTrailingPeriodsEvalData' >> beam.Map(lambda line: line[:-1])
            | 'DecodeEvalData' >> beam.Map(converter.decode))

        raw_eval_dataset = (raw_eval_data, raw_data_metadata)

        transformed_eval_dataset = ((raw_eval_dataset, transform_fn)
                                    | beam_impl.TransformDataset())
        # We don't need the transformed data schema; it's the same as before.
        transformed_eval_data, _ = transformed_eval_dataset

        _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
            transformed_eval_data_base,
            coder=example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema))

        _ = (transformed_metadata
             | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                 transformed_metadata_dir, pipeline=p))
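A hypothetical invocation sketch; the file names and output paths below are placeholders chosen for illustration, not values from the source.

# Hypothetical paths, for illustration only.
transform_data(
    train_data_file='adult.data',
    eval_data_file='adult.test',
    transformed_train_data_base='/tmp/census_transformed/train',
    transformed_eval_data_base='/tmp/census_transformed/eval',
    transformed_metadata_dir='/tmp/census_transformed/metadata')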