Пример #1
0
    def __serializedToRows(self, serializedThing):
        """Parse one serialized example into a dense ``(feature, label)`` pair.

        Both entries are declared as two-index ``tf.SparseFeature`` specs whose
        sizes are read from ``self.meta``.  After parsing, each sparse tensor
        is densified and reshaped to a 1-D tensor whose length is the first
        entry of the matching dense shape.

        Args:
            serializedThing: serialized example handed to
                ``tf.parse_single_example``.

        Returns:
            Tuple ``(feature, label)``; the feature is cast to ``tf.float32``
            and the label to ``tf.int8``.
        """
        feature_shape = self.meta['feature_dense_shape']
        label_shape = self.meta['label_dense_shape']

        feature_spec = tf.SparseFeature(
            index_key=['feature_indices_0', 'feature_indices_1'],
            value_key='feature_values',
            dtype=tf.int64,
            size=feature_shape)
        label_spec = tf.SparseFeature(
            index_key=['label_indices_0', 'label_indices_1'],
            value_key='label_values',
            dtype=tf.int64,
            size=label_shape)

        parsed = tf.parse_single_example(
            serializedThing,
            features={'feature': feature_spec, 'label': label_spec})

        # Densify and flatten each tensor (feature first, then label).
        dense_feature = tf.reshape(
            tf.sparse_tensor_to_dense(parsed['feature']), [feature_shape[0]])
        dense_label = tf.reshape(
            tf.sparse_tensor_to_dense(parsed['label']), [label_shape[0]])

        # The values were parsed as int64; convert to the returned dtypes.
        feature = tf.cast(dense_feature, dtype=tf.float32)
        label = tf.cast(dense_label, dtype=tf.int8)
        return feature, label
Пример #2
0
    def testMalformedSparseFeatures(self):
        """make_feed_dict raises ValueError for malformed sparse instances."""
        placeholders = {'a': tf.sparse_placeholder(tf.int64)}

        def build_schema():
            # Sparse feature 'a' with index key 'idx', value key 'val', size 10.
            return self.toSchema(
                {'a': tf.SparseFeature('idx', 'val', tf.float32, 10)})

        # Indices outside the declared size: one negative, one too large.
        schema = build_schema()
        for bad_indices in ([-1, 2], [11, 1]):
            instances = [{'a': (bad_indices, [1.0, 2.0])}]
            with self.assertRaisesRegexp(ValueError,
                                         'has index .* out of range'):
                impl_helper.make_feed_dict(placeholders, schema, instances)

        # Index list and value list whose lengths disagree.
        schema = build_schema()
        instances = [{'a': ([1, 2], [1])}]
        with self.assertRaisesRegexp(
                ValueError, 'indices and values of different lengths'):
            impl_helper.make_feed_dict(placeholders, schema, instances)

        # A 3-tuple where an (indices, values) pair is expected.
        instances = [{'a': ([1], [2], [3])}]
        with self.assertRaisesRegexp(ValueError, 'too many values to unpack'):
            impl_helper.make_feed_dict(placeholders, schema, instances)
Пример #3
0
  def testRunTransformFn(self):
    """Check dtypes and shapes of the columns from run_preprocessing_fn.

    The schema mixes fixed-length, variable-length and sparse features; the
    preprocessing function scales one dense input and reshapes the sparse one.
    Every input and output column is compared against an expected
    (dtype, shape) table.
    """
    schema = self.toSchema({
        'dense_1': tf.FixedLenFeature((), tf.float32),
        'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
        'var_len': tf.VarLenFeature(tf.string),
        'sparse': tf.SparseFeature('ix', 'val', tf.float32, 100)
    })
    def preprocessing_fn(inputs):
      return {
          'dense_out': mappers.scale_to_0_1(inputs['dense_1']),
          'sparse_out': api.map(lambda x: tf.sparse_reshape(x, (1, 10)),
                                inputs['sparse'])
      }

    inputs, outputs = impl_helper.run_preprocessing_fn(
        preprocessing_fn, schema)

    # Verify that the input placeholders have the correct types.
    expected_dtype_and_shape = {
        'dense_1': (tf.float32, tf.TensorShape([None])),
        'dense_2': (tf.int64, tf.TensorShape([None, 1, 2])),
        'var_len': (tf.string, tf.TensorShape(None)),
        'sparse': (tf.float32, tf.TensorShape(None)),
        'dense_out': (tf.float32, tf.TensorShape([None])),
        'sparse_out': (tf.float32, tf.TensorShape([None, None])),
    }

    # dict.items() returns a view on Python 3, and views cannot be joined
    # with `+`; materialise both as lists so this runs on Python 2 and 3.
    all_columns = list(inputs.items()) + list(outputs.items())
    for key, column in all_columns:
      dtype, shape = expected_dtype_and_shape[key]
      self.assertEqual(column.tensor.dtype, dtype)
      self.assertShapesEqual(column.tensor.get_shape(), shape)
Пример #4
0
  def testRunPreprocessingFn(self):
    """Check dtypes and shapes of tensors produced by run_preprocessing_fn."""
    feature_spec = {
        'dense_1': tf.FixedLenFeature((), tf.float32),
        'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
        'var_len': tf.VarLenFeature(tf.string),
        'sparse': tf.SparseFeature('ix', 'val', tf.float32, 100)
    }
    schema = self.toSchema(feature_spec)

    def preprocessing_fn(inputs):
      scaled = mappers.scale_to_0_1(inputs['dense_1'])
      reshaped = tf.sparse_reshape(inputs['sparse'], (1, 10))
      return {'dense_out': scaled, 'sparse_out': reshaped}

    # The first returned element is not needed by this test.
    _, inputs, outputs = impl_helper.run_preprocessing_fn(
        preprocessing_fn, schema)

    # Expected (dtype, shape) for every input and output tensor.
    unknown_1d = tf.TensorShape([None])
    unknown_2d = tf.TensorShape([None, None])
    expected_dtype_and_shape = {
        'dense_1': (tf.float32, unknown_1d),
        'dense_2': (tf.int64, tf.TensorShape([None, 1, 2])),
        'var_len': (tf.string, unknown_2d),
        'sparse': (tf.float32, unknown_2d),
        'dense_out': (tf.float32, unknown_1d),
        'sparse_out': (tf.float32, unknown_2d),
    }

    # Inputs are checked first, then outputs.
    for tensors in (inputs, outputs):
      for key, tensor in six.iteritems(tensors):
        expected_dtype, expected_shape = expected_dtype_and_shape[key]
        self.assertEqual(tensor.dtype, expected_dtype)
        tensor.get_shape().assert_is_compatible_with(expected_shape)
    def test_example_with_feature_spec_decoder(self):
        """Decode ``self.example_str`` to JSON using an explicit feature spec."""
        int_scalar = tf.FixedLenFeature(shape=[], dtype=tf.int64)
        feature_spec = {
            "scalar_feature_1": int_scalar,
            "scalar_feature_2": tf.FixedLenFeature(shape=[], dtype=tf.int64),
            "scalar_feature_3": tf.FixedLenFeature(shape=[],
                                                   dtype=tf.float32),
            "varlen_feature_1": tf.VarLenFeature(dtype=tf.float32),
            "varlen_feature_2": tf.VarLenFeature(dtype=tf.string),
            "1d_vector_feature": tf.FixedLenFeature(shape=[1],
                                                    dtype=tf.string),
            "2d_vector_feature": tf.FixedLenFeature(shape=[2, 2],
                                                    dtype=tf.float32),
            "sparse_feature": tf.SparseFeature("sparse_feature_idx",
                                               "sparse_feature_value",
                                               tf.float32, 10),
        }

        decoder = ExampleWithFeatureSpecDecoder(feature_spec)
        json_text = decoder.to_json(self.example_str)
        actual_json = json.loads(json_text)

        # The sparse feature is expected under its index and value keys.
        expected_decoded = {
            "scalar_feature_1": 12,
            "scalar_feature_2": 12,
            "scalar_feature_3": 1.0,
            "varlen_feature_1": [89.0],
            "1d_vector_feature": ["this is a ,text"],
            "2d_vector_feature": [[1.0, 2.0], [3.0, 4.0]],
            "varlen_feature_2": ["female"],
            "sparse_feature_idx": [1, 4],
            "sparse_feature_value": [12.0, 20.0],
        }
        self.assertEqual(actual_json, expected_decoded)
Пример #6
0
def inputs(file_pattern):
    """Build a shuffled, batched TFRecord input pipeline and parse it.

    Files matching ``file_pattern`` under ``FLAGS.dataset_dir`` are read in
    chunks of up to 10240 records, shuffled into batches of
    ``FLAGS.batch_size`` and parsed with ``tf.parse_example``.

    Args:
        file_pattern: glob pattern, joined onto ``FLAGS.dataset_dir``.

    Returns:
        Tuple ``(label, feature, neig_id, neig_value)`` taken from the parsed
        batch.  ``label`` is a fixed-length float32 feature of size
        ``FLAGS.label_size``, ``feature`` is a sparse feature of size
        ``FLAGS.fea_size``, and the two ``neig_*`` entries are
        variable-length features.

    Raises:
        ValueError: if no file matches the pattern.
    """
    pattern = os.path.join(FLAGS.dataset_dir, file_pattern)
    print(pattern)
    files = tf.gfile.Glob(pattern)
    if not files:
        # Fail here with the offending pattern instead of letting the queue
        # producer reject the empty list with a less specific message.
        raise ValueError('No input files match pattern: %s' % pattern)

    capacity = 10000 + 10000 * FLAGS.batch_size

    with tf.device('/cpu:0'):
        with tf.name_scope('input'):
            filename_queue = tf.train.string_input_producer(files)
            reader = tf.TFRecordReader()
            # Only the serialized records are used; the keys are discarded.
            _, value = reader.read_up_to(filename_queue, 10240)

            # enqueue_many=True: `value` already holds many records.
            record = tf.train.shuffle_batch(
                [value],
                batch_size=FLAGS.batch_size,
                num_threads=2,
                min_after_dequeue=FLAGS.batch_size * 1000,
                capacity=capacity,
                enqueue_many=True)

            parsed = tf.parse_example(
                record,
                features={
                    'label': tf.FixedLenFeature([FLAGS.label_size],
                                                dtype=tf.float32),
                    "feature": tf.SparseFeature(index_key="fea_id",
                                                value_key="fea_value",
                                                dtype=tf.float32,
                                                size=FLAGS.fea_size),
                    'neig_id': tf.VarLenFeature(dtype=tf.int64),
                    'neig_value': tf.VarLenFeature(dtype=tf.float32),
                })

    return (parsed['label'], parsed['feature'], parsed['neig_id'],
            parsed['neig_value'])
Пример #7
0
    def testMakeOutputDict(self):
        """to_instance_dicts splits batched fetches into per-instance dicts."""
        feature_spec = {
            'a': tf.FixedLenFeature(None, tf.int64),
            'b': tf.FixedLenFeature([], tf.float32),
            'c': tf.FixedLenFeature([1], tf.float32),
            'd': tf.FixedLenFeature([2, 2], tf.float32),
            'e': tf.VarLenFeature(tf.string),
            'f': tf.SparseFeature('idx', 'val', tf.float32, 10)
        }
        schema = self.toSchema(feature_spec)

        # A batch of two instances for every feature.
        var_len_batch = tf.SparseTensorValue(
            indices=np.array([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1),
                              (1, 2)]),
            values=np.array(['doe', 'a', 'deer', 'a', 'female', 'deer']),
            dense_shape=(2, 3))
        sparse_batch = tf.SparseTensorValue(
            indices=np.array([(0, 2), (0, 4), (0, 8), (1, 4), (1, 8)]),
            values=np.array([10.0, 20.0, 30.0, 40.0, 50.0]),
            dense_shape=(2, 20))
        fetches = {
            'a': np.array([100, 200]),
            'b': np.array([10.0, 20.0]),
            'c': np.array([[40.0], [80.0]]),
            'd': np.array([[[1.0, 2.0], [3.0, 4.0]],
                           [[5.0, 6.0], [7.0, 8.0]]]),
            'e': var_len_batch,
            'f': sparse_batch,
        }

        instance_dicts = impl_helper.to_instance_dicts(schema, fetches)
        self.assertEqual(2, len(instance_dicts))
        self.assertSetEqual(set(six.iterkeys(instance_dicts[0])),
                            set(['a', 'b', 'c', 'd', 'e', 'f']))

        # Expected per-instance values; 'f' is an (indices, values) pair.
        expected_instances = [
            {
                'a': 100,
                'b': 10.0,
                'c': [40.0],
                'd': [[1.0, 2.0], [3.0, 4.0]],
                'e': ['doe', 'a', 'deer'],
                'f': ([2, 4, 8], [10.0, 20.0, 30.0]),
            },
            {
                'a': 200,
                'b': 20.0,
                'c': [80.0],
                'd': [[5.0, 6.0], [7.0, 8.0]],
                'e': ['a', 'female', 'deer'],
                'f': ([4, 8], [40.0, 50.0]),
            },
        ]
        for actual, expected in zip(instance_dicts, expected_instances):
            for key in ('a', 'b', 'c', 'd', 'e'):
                self.assertAllEqual(actual[key], expected[key])
            expected_indices, expected_values = expected['f']
            self.assertEqual(len(actual['f']), 2)
            self.assertAllEqual(actual['f'][0], expected_indices)
            self.assertAllEqual(actual['f'][1], expected_values)
def parse_simple_example(serialized_batch_example, input_dim):
    """Parse a batch of serialized examples into deep, label and wide parts.

    Args:
        serialized_batch_example: batch of serialized examples handed to
            ``tf.parse_example``.
        input_dim: size used for both sparse features.

    Returns:
        Tuple ``(deep, label, wide)``.  ``deep`` and ``wide`` come from
        ``tf.SparseFeature`` specs; ``label`` is a scalar float32
        ``tf.FixedLenFeature``.
    """
    label_spec = tf.FixedLenFeature([], tf.float32)
    deep_spec = tf.SparseFeature(index_key='col_index',
                                 value_key='col_value',
                                 dtype=tf.float32,
                                 size=input_dim)
    wide_spec = tf.SparseFeature(index_key='bias_index',
                                 value_key='bias_value',
                                 dtype=tf.float32,
                                 size=input_dim)
    feature_spec = {'label': label_spec, 'deep': deep_spec, 'wide': wide_spec}

    parsed = tf.parse_example(serialized_batch_example, features=feature_spec)
    return parsed['deep'], parsed['label'], parsed['wide']
Пример #9
0
    def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(
            self):
        """Parse a batch mixing VarLen, SparseFeature and dense features.

        Dense feature "c" has no default and is present in every example;
        "a" and "b" are absent from the examples and rely on their defaults.
        """
        # "st_a" never appears in the examples, so it parses to an empty
        # sparse tensor: (indices, values, shape).
        st_a_indices = np.empty((0, 2), dtype=np.int64)
        st_a_values = np.empty((0, ), dtype=np.int64)  # sp_a is DT_INT64
        st_a_shape = np.array([2, 0], dtype=np.int64)  # batch == 2, no elems
        expected_st_a = (st_a_indices, st_a_values, st_a_shape)

        # "sp" combines "idx" and "val": (indices, values, shape).
        sp_indices = np.array([[0, 0], [0, 3], [1, 7]], dtype=np.int64)
        sp_values = np.array(["a", "b", "c"], dtype="|S")
        sp_shape = np.array([2, 13], dtype=np.int64)  # batch == 2, size 13
        expected_sp = (sp_indices, sp_values, sp_shape)

        first_example = example(features=features({
            "c": float_feature([3, 4]),
            "val": bytes_feature([b"a", b"b"]),
            "idx": int64_feature([0, 3])
        }))
        second_example = example(features=features({
            "c": float_feature([1, 2]),
            "val": bytes_feature([b"c"]),
            "idx": int64_feature([7])
        }))
        original = [first_example, second_example]

        names = ["in1", "in2"]
        serialized = [proto.SerializeToString() for proto in original]

        a_default = [1, 2, 3]
        b_default = np.random.rand(3, 3).astype(bytes)
        expected_output = {
            "st_a": expected_st_a,
            "sp": expected_sp,
            "a": np.array(2 * [[a_default]]),
            "b": np.array(2 * [b_default]),
            "c": np.array([[3, 4], [1, 2]], dtype=np.float32),
        }

        feature_spec = {
            "st_a": tf.VarLenFeature(tf.int64),
            "sp": tf.SparseFeature("idx", "val", tf.string, 13),
            "a": tf.FixedLenFeature((1, 3), tf.int64,
                                    default_value=a_default),
            "b": tf.FixedLenFeature((3, 3), tf.string,
                                    default_value=b_default),
            # Feature "c" must be provided, since it has no default_value.
            "c": tf.FixedLenFeature((2, ), tf.float32),
        }
        parse_kwargs = {
            "example_names": names,
            "serialized": tf.convert_to_tensor(serialized),
            "features": feature_spec,
        }
        self._test(parse_kwargs, expected_output)
Пример #10
0
 def as_feature_spec(self, column):
     """Return the ``tf.SparseFeature`` spec for a 1-d sparse column.

     Args:
         column: object exposing ``axes`` and ``domain.dtype``.

     Returns:
         A ``tf.SparseFeature`` built from the single index field, the value
         field name, the column dtype and the size of the column's only axis.

     Raises:
         ValueError: unless there is exactly one index field and exactly
             one axis.
     """
     index_fields = self.index_fields
     is_one_dimensional = len(index_fields) == 1 and len(column.axes) == 1
     if not is_one_dimensional:
         raise ValueError(
             'tf.Example parser supports only 1-d sparse features.')

     index_field = index_fields[0]
     return tf.SparseFeature(index_key=index_field.name,
                             value_key=self._value_field_name,
                             dtype=column.domain.dtype,
                             size=column.axes[0].size,
                             already_sorted=index_field.is_sorted)
Пример #11
0
    def make_input_fn(self, file_paths, epochs=None):
        """
        Read batches from GZIP-compressed TFRecord files and return dense
        feature and cluster tensors.

        Parameters
        ----------
        file_paths : list
            TFRecord files to read; passed as ``file_pattern`` to
            ``tf.contrib.learn.read_batch_features``.
        epochs : int
            Number of times to read through the dataset.
            If None, cycles through the dataset forever.
            NOTE - If specified, creates a variable that must be initialized,
            so call tf.local_variables_initializer() and run the op in a session.
            Default is None.

        Returns
        -------
        features : Tensor
            Dense tensor reshaped to ``(self.batch_size, self.genes_no)``.
        cluster : Tensor
            The 'cluster_int' values of the batch, converted to int32 and
            squeezed.
        """
        # 'scg' is rebuilt from the 'indices' and 'values' keys of each record.
        scg_spec = tf.SparseFeature(index_key='indices',
                                    value_key='values',
                                    dtype=tf.float32,
                                    size=self.genes_no)
        feature_map = {
            'scg': scg_spec,
            'cluster_int': tf.FixedLenFeature(1, tf.int64),
        }

        gzip_options = tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.GZIP)

        def make_reader():
            return tf.TFRecordReader(options=gzip_options)

        batch = tf.contrib.learn.read_batch_features(
            file_pattern=file_paths,
            batch_size=self.batch_size,
            features=feature_map,
            reader=make_reader,
            num_epochs=epochs)

        batch_shape = (self.batch_size, self.genes_no)

        # Fix the sparse shape to the batch shape before densifying.
        sparse_scg = tf.sparse_reshape(batch['scg'], batch_shape)
        dense_scg = tf.sparse_tensor_to_dense(sparse_scg)

        cluster = tf.squeeze(tf.to_int32(batch['cluster_int']))

        features = tf.reshape(dense_scg, batch_shape)

        return features, cluster
Пример #12
0
def tfrecord_schema(original_dim=202498):
    """Return the feature spec used to parse the TFRecord examples.

    Args:
        original_dim: size of the single sparse dimension.

    Returns:
        Dict with a 'sparse' ``tf.SparseFeature`` (int64 'counts' indexed by
        'token_ids') and a scalar string 'volid' that defaults to "".
    """
    schema = {}
    schema['sparse'] = tf.SparseFeature(index_key=['token_ids'],
                                        value_key='counts',
                                        dtype=tf.int64,
                                        size=[original_dim])
    schema["volid"] = tf.FixedLenFeature((), tf.string, default_value="")
    # A 'page_seq' scalar string feature is currently disabled:
    # 'page_seq': tf.FixedLenFeature((), tf.string, default_value="")
    return schema
Пример #13
0
def parse_field_example(serialized_batch_example, input_dim, field_sizes):
    """Parse a batch of examples holding one sparse feature per field.

    Args:
        serialized_batch_example: batch of serialized examples handed to
            ``tf.parse_example``.
        input_dim: size of the 'wide' sparse feature.
        field_sizes: indexable sequence; entry ``i`` is the size of the
            sparse feature for field ``i``.

    Returns:
        Tuple ``(X, label, wide)`` where ``X`` is the list of parsed
        per-field features in field order.
    """
    num_field = len(field_sizes)
    # Field i is stored in the spec under the key str(i).
    field_keys = ['%d' % i for i in range(num_field)]

    features = {
        'label': tf.FixedLenFeature([], tf.float32),
        'wide': tf.SparseFeature(index_key='bias_idx',
                                 value_key='bias_val',
                                 dtype=tf.float32,
                                 size=input_dim),
    }
    for i, key in enumerate(field_keys):
        features[key] = tf.SparseFeature(index_key='i_%d' % (i),
                                         value_key='v_%d' % (i),
                                         dtype=tf.float32,
                                         size=field_sizes[i])

    data = tf.parse_example(serialized_batch_example, features=features)

    X = [data[key] for key in field_keys]
    return X, data['label'], data['wide']
def parse_fn(example):
    """Parse one serialized example into an embedding and a dense one-hot.

    Args:
        example: serialized example handed to ``tf.parse_single_example``.

    Returns:
        Tuple ``(embedding_average, one_hot)``: a float32 tensor of length 8
        and the densified 'one_hot' sparse feature of size 15.
    """
    embedding_spec = tf.FixedLenFeature([8], tf.float32)
    # size must be hard-coded here; it cannot be passed in as a hyperparameter.
    one_hot_spec = tf.SparseFeature(index_key=["index"],
                                    value_key="value",
                                    dtype=tf.float32,
                                    size=[15])
    example_fmt = {
        "embedding_average": embedding_spec,
        "one_hot": one_hot_spec,
    }

    parsed = tf.parse_single_example(example, example_fmt)
    embedding = parsed["embedding_average"]
    one_hot = tf.sparse_tensor_to_dense(parsed["one_hot"])
    return embedding, one_hot
Пример #15
0
    def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
        """Parse a single example mixing VarLen, SparseFeature and dense."""
        example_features = {
            "c": float_feature([3, 4]),
            "val": bytes_feature([b"a", b"b"]),
            "idx": int64_feature([0, 3]),
            "st_a": float_feature([3.0, 4.0])
        }
        original = example(features=features(example_features))
        serialized = original.SerializeToString()

        # Single example, so the sparse results carry no batch dimension.
        st_a_indices = np.array([[0], [1]], dtype=np.int64)
        st_a_values = np.array([3.0, 4.0], dtype=np.float32)
        st_a_shape = np.array([2], dtype=np.int64)  # max_values = 2
        expected_st_a = (st_a_indices, st_a_values, st_a_shape)

        sp_indices = np.array([[0], [3]], dtype=np.int64)
        sp_values = np.array(["a", "b"], dtype="|S")
        sp_shape = np.array([13], dtype=np.int64)  # max_values = 13
        expected_sp = (sp_indices, sp_values, sp_shape)

        # "a" and "b" are absent from the example and take their defaults.
        a_default = [1, 2, 3]
        b_default = np.random.rand(3, 3).astype(bytes)
        expected_output = {
            "st_a": expected_st_a,
            "sp": expected_sp,
            "a": [a_default],
            "b": b_default,
            "c": np.array([3, 4], dtype=np.float32),
        }

        feature_spec = {
            "st_a": tf.VarLenFeature(tf.float32),
            "sp": tf.SparseFeature("idx", "val", tf.string, 13),
            "a": tf.FixedLenFeature((1, 3), tf.int64,
                                    default_value=a_default),
            "b": tf.FixedLenFeature((3, 3), tf.string,
                                    default_value=b_default),
            # Feature "c" must be provided, since it has no default_value.
            "c": tf.FixedLenFeature((2, ), tf.float32),
        }
        parse_kwargs = {
            "example_names": tf.convert_to_tensor("in1"),
            "serialized": tf.convert_to_tensor(serialized),
            "features": feature_spec,
        }
        self._test(parse_kwargs, expected_output)
Пример #16
0
    def get_TFReord_parser(self):
        """Create the parser used to parse data read from TFRecord files.

        The feature map is built from ``self.create_feature_columns()``:

        * 'label' is a float32 vector of length ``self.list_size``;
        * context columns ending in 'unigrams' become 1-index sparse int64
          features of size ``self.max_query_length``, other context columns
          become float32 vectors of length 1;
        * example columns ending in 'unigrams' become 2-index sparse int64
          features of size ``[self.list_size, self.max_doc_length]``, other
          example columns become float32 vectors of length
          ``self.list_size``.

        Returns:
            A function mapping one serialized example to ``(features, label)``,
            where ``features`` is the parsed dict without the 'label' entry.
        """
        context_feature_columns, example_feature_columns = (
            self.create_feature_columns())

        # Build the feature map.
        feature_map = {}
        feature_map['label'] = tf.FixedLenFeature([self.list_size], tf.float32)
        for k in context_feature_columns:
            if k.endswith('unigrams'):
                feature_map[k] = tf.SparseFeature(index_key=['%s_idx' % k],
                                                  value_key='%s_int_value' % k,
                                                  dtype=tf.int64,
                                                  size=[self.max_query_length])
            else:
                feature_map[k] = tf.FixedLenFeature([1], tf.float32)
        for k in example_feature_columns:
            if k.endswith('unigrams'):
                feature_map[k] = tf.SparseFeature(
                    index_key=['%s_list_idx' % k,
                               '%s_idx' % k],
                    value_key='%s_int_value' % k,
                    dtype=tf.int64,
                    size=[self.list_size, self.max_doc_length])
            else:
                feature_map[k] = tf.FixedLenFeature([self.list_size],
                                                    tf.float32)

        def parser(serialized_example):
            """Parse a single tf.Example into a feature dict and its label."""
            features = tf.parse_single_example(serialized_example,
                                               features=feature_map)
            label = features.pop('label')
            # A leftover debug print of features['bm25s'] was removed here:
            # it raised KeyError whenever no 'bm25s' column was configured.
            return features, label

        return parser
Пример #17
0
    def testSerializedContainingSparseFeatureReuse(self):
        """Two SparseFeatures may share the same index key ("idx")."""
        populated = example(features=features({
            "val1": float_feature([3, 4]),
            "val2": float_feature([5, 6]),
            "idx": int64_feature([5, 10])
        }))
        empty = example(features=features({
            "val1": float_feature([]),  # empty float list
            "idx": int64_feature([])
        }))
        original = [populated, empty]

        serialized = [proto.SerializeToString() for proto in original]

        # Only the first example contributes entries; both features reuse
        # its indices.  Each tuple is (indices, values, shape).
        shared_indices = [[0, 5], [0, 10]]

        expected_sp1 = (
            np.array(shared_indices, dtype=np.int64),
            np.array([3.0, 4.0], dtype=np.float32),
            np.array([2, 13], dtype=np.int64))  # batch == 2, max_elems = 13

        expected_sp2 = (
            np.array(shared_indices, dtype=np.int64),
            np.array([5.0, 6.0], dtype=np.float32),
            np.array([2, 7], dtype=np.int64))  # batch == 2, max_elems = 7

        expected_output = {"sp1": expected_sp1, "sp2": expected_sp2}

        feature_spec = {
            "sp1": tf.SparseFeature("idx", "val1", tf.float32, 13),
            "sp2": tf.SparseFeature("idx", "val2", tf.float32, 7)
        }
        parse_kwargs = {
            "serialized": tf.convert_to_tensor(serialized),
            "features": feature_spec,
        }
        self._test(parse_kwargs, expected_output)
 def test_round_trip(self):
     """feature_spec -> schema -> feature_spec returns the original spec."""
     original_spec = {
         "scalar_feature_1":
         tf.FixedLenFeature(shape=[], dtype=tf.int64),
         "scalar_feature_2":
         tf.FixedLenFeature(shape=[], dtype=tf.int64),
         "scalar_feature_3":
         tf.FixedLenFeature(shape=[], dtype=tf.float32),
         "varlen_feature_1":
         tf.VarLenFeature(dtype=tf.float32),
         "varlen_feature_2":
         tf.VarLenFeature(dtype=tf.string),
         "1d_vector_feature":
         tf.FixedLenFeature(shape=[1], dtype=tf.string),
         "2d_vector_feature":
         tf.FixedLenFeature(shape=[2, 2], dtype=tf.float32),
         "sparse_feature":
         tf.SparseFeature("idx", "value", tf.float32, 10),
     }
     # Convert to a schema and straight back again.
     round_tripped_spec = schema_to_feature_spec(
         feature_spec_to_schema(original_spec))
     self.assertEqual(round_tripped_spec, original_spec)
Пример #19
0
 def _parse_sparse_feature(cls, feature):
     # type: (Schema.Feature) -> Tuple[str, tf.SparseFeature]
     """Convert a sparse schema feature into ``(name, tf.SparseFeature)``.

     A single index feature or a single dense-shape dimension is passed as a
     scalar (name / size); otherwise a list of names / sizes is used.
     """
     index_names = [idf.name for idf in feature.index_feature]
     index_key = index_names[0] if len(index_names) == 1 else index_names

     dtype = cls._tf_type_mapper.proto_to_tf_type(feature, is_sparse=True)

     dim_sizes = [d.size for d in feature.dense_shape.dim]
     size = dim_sizes[0] if len(dim_sizes) == 1 else dim_sizes

     sparse_spec = tf.SparseFeature(index_key=index_key,
                                    value_key=feature.value_feature.name,
                                    dtype=dtype,
                                    size=size)
     return feature.name, sparse_spec
Пример #20
0
    def as_feature_spec(self, column):
        """Return the ``tf.SparseFeature`` spec for a 1-d sparse column.

        Args:
            column: object exposing ``axes`` and ``domain.dtype``.

        Returns:
            A ``tf.SparseFeature`` built from the single index field, the
            value field name, the column dtype and the size of the column's
            only axis.

        Raises:
            ValueError: unless there is exactly one index field and exactly
                one axis, or if the column dtype is not one of
                ``_TF_EXAMPLE_ALLOWED_TYPES``.
        """
        index_fields = self.index_fields
        is_one_dimensional = (len(index_fields) == 1
                              and len(column.axes) == 1)
        if not is_one_dimensional:
            raise ValueError(
                'tf.Example parser supports only 1-d sparse features.')
        index_field = index_fields[0]

        dtype_is_allowed = column.domain.dtype in _TF_EXAMPLE_ALLOWED_TYPES
        if not dtype_is_allowed:
            raise ValueError(
                'tf.Example parser supports only types {}, so it is '
                'invalid to generate a feature_spec with type '
                '{}.'.format(_TF_EXAMPLE_ALLOWED_TYPES,
                             repr(column.domain.dtype)))

        return tf.SparseFeature(index_key=index_field.name,
                                value_key=self._value_field_name,
                                dtype=column.domain.dtype,
                                size=column.axes[0].size,
                                already_sorted=index_field.is_sorted)
Пример #21
0
    def test_sparse_feature_incorrect_values(self):
        """Decoding rejects sparse indices outside the declared size of 10."""
        feature_spec = {
            'a': tf.SparseFeature('idx', 'value', tf.float32, 10),
        }
        input_schema = dataset_schema.from_feature_spec(feature_spec)
        coder = csv_coder.CsvCoder(column_names=['idx', 'value'],
                                   schema=input_schema)

        # Each row is "<index>,<value>": a negative index, an index equal to
        # the size, and an index greater than the size.
        bad_rows = (
            ('-1,12.0', 'has index -1 out of range'),
            ('10,12.0', 'has index 10 out of range'),
            ('11,12.0', 'has index 11 out of range'),
        )
        for row, expected_message in bad_rows:
            with self.assertRaisesRegexp(ValueError, expected_message):
                coder.decode(row)
Пример #22
0
    def test_sparse_feature_missing_values(self):
        """An index and a value must be both present or both missing."""
        feature_spec = {
            'a': tf.SparseFeature('idx', 'value', tf.float32, 10),
        }
        input_schema = dataset_schema.from_feature_spec(feature_spec)
        coder = csv_coder.CsvCoder(column_names=['idx', 'value'],
                                   schema=input_schema)

        # Missing both value and index (which is allowed).
        decoded_empty = coder.decode(',')
        self.assertEqual(decoded_empty, {'a': ([], [])})

        # Missing only the index, then missing only the value: not allowed.
        half_empty_rows = (
            (',12.0', 'expected an index in column "idx"'),
            ('1,', 'expected a value in column "value"'),
        )
        for row, expected_message in half_empty_rows:
            with self.assertRaisesRegexp(ValueError, expected_message):
                coder.decode(row)
Пример #23
0
    def testMakeOutputDict(self):
        """make_output_dict splits batched fetches into per-instance dicts.

        The sparse feature 'd' is expected under its index and value keys
        ('idx' and 'val'); the second instance has no sparse entries.
        """
        feature_spec = {
            'a': tf.FixedLenFeature(None, tf.int64),
            'b': tf.FixedLenFeature([2, 2], tf.float32),
            'c': tf.VarLenFeature(tf.string),
            'd': tf.SparseFeature('idx', 'val', tf.float32, 10)
        }
        schema = self.toSchema(feature_spec)

        var_len_batch = tf.SparseTensorValue(
            indices=[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)],
            values=['doe', 'a', 'deer', 'a', 'female', 'deer'],
            dense_shape=(2, 3))
        # Only row 0 has entries.
        sparse_batch = tf.SparseTensorValue(
            indices=[(0, 2), (0, 4), (0, 8)],
            values=[10.0, 20.0, 30.0],
            dense_shape=(2, 20))
        fetches = {
            'a': np.asarray([100, 200]),
            'b': np.asarray([[[1.0, 2.0], [3.0, 4.0]],
                             [[5.0, 6.0], [7.0, 8.0]]]),
            'c': var_len_batch,
            'd': sparse_batch,
        }

        output_dicts = impl_helper.make_output_dict(schema, fetches)
        self.assertEqual(2, len(output_dicts))
        self.assertSetEqual(set(output_dicts[0].keys()),
                            set(['a', 'b', 'c', 'idx', 'val']))

        checked_keys = ('a', 'b', 'c', 'idx', 'val')
        expected_dicts = [
            {
                'a': 100,
                'b': [[1.0, 2.0], [3.0, 4.0]],
                'c': ['doe', 'a', 'deer'],
                'idx': [2, 4, 8],
                'val': [10.0, 20.0, 30.0],
            },
            {
                'a': 200,
                'b': [[5.0, 6.0], [7.0, 8.0]],
                'c': ['a', 'female', 'deer'],
                'idx': [],
                'val': [],
            },
        ]
        for actual, expected in zip(output_dicts, expected_dicts):
            for key in checked_keys:
                self.assertAllEqual(actual[key], expected[key])
Пример #24
0
    def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
        """"idx" is parsed as a VarLenFeature and reused as a sparse index."""
        # "idx" on its own: (indices, values, shape).
        idx_indices = np.array([[0, 0], [0, 1], [1, 0], [1, 1]],
                               dtype=np.int64)
        idx_values = np.array([0, 3, 7, 1])
        idx_shape = np.array([2, 2], dtype=np.int64)  # batch == 2, 2 elems
        expected_idx = (idx_indices, idx_values, idx_shape)

        # "sp": the second example's entries are expected in index order
        # (1 before 7), so "d" comes before "c".
        sp_indices = np.array([[0, 0], [0, 3], [1, 1], [1, 7]],
                              dtype=np.int64)
        sp_values = np.array(["a", "b", "d", "c"], dtype="|S")
        sp_shape = np.array([2, 13], dtype=np.int64)  # batch == 2, size 13
        expected_sp = (sp_indices, sp_values, sp_shape)

        first_example = example(features=features({
            "val": bytes_feature([b"a", b"b"]),
            "idx": int64_feature([0, 3])
        }))
        second_example = example(features=features({
            "val": bytes_feature([b"c", b"d"]),
            "idx": int64_feature([7, 1])
        }))
        original = [first_example, second_example]

        names = ["in1", "in2"]
        serialized = [proto.SerializeToString() for proto in original]

        expected_output = {"idx": expected_idx, "sp": expected_sp}

        feature_spec = {
            "idx": tf.VarLenFeature(tf.int64),
            "sp": tf.SparseFeature("idx", "val", tf.string, 13),
        }
        parse_kwargs = {
            "example_names": names,
            "serialized": tf.convert_to_tensor(serialized),
            "features": feature_spec,
        }
        self._test(parse_kwargs, expected_output)
Пример #25
0
    def testSerializedContainingSparseFeature(self):
        """Parse a SparseFeature from full, empty, missing and unsorted rows."""
        populated = example(features=features({
            "val": float_feature([3, 4]),
            "idx": int64_feature([5, 10])
        }))
        empty_lists = example(features=features({
            "val": float_feature([]),  # empty float list
            "idx": int64_feature([])
        }))
        missing_index = example(features=features({
            "val": feature(),  # feature with nothing in it
            # missing idx feature
        }))
        unsorted = example(features=features({
            "val": float_feature([1, 2, -1]),
            "idx": int64_feature([0, 9, 3])  # unsorted
        }))
        original = [populated, empty_lists, missing_index, unsorted]

        serialized = [proto.SerializeToString() for proto in original]

        # Rows 1 and 2 contribute nothing; row 3 is expected in index order.
        sp_indices = np.array([[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]],
                              dtype=np.int64)
        sp_values = np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32)
        sp_shape = np.array([4, 13], dtype=np.int64)  # batch == 4, size 13
        expected_sp = (sp_indices, sp_values, sp_shape)

        expected_output = {"sp": expected_sp}

        parse_kwargs = {
            "serialized": tf.convert_to_tensor(serialized),
            "features": {
                "sp": tf.SparseFeature("idx", "val", tf.float32, 13)
            },
        }
        self._test(parse_kwargs, expected_output)
Пример #26
0
 def test_all_values_present(self):
     """Decode a CSV row in which every column has a value."""
     columns = ['a', 'b', 'c', 'd', 'e']
     feature_spec = {
         'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
         'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
         'c': tf.VarLenFeature(dtype=tf.string),
         # Sparse feature built from columns 'd' (index) and 'e' (value).
         'y': tf.SparseFeature('d', 'e', tf.float32, 10),
     }
     input_schema = dataset_schema.from_feature_spec(feature_spec)
     coder = csv_coder.CsvCoder(column_names=columns, schema=input_schema)

     decoded = coder.decode('a_value,1.0,0,1,12.0')

     # Column 'c' is specified as a string so the value is not casted.
     expected = {
         'a': 'a_value',
         'b': 1.0,
         'c': ['0'],
         'y': ([12.0], [1])
     }
     self.assertEqual(decoded, expected)
class ExampleProtoCoderTest(unittest.TestCase):
    """Round-trip tests for ExampleProtoCoder over a mixed feature schema."""

    # '1d_vector_feature' is a length-1 vector: every example proto and every
    # expected instance in this class supplies exactly one string for it, so
    # the declared shape must be [1] (a shape of [0] would describe a vector
    # with no elements).
    _INPUT_SCHEMA = dataset_schema.from_feature_spec({
        'scalar_feature_1':
        tf.FixedLenFeature(shape=[], dtype=tf.int64),
        'scalar_feature_2':
        tf.FixedLenFeature(shape=[], dtype=tf.int64),
        'scalar_feature_3':
        tf.FixedLenFeature(shape=[], dtype=tf.float32),
        'varlen_feature_1':
        tf.VarLenFeature(dtype=tf.float32),
        '1d_vector_feature':
        tf.FixedLenFeature(shape=[1], dtype=tf.string),
        'varlen_feature_2':
        tf.VarLenFeature(dtype=tf.string),
        'sparse_feature':
        tf.SparseFeature('idx', 'value', tf.float32, 10),
    })

    def _assert_encode_decode(self, coder, expected_proto_text,
                              expected_decoded):
        """Check proto -> instance -> proto -> instance, starting from a proto.

        Args:
          coder: The coder under test; must provide `decode` and `encode`.
          expected_proto_text: Text-format `tf.train.Example` to start from.
          expected_decoded: The instance dict the proto should decode to.
        """
        example = tf.train.Example()
        text_format.Merge(expected_proto_text, example)
        data = example.SerializeToString()

        # Assert the data is decoded into the expected format.
        decoded = coder.decode(data)
        np.testing.assert_equal(expected_decoded, decoded)

        # Assert the decoded data can be encoded back into the original proto.
        encoded = coder.encode(decoded)
        parsed_example = tf.train.Example()
        parsed_example.ParseFromString(encoded)
        self.assertEqual(example, parsed_example)

        # Assert the data can be decoded from the encoded string.
        decoded_again = coder.decode(encoded)
        np.testing.assert_equal(expected_decoded, decoded_again)

    def _assert_decode_encode(self, coder, expected_proto_text,
                              expected_decoded):
        """Check instance -> proto -> instance -> proto, starting from a dict.

        Args:
          coder: The coder under test; must provide `decode` and `encode`.
          expected_proto_text: Text-format `tf.train.Example` the instance
            should encode to.
          expected_decoded: The instance dict to start from.
        """
        example = tf.train.Example()
        text_format.Merge(expected_proto_text, example)

        # Assert the expected decoded data can be encoded into the expected proto.
        encoded = coder.encode(expected_decoded)
        parsed_example = tf.train.Example()
        parsed_example.ParseFromString(encoded)
        self.assertEqual(example, parsed_example)

        # Assert the encoded data can be decoded into the original input.
        decoded = coder.decode(encoded)
        np.testing.assert_equal(expected_decoded, decoded)

        # Assert the decoded data can be encoded back into the expected proto.
        # Protos are compared with assertEqual, like the check above.
        encoded_again = coder.encode(decoded)
        parsed_example_again = tf.train.Example()
        parsed_example_again.ParseFromString(encoded_again)
        self.assertEqual(example, parsed_example_again)

    def test_example_proto_coder(self):
        """Round-trip Python-typed and then numpy-typed instances."""
        # We use a single coder and invoke multiple encodes and decodes on it to
        # make sure that cache consistency is implemented properly.
        coder = example_proto_coder.ExampleProtoCoder(self._INPUT_SCHEMA)

        # Python types.
        example_proto_text = """
    features {
      feature { key: "scalar_feature_1" value { int64_list { value: [ 12 ] } } }
      feature { key: "varlen_feature_1"
                value { float_list { value: [ 89.0 ] } } }
      feature { key: "scalar_feature_2" value { int64_list { value: [ 12 ] } } }
      feature { key: "scalar_feature_3"
                value { float_list { value: [ 1.0 ] } } }
      feature { key: "1d_vector_feature"
                value { bytes_list { value: [ 'this is a ,text' ] } } }
      feature { key: "varlen_feature_2"
                value { bytes_list { value: [ 'female' ] } } }
      feature { key: "value" value { float_list { value: [ 12.0, 20.0 ] } } }
      feature { key: "idx" value { int64_list { value: [ 1, 4 ] } } }
    }
    """
        expected_decoded = {
            'scalar_feature_1': 12,
            'scalar_feature_2': 12,
            'scalar_feature_3': 1.0,
            'varlen_feature_1': [89.0],
            '1d_vector_feature': ['this is a ,text'],
            'varlen_feature_2': ['female'],
            'sparse_feature': ([12.0, 20.0], [1, 4])
        }
        self._assert_encode_decode(coder, example_proto_text, expected_decoded)
        self._assert_decode_encode(coder, example_proto_text, expected_decoded)

        # Numpy types (with different values from above).
        example_proto_text = """
    features {
      feature { key: "scalar_feature_1" value { int64_list { value: [ 13 ] } } }
      feature { key: "varlen_feature_1" value { float_list { } } }
      feature { key: "scalar_feature_2" value { int64_list { value: [ 14 ] } } }
      feature { key: "scalar_feature_3"
                value { float_list { value: [ 2.0 ] } } }
      feature { key: "1d_vector_feature"
                value { bytes_list { value: [ 'this is another ,text' ] } } }
      feature { key: "varlen_feature_2"
                value { bytes_list { value: [ 'male' ] } } }
      feature { key: "value" value { float_list { value: [ 13.0, 21.0 ] } } }
      feature { key: "idx" value { int64_list { value: [ 2, 5 ] } } }
    }
    """
        expected_decoded = {
            'scalar_feature_1': np.array(13),
            'scalar_feature_2': np.int32(14),
            'scalar_feature_3': np.array(2.0),
            'varlen_feature_1': np.array([]),
            '1d_vector_feature': np.array(['this is another ,text']),
            'varlen_feature_2': np.array(['male']),
            'sparse_feature': (np.array([13.0, 21.0]), np.array([2, 5]))
        }
        self._assert_encode_decode(coder, example_proto_text, expected_decoded)
        self._assert_decode_encode(coder, example_proto_text, expected_decoded)

    def test_example_proto_coder_picklable(self):
        """Round-trip through a coder that has itself been pickled.

        The coder is pickled once before any use and once more after it has
        encoded and decoded data.
        """
        coder = example_proto_coder.ExampleProtoCoder(self._INPUT_SCHEMA)

        example_proto_text = """
    features {
      feature { key: "scalar_feature_1" value { int64_list { value: [ 12 ] } } }
      feature { key: "varlen_feature_1"
                value { float_list { value: [ 89.0 ] } } }
      feature { key: "scalar_feature_2" value { int64_list { value: [ 12 ] } } }
      feature { key: "scalar_feature_3"
                value { float_list { value: [ 2.0 ] } } }
      feature { key: "1d_vector_feature"
                value { bytes_list { value: [ 'this is a ,text' ] } } }
      feature { key: "varlen_feature_2"
                value { bytes_list { value: [ 'female' ] } } }
      feature { key: "value" value { float_list { value: [ 12.0, 20.0 ] } } }
      feature { key: "idx" value { int64_list { value: [ 1, 4 ] } } }
    }
    """
        expected_decoded = {
            'scalar_feature_1': 12,
            'scalar_feature_2': 12,
            'scalar_feature_3': 2.0,
            'varlen_feature_1': [89.0],
            '1d_vector_feature': ['this is a ,text'],
            'varlen_feature_2': ['female'],
            'sparse_feature': ([12.0, 20.0], [1, 4])
        }

        # Ensure we can pickle right away.
        coder = pickle.loads(pickle.dumps(coder))
        self._assert_encode_decode(coder, example_proto_text, expected_decoded)
        self._assert_decode_encode(coder, example_proto_text, expected_decoded)

        # And after use.
        coder = pickle.loads(pickle.dumps(coder))
        self._assert_encode_decode(coder, example_proto_text, expected_decoded)
        self._assert_decode_encode(coder, example_proto_text, expected_decoded)
Пример #28
0
    def testMakeFeedDict(self):
        """Check make_feed_dict output for dense and sparse placeholders.

        Three batches are fed: plain Python values, the same data as numpy
        values, and a single instance whose sparse features are empty.
        """
        placeholders = {
            'a': tf.placeholder(tf.int64),
            'b': tf.placeholder(tf.float32),
            'c': tf.placeholder(tf.float32),
            'd': tf.placeholder(tf.float32),
            'e': tf.sparse_placeholder(tf.string),
            'f': tf.sparse_placeholder(tf.float32)
        }
        feature_spec = {
            'a': tf.FixedLenFeature(shape=None, dtype=tf.int64),
            'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
            'c': tf.FixedLenFeature(shape=[1], dtype=tf.float32),
            'd': tf.FixedLenFeature(shape=[2, 2], dtype=tf.float32),
            'e': tf.VarLenFeature(dtype=tf.string),
            'f': tf.SparseFeature(index_key='idx',
                                  value_key='val',
                                  dtype=tf.float32,
                                  size=10)
        }
        schema = self.toSchema(feature_spec)

        def check_two_instance_feed(feed):
            # The Python and numpy batches carry the same data, so both must
            # produce exactly these fed values.
            self.assertSetEqual(set(six.iterkeys(feed)),
                                set(six.itervalues(placeholders)))
            self.assertAllEqual(feed[placeholders['a']], [100, 100])
            self.assertAllEqual(feed[placeholders['b']], [1.0, 2.0])
            self.assertAllEqual(feed[placeholders['c']], [[2.0], [4.0]])
            self.assertAllEqual(
                feed[placeholders['d']],
                [[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]])
            expected_e = tf.SparseTensorValue(
                indices=[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)],
                values=['doe', 'a', 'deer', 'a', 'female', 'deer'],
                dense_shape=(2, 3))
            self.assertSparseValuesEqual(feed[placeholders['e']], expected_e)
            expected_f = tf.SparseTensorValue(
                indices=[(0, 2), (0, 4), (0, 8)],
                values=[10.0, 20.0, 30.0],
                dense_shape=(2, 10))
            self.assertSparseValuesEqual(feed[placeholders['f']], expected_f)

        # Feed some dense and sparse values.
        python_batch = [
            {
                'a': 100,
                'b': 1.0,
                'c': [2.0],
                'd': [[1.0, 2.0], [3.0, 4.0]],
                'e': ['doe', 'a', 'deer'],
                'f': ([2, 4, 8], [10.0, 20.0, 30.0])
            },
            {
                'a': 100,
                'b': 2.0,
                'c': [4.0],
                'd': [[5.0, 6.0], [7.0, 8.0]],
                'e': ['a', 'female', 'deer'],
                'f': ([], [])
            },
        ]
        check_two_instance_feed(
            impl_helper.make_feed_dict(placeholders, schema, python_batch))

        # Feed numpy versions of everything.
        numpy_batch = [
            {
                'a': np.int64(100),
                'b': np.array(1.0, np.float32),
                'c': np.array([2.0], np.float32),
                'd': np.array([[1.0, 2.0], [3.0, 4.0]], np.float32),
                'e': ['doe', 'a', 'deer'],
                'f': (np.array([2, 4, 8]), np.array([10.0, 20.0, 30.0])),
            },
            {
                'a': np.int64(100),
                'b': np.array(2.0, np.float32),
                'c': np.array([4.0], np.float32),
                'd': np.array([[5.0, 6.0], [7.0, 8.0]], np.float32),
                'e': ['a', 'female', 'deer'],
                'f': (np.array([], np.int32), np.array([], np.float32))
            },
        ]
        check_two_instance_feed(
            impl_helper.make_feed_dict(placeholders, schema, numpy_batch))

        # Feed some empty sparse values.
        empty_sparse_batch = [{
            'a': 100,
            'b': 5.0,
            'c': [1.0],
            'd': [[1.0, 2.0], [3.0, 4.0]],
            'e': [],
            'f': ([], [])
        }]
        feed = impl_helper.make_feed_dict(placeholders, schema,
                                          empty_sparse_batch)
        expected_empty_e = tf.SparseTensorValue(
            indices=np.empty([0, 2], np.int64),
            values=[],
            dense_shape=(1, 0))
        self.assertSparseValuesEqual(feed[placeholders['e']], expected_empty_e)
        expected_empty_f = tf.SparseTensorValue(
            indices=np.empty([0, 2], np.int64),
            values=[],
            dense_shape=(1, 10))
        self.assertSparseValuesEqual(feed[placeholders['f']], expected_empty_f)
Пример #29
0
import tensorflow as tf
from tensorflow_transform import test_case
from tensorflow_transform.coders import csv_coder
from tensorflow_transform.tf_metadata import dataset_schema

# CSV column order. 'idx' and 'value' are the index and value keys of the
# sparse feature 'y' declared in _FEATURE_SPEC.
_COLUMNS = [
    'numeric1',
    'text1',
    'category1',
    'idx',
    'numeric2',
    'value',
    'numeric3',
]

# Feature spec keyed by feature name.
_FEATURE_SPEC = {
    'numeric1': tf.FixedLenFeature(shape=[], dtype=tf.int64),
    'numeric2': tf.VarLenFeature(dtype=tf.float32),
    'numeric3': tf.FixedLenFeature(shape=[1], dtype=tf.int64),
    'text1': tf.FixedLenFeature(shape=[], dtype=tf.string),
    'category1': tf.VarLenFeature(dtype=tf.string),
    'y': tf.SparseFeature(index_key='idx',
                          value_key='value',
                          dtype=tf.float32,
                          size=10),
}

_ENCODE_DECODE_CASES = [
    dict(
        testcase_name='multiple_columns',
        columns=_COLUMNS,
        feature_spec=_FEATURE_SPEC,
        csv_line='12,"this is a ,text",categorical_value,1,89.0,12.0,20',
        instance={
            'category1': [b'categorical_value'],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': b'this is a ,text',
            'y': ([1], [12.0])
Пример #30
0
class TestCSVCoder(unittest.TestCase):

    # CSV column order shared by the multi-column tests. 'idx' and 'value'
    # are the index and value keys of the sparse feature 'y'.
    _COLUMNS = [
        'numeric1',
        'text1',
        'category1',
        'idx',
        'numeric2',
        'value',
        'numeric3',
    ]
    # Schema shared by the multi-column tests.
    _INPUT_SCHEMA = dataset_schema.from_feature_spec({
        'numeric1': tf.FixedLenFeature([], tf.int64),
        'numeric2': tf.VarLenFeature(tf.float32),
        'numeric3': tf.FixedLenFeature([1], tf.int64),
        'text1': tf.FixedLenFeature([], tf.string),
        'category1': tf.VarLenFeature(tf.string),
        'y': tf.SparseFeature(index_key='idx',
                              value_key='value',
                              dtype=tf.float32,
                              size=10),
    })

    # Single-feature cases exercised by both testDecode and testEncode.
    # Each entry is (csv_line, value, multivalent, feature_spec): the feature
    # is registered under the name 'x', `csv_line` is its CSV text and `value`
    # the matching instance value. When `multivalent` is True the coder is
    # built with '|' as the secondary delimiter.
    _ENCODE_DECODE_CASES = [
        # FixedLenFeature scalar int.
        ('12', 12, False, tf.FixedLenFeature(shape=[], dtype=tf.int64)),
        # FixedLenFeature scalar float without decimal point.
        ('12', 12, False, tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # FixedLenFeature length 1 vector int.
        ('12', [12], False, tf.FixedLenFeature(shape=[1], dtype=tf.int64)),
        # FixedLenFeature size 1 matrix int.
        ('12', [[12]], False, tf.FixedLenFeature(shape=[1, 1],
                                                 dtype=tf.int64)),
        # FixedLenFeature unquoted text.
        ('this is unquoted text', 'this is unquoted text', False,
         tf.FixedLenFeature(shape=[], dtype=tf.string)),
        # FixedLenFeature quoted text.
        ('"this is a ,text"', 'this is a ,text', False,
         tf.FixedLenFeature(shape=[], dtype=tf.string)),
        # VarLenFeature text.
        ('a test', ['a test'], False, tf.VarLenFeature(dtype=tf.string)),
        # SparseFeature float one value.
        ('5,2.0', ([5], [2.0]), False,
         tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature float no values.
        (',', ([], []), False, tf.SparseFeature('idx', 'value', tf.float32,
                                                10)),
        # FixedLenFeature scalar int, multivalent.
        ('12', 12, True, tf.FixedLenFeature(shape=[], dtype=tf.int64)),
        # FixedLenFeature length 1 vector int, multivalent.
        ('12', [12], True, tf.FixedLenFeature(shape=[1], dtype=tf.int64)),
        # FixedLenFeature length 2 vector int, multivalent.
        ('12|14', [12, 14], True, tf.FixedLenFeature(shape=[2],
                                                     dtype=tf.int64)),
        # FixedLenFeature size 1 matrix int, multivalent.
        ('12', [[12]], True, tf.FixedLenFeature(shape=[1, 1], dtype=tf.int64)),
        # FixedLenFeature size (2, 2) matrix int, multivalent.
        ('12|13|14|15', [[12, 13], [14, 15]], True,
         tf.FixedLenFeature(shape=[2, 2], dtype=tf.int64)),
    ]

    # Single-feature cases consumed by testDecodeErrors. Each entry is
    # (csv_line, error_type, error_msg, multivalent, feature_spec), where
    # `error_msg` is the regular expression the raised error must match.
    # The error may come from building the coder or from decoding.
    _DECODE_ERROR_CASES = [
        # FixedLenFeature scalar numeric missing value.
        ('', ValueError, r'expected a value on column \"x\"', False,
         tf.FixedLenFeature(shape=[], dtype=tf.int64)),
        # FixedLenFeature length 1 vector numeric missing value.
        ('', ValueError, r'expected a value on column \"x\"', False,
         tf.FixedLenFeature(shape=[1], dtype=tf.int64)),
        # FixedLenFeature length >1 vector that is not multivalent.
        ('1', ValueError, r'FixedLenFeature \"x\" was not multivalent', False,
         tf.FixedLenFeature(shape=[2], dtype=tf.int64)),
        # FixedLenFeature scalar text missing value.
        ('', ValueError, r'expected a value on column \"x\"', False,
         tf.FixedLenFeature(shape=[], dtype=tf.string)),
        # SparseFeature with missing value but present index.
        ('5,', ValueError,
         r'SparseFeature \"x\" has indices and values of different lengths',
         False, tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature with missing index but present value.
        (',2.0', ValueError,
         r'SparseFeature \"x\" has indices and values of different lengths',
         False, tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature with negative index.
        ('-1,2.0', ValueError, r'has index -1 out of range', False,
         tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature with index equal to size.
        ('10,2.0', ValueError, r'has index 10 out of range', False,
         tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature with index greater than size.
        ('11,2.0', ValueError, r'has index 11 out of range', False,
         tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # FixedLenFeature scalar float given non-numeric text.
        ('test', ValueError, r'could not convert string to float: test', False,
         tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # FixedLenFeature scalar float, multivalent, too many values.
        ('1|2', ValueError,
         r'FixedLenFeature \"x\" got wrong number of values', True,
         tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # FixedLenFeature length 1 float, multivalent, too many values.
        ('1|2', ValueError,
         r'FixedLenFeature \"x\" got wrong number of values', True,
         tf.FixedLenFeature(shape=[1], dtype=tf.float32)),
        # FixedLenFeature length 2 float, multivalent, too few values.
        ('1', ValueError, r'FixedLenFeature \"x\" got wrong number of values',
         True, tf.FixedLenFeature(shape=[2], dtype=tf.float32)),
    ]

    # Single-feature cases consumed by testEncodeErrors. Each entry is
    # (value, error_type, error_msg, multivalent, feature_spec), where `value`
    # is the instance value for feature 'x' and `error_msg` is the regular
    # expression the raised error must match.
    _ENCODE_ERROR_CASES = [
        # FixedLenFeature length 2 vector, multivalent with wrong number of
        # values.
        ([1, 2,
          3], ValueError, r'FixedLenFeature \"x\" got wrong number of values',
         True, tf.FixedLenFeature(shape=[2], dtype=tf.string))
    ]

    # Cases run by testDecode only (testEncode skips them). Same layout as
    # _ENCODE_DECODE_CASES: (csv_line, value, multivalent, feature_spec).
    _DECODE_ONLY_CASES = [
        # FixedLenFeature scalar float with decimal point.
        ('12.0', 12, False, tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # FixedLenFeature scalar float with quoted value.
        ('"12.0"', 12, False, tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # VarLenFeature text with missing value.
        ('', [], False, tf.VarLenFeature(dtype=tf.string)),
    ]

    # unittest setting: show the standard failure message together with any
    # custom `msg` passed to an assert method, instead of replacing it.
    longMessage = True

    def _msg_for_decode_case(self, csv_line, feature_spec):
        return 'While decoding "{csv_line}" with FeatureSpec {feature_spec}'.format(
            csv_line=csv_line, feature_spec=feature_spec)

    def _msg_for_encode_case(self, value, feature_spec):
        return 'While encoding {value} with FeatureSpec {feature_spec}'.format(
            value=value, feature_spec=feature_spec)

    def _assert_encode_decode(self, coder, data, expected_decoded):
        decoded = coder.decode(data)
        np.testing.assert_equal(decoded, expected_decoded)

        encoded = coder.encode(decoded)
        np.testing.assert_equal(encoded, data.encode('utf-8'))

        decoded_again = coder.decode(encoded)
        np.testing.assert_equal(decoded_again, expected_decoded)

    def test_csv_coder(self):
        """Round-trip a full CSV row, expecting Python then numpy values."""
        csv_row = '12,"this is a ,text",categorical_value,1,89.0,12.0,20'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        # Python types.
        as_python = {
            'category1': ['categorical_value'],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1], [12.0]),
        }
        # Numpy types.
        as_numpy = {
            'category1': np.array(['categorical_value']),
            'numeric1': np.array(12),
            'numeric2': np.array([89.0]),
            'numeric3': np.array([20]),
            'text1': np.array(['this is a ,text']),
            'y': (np.array(1), np.array([12.0])),
        }
        for expected in (as_python, as_numpy):
            self._assert_encode_decode(coder, csv_row, expected)

    def test_csv_coder_with_unicode(self):
        """Round-trip a row whose 'category1' column holds non-ASCII text."""
        csv_row = u'12,"this is a ,text",Hello κόσμε,1,89.0,12.0,20'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        # The expected category value is the UTF-8 encoding of the text.
        category_bytes = u'Hello κόσμε'.encode('utf-8')

        # Python types.
        as_python = {
            'category1': [category_bytes],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1], [12.0]),
        }
        # Numpy types.
        as_numpy = {
            'category1': np.array([category_bytes]),
            'numeric1': np.array(12),
            'numeric2': np.array([89.0]),
            'numeric3': np.array([20]),
            'text1': np.array(['this is a ,text']),
            'y': (np.array(1), np.array([12.0])),
        }
        for expected in (as_python, as_numpy):
            self._assert_encode_decode(coder, csv_row, expected)

    def test_tsv_coder(self):
        """Round-trip a tab-delimited row; 'text1' contains a quoted tab."""
        tsv_row = '12\t"this is a \ttext"\tcategorical_value\t1\t89.0\t12.0\t20'
        expected = {
            'category1': ['categorical_value'],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a \ttext',
            'y': ([1], [12.0]),
        }
        tab_coder = csv_coder.CsvCoder(
            self._COLUMNS, self._INPUT_SCHEMA, delimiter='\t')
        self._assert_encode_decode(tab_coder, tsv_row, expected)

    def test_valency(self):
        """Round-trip a row whose multivalent columns separate values by '|'."""
        row = ('11|12,"this is a ,text",categorical_value|other_value,'
               '1|3,89.0|91.0,12.0|15.0,20')

        # Same features as the shared schema, except 'numeric1' becomes a
        # length-2 vector.
        spec = self._INPUT_SCHEMA.as_feature_spec().copy()
        spec['numeric1'] = tf.FixedLenFeature(shape=[2], dtype=tf.int64)

        coder = csv_coder.CsvCoder(
            self._COLUMNS,
            dataset_schema.from_feature_spec(spec),
            delimiter=',',
            secondary_delimiter='|',
            multivalent_columns=['numeric1', 'numeric2', 'y'])

        # 'category1' is not listed as multivalent, so the expected value
        # keeps its '|' verbatim.
        expected = {
            'category1': ['categorical_value|other_value'],
            'numeric1': [11, 12],
            'numeric2': [89.0, 91.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1, 3], [12.0, 15.0]),
        }
        self._assert_encode_decode(coder, row, expected)

    # Test successful decoding with a single column.
    def testDecode(self):
        """Decode each single-column success case and compare with its value."""
        all_cases = self._ENCODE_DECODE_CASES + self._DECODE_ONLY_CASES
        for line, expected, is_multivalent, spec in all_cases:
            schema = dataset_schema.from_feature_spec({'x': spec})
            # Sparse features need their index and value columns; everything
            # else uses the single column 'x'.
            is_sparse = isinstance(spec, tf.SparseFeature)
            names = [spec.index_key, spec.value_key] if is_sparse else 'x'

            coder_kwargs = {}
            if is_multivalent:
                coder_kwargs.update(secondary_delimiter='|',
                                    multivalent_columns=names)
            coder = csv_coder.CsvCoder(names, schema, **coder_kwargs)

            decoded = coder.decode(line)
            np.testing.assert_equal(decoded, {'x': expected},
                                    self._msg_for_decode_case(line, spec))

    # Test decode errors with a single column.
    def testDecodeErrors(self):
        """Each decode error case must raise its error type with a matching message."""
        for line, exc_type, exc_pattern, is_multivalent, spec in (
                self._DECODE_ERROR_CASES):
            schema = dataset_schema.from_feature_spec({'x': spec})
            is_sparse = isinstance(spec, tf.SparseFeature)
            names = [spec.index_key, spec.value_key] if is_sparse else 'x'

            coder_kwargs = {}
            if is_multivalent:
                coder_kwargs.update(secondary_delimiter='|',
                                    multivalent_columns=names)

            failure_msg = self._msg_for_decode_case(line, spec)
            with self.assertRaisesRegexp(exc_type, exc_pattern,
                                         msg=failure_msg):
                # We don't distinguish between errors in the coder constructor
                # and in the decode method, so both run inside the context.
                coder = csv_coder.CsvCoder(names, schema, **coder_kwargs)
                coder.decode(line)

    # Test successful encoding with a single column.
    def testEncode(self):
        """Encode each single-column success case and compare with its CSV text."""
        for expected_line, instance_value, is_multivalent, spec in (
                self._ENCODE_DECODE_CASES):
            schema = dataset_schema.from_feature_spec({'x': spec})
            is_sparse = isinstance(spec, tf.SparseFeature)
            names = [spec.index_key, spec.value_key] if is_sparse else 'x'

            coder_kwargs = {}
            if is_multivalent:
                coder_kwargs.update(secondary_delimiter='|',
                                    multivalent_columns=names)
            coder = csv_coder.CsvCoder(names, schema, **coder_kwargs)

            encoded = coder.encode({'x': instance_value})
            self.assertEqual(
                encoded,
                expected_line,
                msg=self._msg_for_encode_case(instance_value, spec))

    # Test encode errors with a single column.
    def testEncodeErrors(self):
        """Each encode error case must raise its error type with a matching message."""
        for bad_value, exc_type, exc_pattern, is_multivalent, spec in (
                self._ENCODE_ERROR_CASES):
            schema = dataset_schema.from_feature_spec({'x': spec})
            is_sparse = isinstance(spec, tf.SparseFeature)
            names = [spec.index_key, spec.value_key] if is_sparse else 'x'

            coder_kwargs = {}
            if is_multivalent:
                coder_kwargs.update(secondary_delimiter='|',
                                    multivalent_columns=names)

            failure_msg = self._msg_for_encode_case(bad_value, spec)
            with self.assertRaisesRegexp(exc_type, exc_pattern,
                                         msg=failure_msg):
                # The coder is built inside the context, so an error raised by
                # the constructor is accepted as well.
                coder = csv_coder.CsvCoder(names, schema, **coder_kwargs)
                coder.encode({'x': bad_value})

    def test_missing_data(self):
        """A row with an empty 'text1' column is rejected when decoding."""
        # The second column ('text1') is empty in this row.
        row_without_text1 = '12,,categorical_value,1,89.0,12.0,20'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        expected_error = 'expected a value on column "text1"'
        with self.assertRaisesRegexp(ValueError, expected_error):
            coder.decode(row_without_text1)

    def test_bad_row(self):
        """Rows with too many or too few columns are rejected when decoding."""
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
        header_mismatch = 'Columns do not match specified csv headers'

        # The data has more columns than expected.
        too_many = ('12,"this is a ,text",categorical_value,1,89.0,12.0,'
                    '"oh no, I\'m an error",14')
        with self.assertRaisesRegexp(Exception, header_mismatch):
            coder.decode(too_many)

        # The data has fewer columns than expected.
        too_few = '12,"this is a ,text",categorical_value"'
        with self.assertRaisesRegexp(Exception, header_mismatch):
            coder.decode(too_few)

    def test_column_not_found(self):
        """Building a coder with no column names raises 'Column not found'."""
        no_columns = []
        with self.assertRaisesRegexp(ValueError, 'Column not found: '):
            csv_coder.CsvCoder(no_columns, self._INPUT_SCHEMA)

    def test_picklable(self):
        """The coder still round-trips data after being pickled and unpickled."""
        row = '12,"this is a ,text",categorical_value,1,89.0,12.0,20'
        expected = {
            'category1': ['categorical_value'],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1], [12.0]),
        }

        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        # The first pass pickles a fresh coder; the second pass pickles the
        # coder after it has been used.
        for _ in range(2):
            coder = pickle.loads(pickle.dumps(coder))
            self._assert_encode_decode(coder, row, expected)

    def test_decode_errors(self):
        """Malformed input must raise csv_coder.DecodeError."""
        two_column_schema = dataset_schema.from_feature_spec({
            'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
            'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
        })
        coder = csv_coder.CsvCoder(column_names=['a', 'b'],
                                   schema=two_column_schema)

        # Test bad csv.
        non_string_error = "'int' object has no attribute 'encode': 123"
        with self.assertRaisesRegexp(csv_coder.DecodeError, non_string_error):
            coder.decode(123)

        # Rows with an extra column, a missing column, and no content at all
        # (in that order) must all be reported as a header mismatch.
        header_mismatch = 'Columns do not match specified csv headers'
        for bad_row in ('1,2,', 'a_value', ''):
            with self.assertRaisesRegexp(csv_coder.DecodeError,
                                         header_mismatch):
                coder.decode(bad_row)