def __serializedToRows(self, serializedThing):
    example_features = {
        'feature': tf.SparseFeature(
            index_key=['feature_indices_0', 'feature_indices_1'],
            value_key='feature_values',
            dtype=tf.int64,
            size=self.meta['feature_dense_shape']),
        'label': tf.SparseFeature(
            index_key=['label_indices_0', 'label_indices_1'],
            value_key='label_values',
            dtype=tf.int64,
            size=self.meta['label_dense_shape'])
    }
    rows = tf.parse_single_example(serializedThing, features=example_features)
    feature, label = rows['feature'], rows['label']
    # Densify the sparse tensors, then flatten each to a 1-d vector.
    feature = tf.reshape(tf.sparse_tensor_to_dense(feature),
                         [self.meta['feature_dense_shape'][0]])
    label = tf.reshape(tf.sparse_tensor_to_dense(label),
                       [self.meta['label_dense_shape'][0]])
    feature = tf.cast(feature, dtype=tf.float32)
    label = tf.cast(label, dtype=tf.int8)
    return feature, label
def testMalformedSparseFeatures(self):
    tensors = {
        'a': tf.sparse_placeholder(tf.int64),
    }

    # Invalid indices.
    schema = self.toSchema(
        {'a': tf.SparseFeature('idx', 'val', tf.float32, 10)})
    instances = [{'a': ([-1, 2], [1.0, 2.0])}]
    with self.assertRaisesRegexp(ValueError, 'has index .* out of range'):
        impl_helper.make_feed_dict(tensors, schema, instances)

    instances = [{'a': ([11, 1], [1.0, 2.0])}]
    with self.assertRaisesRegexp(ValueError, 'has index .* out of range'):
        impl_helper.make_feed_dict(tensors, schema, instances)

    # Indices and values of different lengths.
    schema = self.toSchema(
        {'a': tf.SparseFeature('idx', 'val', tf.float32, 10)})
    instances = [{'a': ([1, 2], [1])}]
    with self.assertRaisesRegexp(ValueError,
                                 'indices and values of different lengths'):
        impl_helper.make_feed_dict(tensors, schema, instances)

    # Tuple of the wrong length.
    instances = [{'a': ([1], [2], [3])}]
    with self.assertRaisesRegexp(ValueError, 'too many values to unpack'):
        impl_helper.make_feed_dict(tensors, schema, instances)
def testRunTransformFn(self):
    schema = self.toSchema({
        'dense_1': tf.FixedLenFeature((), tf.float32),
        'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
        'var_len': tf.VarLenFeature(tf.string),
        'sparse': tf.SparseFeature('ix', 'val', tf.float32, 100)
    })

    def preprocessing_fn(inputs):
        return {
            'dense_out': mappers.scale_to_0_1(inputs['dense_1']),
            'sparse_out': api.map(lambda x: tf.sparse_reshape(x, (1, 10)),
                                  inputs['sparse'])
        }

    inputs, outputs = impl_helper.run_preprocessing_fn(
        preprocessing_fn, schema)

    # Verify that the input placeholders have the correct types.
    expected_dtype_and_shape = {
        'dense_1': (tf.float32, tf.TensorShape([None])),
        'dense_2': (tf.int64, tf.TensorShape([None, 1, 2])),
        'var_len': (tf.string, tf.TensorShape(None)),
        'sparse': (tf.float32, tf.TensorShape(None)),
        'dense_out': (tf.float32, tf.TensorShape([None])),
        'sparse_out': (tf.float32, tf.TensorShape([None, None])),
    }

    # list() makes the dict-view concatenation work on both Python 2 and 3.
    for key, column in list(inputs.items()) + list(outputs.items()):
        dtype, shape = expected_dtype_and_shape[key]
        self.assertEqual(column.tensor.dtype, dtype)
        self.assertShapesEqual(column.tensor.get_shape(), shape)
def testRunPreprocessingFn(self):
    schema = self.toSchema({
        'dense_1': tf.FixedLenFeature((), tf.float32),
        'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
        'var_len': tf.VarLenFeature(tf.string),
        'sparse': tf.SparseFeature('ix', 'val', tf.float32, 100)
    })

    def preprocessing_fn(inputs):
        return {
            'dense_out': mappers.scale_to_0_1(inputs['dense_1']),
            'sparse_out': tf.sparse_reshape(inputs['sparse'], (1, 10)),
        }

    _, inputs, outputs = impl_helper.run_preprocessing_fn(
        preprocessing_fn, schema)

    # Verify that the input placeholders have the correct types.
    expected_dtype_and_shape = {
        'dense_1': (tf.float32, tf.TensorShape([None])),
        'dense_2': (tf.int64, tf.TensorShape([None, 1, 2])),
        'var_len': (tf.string, tf.TensorShape([None, None])),
        'sparse': (tf.float32, tf.TensorShape([None, None])),
        'dense_out': (tf.float32, tf.TensorShape([None])),
        'sparse_out': (tf.float32, tf.TensorShape([None, None])),
    }

    for key, tensor in itertools.chain(six.iteritems(inputs),
                                       six.iteritems(outputs)):
        dtype, shape = expected_dtype_and_shape[key]
        self.assertEqual(tensor.dtype, dtype)
        tensor.get_shape().assert_is_compatible_with(shape)
def test_example_with_feature_spec_decoder(self):
    feature_spec = {
        "scalar_feature_1": tf.FixedLenFeature(shape=[], dtype=tf.int64),
        "scalar_feature_2": tf.FixedLenFeature(shape=[], dtype=tf.int64),
        "scalar_feature_3": tf.FixedLenFeature(shape=[], dtype=tf.float32),
        "varlen_feature_1": tf.VarLenFeature(dtype=tf.float32),
        "varlen_feature_2": tf.VarLenFeature(dtype=tf.string),
        "1d_vector_feature": tf.FixedLenFeature(shape=[1], dtype=tf.string),
        "2d_vector_feature": tf.FixedLenFeature(shape=[2, 2],
                                                dtype=tf.float32),
        "sparse_feature": tf.SparseFeature("sparse_feature_idx",
                                           "sparse_feature_value",
                                           tf.float32, 10),
    }
    dec = ExampleWithFeatureSpecDecoder(feature_spec)
    actual_json = json.loads(dec.to_json(self.example_str))
    expected_decoded = {
        "scalar_feature_1": 12,
        "scalar_feature_2": 12,
        "scalar_feature_3": 1.0,
        "varlen_feature_1": [89.0],
        "1d_vector_feature": ["this is a ,text"],
        "2d_vector_feature": [[1.0, 2.0], [3.0, 4.0]],
        "varlen_feature_2": ["female"],
        "sparse_feature_idx": [1, 4],
        "sparse_feature_value": [12.0, 20.0],
    }
    self.assertEqual(actual_json, expected_decoded)
def inputs(file_pattern):
    pattern = os.path.join(FLAGS.dataset_dir, file_pattern)
    print(pattern)
    files = tf.gfile.Glob(pattern)
    capacity = 10000 + 10000 * FLAGS.batch_size
    with tf.device('/cpu:0'):
        with tf.name_scope('input'):
            filename_queue = tf.train.string_input_producer(files)
            reader = tf.TFRecordReader()
            key, value = reader.read_up_to(filename_queue, 10240)
            record = tf.train.shuffle_batch(
                [value],
                batch_size=FLAGS.batch_size,
                num_threads=2,
                min_after_dequeue=FLAGS.batch_size * 1000,
                capacity=capacity,
                enqueue_many=True)
            parsed = tf.parse_example(
                record,
                features={
                    'label': tf.FixedLenFeature([FLAGS.label_size],
                                                dtype=tf.float32),
                    'feature': tf.SparseFeature(index_key='fea_id',
                                                value_key='fea_value',
                                                dtype=tf.float32,
                                                size=FLAGS.fea_size),
                    'neig_id': tf.VarLenFeature(dtype=tf.int64),
                    'neig_value': tf.VarLenFeature(dtype=tf.float32),
                })
    return (parsed['label'], parsed['feature'], parsed['neig_id'],
            parsed['neig_value'])
def testMakeOutputDict(self):
    schema = self.toSchema({
        'a': tf.FixedLenFeature(None, tf.int64),
        'b': tf.FixedLenFeature([], tf.float32),
        'c': tf.FixedLenFeature([1], tf.float32),
        'd': tf.FixedLenFeature([2, 2], tf.float32),
        'e': tf.VarLenFeature(tf.string),
        'f': tf.SparseFeature('idx', 'val', tf.float32, 10)
    })

    fetches = {
        'a': np.array([100, 200]),
        'b': np.array([10.0, 20.0]),
        'c': np.array([[40.0], [80.0]]),
        'd': np.array([[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]]),
        'e': tf.SparseTensorValue(
            indices=np.array([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1),
                              (1, 2)]),
            values=np.array(['doe', 'a', 'deer', 'a', 'female', 'deer']),
            dense_shape=(2, 3)),
        'f': tf.SparseTensorValue(
            indices=np.array([(0, 2), (0, 4), (0, 8), (1, 4), (1, 8)]),
            values=np.array([10.0, 20.0, 30.0, 40.0, 50.0]),
            dense_shape=(2, 20))
    }

    instance_dicts = impl_helper.to_instance_dicts(schema, fetches)
    self.assertEqual(2, len(instance_dicts))
    self.assertSetEqual(set(six.iterkeys(instance_dicts[0])),
                        set(['a', 'b', 'c', 'd', 'e', 'f']))

    self.assertAllEqual(instance_dicts[0]['a'], 100)
    self.assertAllEqual(instance_dicts[0]['b'], 10.0)
    self.assertAllEqual(instance_dicts[0]['c'], [40.0])
    self.assertAllEqual(instance_dicts[0]['d'], [[1.0, 2.0], [3.0, 4.0]])
    self.assertAllEqual(instance_dicts[0]['e'], ['doe', 'a', 'deer'])
    self.assertEqual(len(instance_dicts[0]['f']), 2)
    self.assertAllEqual(instance_dicts[0]['f'][0], [2, 4, 8])
    self.assertAllEqual(instance_dicts[0]['f'][1], [10.0, 20.0, 30.0])

    self.assertAllEqual(instance_dicts[1]['a'], 200)
    self.assertAllEqual(instance_dicts[1]['b'], 20.0)
    self.assertAllEqual(instance_dicts[1]['c'], [80.0])
    self.assertAllEqual(instance_dicts[1]['d'], [[5.0, 6.0], [7.0, 8.0]])
    self.assertAllEqual(instance_dicts[1]['e'], ['a', 'female', 'deer'])
    self.assertEqual(len(instance_dicts[1]['f']), 2)
    self.assertAllEqual(instance_dicts[1]['f'][0], [4, 8])
    self.assertAllEqual(instance_dicts[1]['f'][1], [40.0, 50.0])
def parse_simple_example(serialized_batch_example, input_dim):
    data = tf.parse_example(
        serialized_batch_example,
        features={
            'label': tf.FixedLenFeature([], tf.float32),
            'deep': tf.SparseFeature(index_key='col_index',
                                     value_key='col_value',
                                     dtype=tf.float32,
                                     size=input_dim),
            'wide': tf.SparseFeature(index_key='bias_index',
                                     value_key='bias_value',
                                     dtype=tf.float32,
                                     size=input_dim)
        })
    return data['deep'], data['label'], data['wide']
def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
    expected_st_a = (  # indices, values, shape
        np.empty((0, 2), dtype=np.int64),  # indices
        np.empty((0,), dtype=np.int64),  # st_a is DT_INT64
        np.array([2, 0], dtype=np.int64))  # batch == 2, max_elems = 0

    expected_sp = (  # indices, values, shape
        np.array([[0, 0], [0, 3], [1, 7]], dtype=np.int64),
        np.array(["a", "b", "c"], dtype="|S"),
        np.array([2, 13], dtype=np.int64))  # batch == 2, max_elems = 13

    original = [
        example(features=features({
            "c": float_feature([3, 4]),
            "val": bytes_feature([b"a", b"b"]),
            "idx": int64_feature([0, 3])
        })),
        example(features=features({
            "c": float_feature([1, 2]),
            "val": bytes_feature([b"c"]),
            "idx": int64_feature([7])
        }))
    ]

    names = ["in1", "in2"]
    serialized = [m.SerializeToString() for m in original]

    a_default = [1, 2, 3]
    b_default = np.random.rand(3, 3).astype(bytes)
    expected_output = {
        "st_a": expected_st_a,
        "sp": expected_sp,
        "a": np.array(2 * [[a_default]]),
        "b": np.array(2 * [b_default]),
        "c": np.array([[3, 4], [1, 2]], dtype=np.float32),
    }

    self._test(
        {
            "example_names": names,
            "serialized": tf.convert_to_tensor(serialized),
            "features": {
                "st_a": tf.VarLenFeature(tf.int64),
                "sp": tf.SparseFeature("idx", "val", tf.string, 13),
                "a": tf.FixedLenFeature((1, 3), tf.int64,
                                        default_value=a_default),
                "b": tf.FixedLenFeature((3, 3), tf.string,
                                        default_value=b_default),
                # Feature "c" must be provided, since it has no default_value.
                "c": tf.FixedLenFeature((2,), tf.float32),
            }
        },
        expected_output)
def as_feature_spec(self, column):
    ind = self.index_fields
    if len(ind) != 1 or len(column.axes) != 1:
        raise ValueError(
            'tf.Example parser supports only 1-d sparse features.')
    index = ind[0]
    return tf.SparseFeature(index.name,
                            self._value_field_name,
                            column.domain.dtype,
                            column.axes[0].size,
                            index.is_sorted)
def make_input_fn(self, file_paths, epochs=None):
    """Loads the TFRecord files and creates the input tensors.

    Parameters
    ----------
    file_paths : list
        List of TFRecord files to read from.
    epochs : int
        Number of times to read through the dataset. If None, cycles
        through the dataset forever.
        NOTE: if specified, this creates a variable that must be
        initialized, so call tf.local_variables_initializer() and run the
        op in a session. Default is None.

    Returns
    -------
    features : Tensor
        Tensor containing a batch of cells (vectors of expression levels).
    cluster : Tensor
        Tensor containing the cluster indexes of the corresponding cells.
    """
    feature_map = {
        'scg': tf.SparseFeature(index_key='indices',
                                value_key='values',
                                dtype=tf.float32,
                                size=self.genes_no),
        'cluster_int': tf.FixedLenFeature(1, tf.int64)
    }
    options = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.GZIP)
    batched_features = tf.contrib.learn.read_batch_features(
        file_pattern=file_paths,
        batch_size=self.batch_size,
        features=feature_map,
        reader=lambda: tf.TFRecordReader(options=options),
        num_epochs=epochs)
    sgc = batched_features['scg']
    sparse = tf.sparse_reshape(sgc, (self.batch_size, self.genes_no))
    dense = tf.sparse_tensor_to_dense(sparse)
    cluster = tf.squeeze(tf.to_int32(batched_features['cluster_int']))
    features = tf.reshape(dense, (self.batch_size, self.genes_no))
    return features, cluster
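A minimal usage sketch for make_input_fn above. The enclosing class is not shown in the source, so the instance name (loader) and its constructor arguments are hypothetical; the file name is also a placeholder. The point it illustrates is the note in the docstring: passing epochs adds a local counter variable, so tf.local_variables_initializer() must run before the queue runners start.

import tensorflow as tf

loader = CellLoader(batch_size=32, genes_no=20000)  # hypothetical class/args
features, cluster = loader.make_input_fn(['cells-00000.tfrecord.gz'],
                                         epochs=1)

with tf.Session() as sess:
    # Both initializers are needed: num_epochs adds a local counter variable.
    sess.run([tf.global_variables_initializer(),
              tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        while not coord.should_stop():
            batch_features, batch_cluster = sess.run([features, cluster])
    except tf.errors.OutOfRangeError:
        pass  # the requested number of epochs has been consumed
    finally:
        coord.request_stop()
        coord.join(threads)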
def tfrecord_schema(original_dim=202498):
    return {
        'sparse': tf.SparseFeature(index_key=['token_ids'],
                                   value_key='counts',
                                   dtype=tf.int64,
                                   size=[original_dim]),
        'volid': tf.FixedLenFeature((), tf.string, default_value=""),
        # 'page_seq': tf.FixedLenFeature((), tf.string, default_value="")
    }
def parse_field_example(serialized_batch_example, input_dim, field_sizes):
    features = {
        'label': tf.FixedLenFeature([], tf.float32),
        'wide': tf.SparseFeature(index_key='bias_idx',
                                 value_key='bias_val',
                                 dtype=tf.float32,
                                 size=input_dim)
    }
    num_field = len(field_sizes)
    for i in range(num_field):
        # One SparseFeature per field, keyed by the field index.
        features['%d' % i] = tf.SparseFeature(index_key='i_%d' % i,
                                              value_key='v_%d' % i,
                                              dtype=tf.float32,
                                              size=field_sizes[i])
    data = tf.parse_example(serialized_batch_example, features=features)
    X = [data['%d' % i] for i in range(num_field)]
    return X, data['label'], data['wide']
def parse_fn(example):
    example_fmt = {
        "embedding_average": tf.FixedLenFeature([8], tf.float32),
        # size must be a fixed constant here; it cannot be passed in as a
        # hyperparameter.
        "one_hot": tf.SparseFeature(index_key=["index"],
                                    value_key="value",
                                    dtype=tf.float32,
                                    size=[15])
    }
    parsed = tf.parse_single_example(example, example_fmt)
    return parsed["embedding_average"], tf.sparse_tensor_to_dense(
        parsed["one_hot"])
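A short usage sketch for parse_fn with tf.data; the file name is a hypothetical placeholder. Since tf.parse_single_example handles one record at a time, the map is applied before batching.

import tensorflow as tf

dataset = (tf.data.TFRecordDataset('train.tfrecord')  # hypothetical path
           .map(parse_fn)
           .batch(32))
embedding_average, one_hot = dataset.make_one_shot_iterator().get_next()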
def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
    original = example(features=features({
        "c": float_feature([3, 4]),
        "val": bytes_feature([b"a", b"b"]),
        "idx": int64_feature([0, 3]),
        "st_a": float_feature([3.0, 4.0])
    }))

    serialized = original.SerializeToString()

    expected_st_a = (
        np.array([[0], [1]], dtype=np.int64),  # indices
        np.array([3.0, 4.0], dtype=np.float32),  # values
        np.array([2], dtype=np.int64))  # shape: max_values = 2

    expected_sp = (  # indices, values, shape
        np.array([[0], [3]], dtype=np.int64),
        np.array(["a", "b"], dtype="|S"),
        np.array([13], dtype=np.int64))  # max_values = 13

    a_default = [1, 2, 3]
    b_default = np.random.rand(3, 3).astype(bytes)
    expected_output = {
        "st_a": expected_st_a,
        "sp": expected_sp,
        "a": [a_default],
        "b": b_default,
        "c": np.array([3, 4], dtype=np.float32),
    }

    self._test(
        {
            "example_names": tf.convert_to_tensor("in1"),
            "serialized": tf.convert_to_tensor(serialized),
            "features": {
                "st_a": tf.VarLenFeature(tf.float32),
                "sp": tf.SparseFeature("idx", "val", tf.string, 13),
                "a": tf.FixedLenFeature((1, 3), tf.int64,
                                        default_value=a_default),
                "b": tf.FixedLenFeature((3, 3), tf.string,
                                        default_value=b_default),
                # Feature "c" must be provided, since it has no default_value.
                "c": tf.FixedLenFeature((2,), tf.float32),
            }
        },
        expected_output)
def get_TFReord_parser(self):
    '''Create the parser used to parse data read from TFRecord files.'''
    context_feature_columns, example_feature_columns = (
        self.create_feature_columns())

    # Build the feature map.
    feature_map = {}
    feature_map['label'] = tf.FixedLenFeature([self.list_size], tf.float32)
    for k in context_feature_columns:
        if k.endswith('unigrams'):
            feature_map[k] = tf.SparseFeature(
                index_key=['%s_idx' % k],
                value_key='%s_int_value' % k,
                dtype=tf.int64,
                size=[self.max_query_length])
        else:
            feature_map[k] = tf.FixedLenFeature([1], tf.float32)
    for k in example_feature_columns:
        if k.endswith('unigrams'):
            feature_map[k] = tf.SparseFeature(
                index_key=['%s_list_idx' % k, '%s_idx' % k],
                value_key='%s_int_value' % k,
                dtype=tf.int64,
                size=[self.list_size, self.max_doc_length])
        else:
            feature_map[k] = tf.FixedLenFeature([self.list_size], tf.float32)

    def parser(serialized_example):
        """Parses a single tf.Example into feature and label tensors."""
        features = tf.parse_single_example(serialized_example,
                                           features=feature_map)
        label = features.pop('label')
        print(features['bm25s'])
        return features, label

    return parser
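A brief usage sketch for the parser above; the instance name (ranker) and the file path are hypothetical. Note the design choice in the feature map: the example-level 'unigrams' features use two index keys, so each parses into a 2-d SparseTensor of dense shape [list_size, max_doc_length], while the context-level ones stay 1-d.

import tensorflow as tf

parser = ranker.get_TFReord_parser()  # ranker: hypothetical instance
dataset = (tf.data.TFRecordDataset('ranking_data.tfrecord')  # hypothetical
           .map(parser)
           .batch(8))
features, label = dataset.make_one_shot_iterator().get_next()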
def testSerializedContainingSparseFeatureReuse(self):
    original = [
        example(features=features({
            "val1": float_feature([3, 4]),
            "val2": float_feature([5, 6]),
            "idx": int64_feature([5, 10])
        })),
        example(features=features({
            "val1": float_feature([]),  # empty float list
            "idx": int64_feature([])
        })),
    ]

    serialized = [m.SerializeToString() for m in original]

    expected_sp1 = (  # indices, values, shape
        np.array([[0, 5], [0, 10]], dtype=np.int64),
        np.array([3.0, 4.0], dtype=np.float32),
        np.array([2, 13], dtype=np.int64))  # batch == 2, max_elems = 13

    expected_sp2 = (  # indices, values, shape
        np.array([[0, 5], [0, 10]], dtype=np.int64),
        np.array([5.0, 6.0], dtype=np.float32),
        np.array([2, 7], dtype=np.int64))  # batch == 2, max_elems = 7

    expected_output = {
        "sp1": expected_sp1,
        "sp2": expected_sp2,
    }

    self._test(
        {
            "serialized": tf.convert_to_tensor(serialized),
            "features": {
                "sp1": tf.SparseFeature("idx", "val1", tf.float32, 13),
                "sp2": tf.SparseFeature("idx", "val2", tf.float32, 7)
            }
        },
        expected_output)
def test_round_trip(self):
    feature_spec = {
        "scalar_feature_1": tf.FixedLenFeature(shape=[], dtype=tf.int64),
        "scalar_feature_2": tf.FixedLenFeature(shape=[], dtype=tf.int64),
        "scalar_feature_3": tf.FixedLenFeature(shape=[], dtype=tf.float32),
        "varlen_feature_1": tf.VarLenFeature(dtype=tf.float32),
        "varlen_feature_2": tf.VarLenFeature(dtype=tf.string),
        "1d_vector_feature": tf.FixedLenFeature(shape=[1], dtype=tf.string),
        "2d_vector_feature": tf.FixedLenFeature(shape=[2, 2],
                                                dtype=tf.float32),
        "sparse_feature": tf.SparseFeature("idx", "value", tf.float32, 10),
    }
    inferred_schema = feature_spec_to_schema(feature_spec)
    inferred_feature_spec = schema_to_feature_spec(inferred_schema)
    self.assertEqual(inferred_feature_spec, feature_spec)
@classmethod
def _parse_sparse_feature(cls, feature):
    # type: (Schema.Feature) -> Tuple[str, tf.SparseFeature]
    if len(feature.index_feature) == 1:
        index_key = feature.index_feature[0].name
    else:
        index_key = [idf.name for idf in feature.index_feature]
    dtype = cls._tf_type_mapper.proto_to_tf_type(feature, is_sparse=True)
    if len(feature.dense_shape.dim) == 1:
        size = feature.dense_shape.dim[0].size
    else:
        size = [d.size for d in feature.dense_shape.dim]
    return feature.name, tf.SparseFeature(
        index_key=index_key,
        value_key=feature.value_feature.name,
        dtype=dtype,
        size=size)
def as_feature_spec(self, column):
    ind = self.index_fields
    if len(ind) != 1 or len(column.axes) != 1:
        raise ValueError(
            'tf.Example parser supports only 1-d sparse features.')
    index = ind[0]
    if column.domain.dtype not in _TF_EXAMPLE_ALLOWED_TYPES:
        raise ValueError(
            'tf.Example parser supports only types {}, so it is '
            'invalid to generate a feature_spec with type '
            '{}.'.format(_TF_EXAMPLE_ALLOWED_TYPES,
                         repr(column.domain.dtype)))
    return tf.SparseFeature(index.name,
                            self._value_field_name,
                            column.domain.dtype,
                            column.axes[0].size,
                            index.is_sorted)
def test_sparse_feature_incorrect_values(self):
    input_schema = dataset_schema.from_feature_spec({
        'a': tf.SparseFeature('idx', 'value', tf.float32, 10),
    })
    coder = csv_coder.CsvCoder(column_names=['idx', 'value'],
                               schema=input_schema)

    # Negative index.
    with self.assertRaisesRegexp(ValueError, 'has index -1 out of range'):
        coder.decode('-1,12.0')

    # Index equal to size.
    with self.assertRaisesRegexp(ValueError, 'has index 10 out of range'):
        coder.decode('10,12.0')

    # Index greater than size.
    with self.assertRaisesRegexp(ValueError, 'has index 11 out of range'):
        coder.decode('11,12.0')
def test_sparse_feature_missing_values(self):
    input_schema = dataset_schema.from_feature_spec({
        'a': tf.SparseFeature('idx', 'value', tf.float32, 10),
    })
    coder = csv_coder.CsvCoder(column_names=['idx', 'value'],
                               schema=input_schema)

    # Missing both value and index (which is allowed).
    self.assertEqual(coder.decode(','), {'a': ([], [])})

    # Missing index only (not allowed).
    with self.assertRaisesRegexp(ValueError,
                                 'expected an index in column "idx"'):
        coder.decode(',12.0')

    # Missing value only (not allowed).
    with self.assertRaisesRegexp(ValueError,
                                 'expected a value in column "value"'):
        coder.decode('1,')
def testMakeOutputDict(self):
    schema = self.toSchema({
        'a': tf.FixedLenFeature(None, tf.int64),
        'b': tf.FixedLenFeature([2, 2], tf.float32),
        'c': tf.VarLenFeature(tf.string),
        'd': tf.SparseFeature('idx', 'val', tf.float32, 10)
    })

    fetches = {
        'a': np.asarray([100, 200]),
        'b': np.asarray([[[1.0, 2.0], [3.0, 4.0]],
                         [[5.0, 6.0], [7.0, 8.0]]]),
        'c': tf.SparseTensorValue(
            indices=[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)],
            values=['doe', 'a', 'deer', 'a', 'female', 'deer'],
            dense_shape=(2, 3)),
        'd': tf.SparseTensorValue(indices=[(0, 2), (0, 4), (0, 8)],
                                  values=[10.0, 20.0, 30.0],
                                  dense_shape=(2, 20))
    }

    output_dicts = impl_helper.make_output_dict(schema, fetches)
    self.assertEqual(2, len(output_dicts))

    self.assertSetEqual(set(output_dicts[0].keys()),
                        set(['a', 'b', 'c', 'idx', 'val']))
    self.assertAllEqual(output_dicts[0]['a'], 100)
    self.assertAllEqual(output_dicts[0]['b'], [[1.0, 2.0], [3.0, 4.0]])
    self.assertAllEqual(output_dicts[0]['c'], ['doe', 'a', 'deer'])
    self.assertAllEqual(output_dicts[0]['idx'], [2, 4, 8])
    self.assertAllEqual(output_dicts[0]['val'], [10.0, 20.0, 30.0])

    self.assertAllEqual(output_dicts[1]['a'], 200)
    self.assertAllEqual(output_dicts[1]['b'], [[5.0, 6.0], [7.0, 8.0]])
    self.assertAllEqual(output_dicts[1]['c'], ['a', 'female', 'deer'])
    self.assertAllEqual(output_dicts[1]['idx'], [])
    self.assertAllEqual(output_dicts[1]['val'], [])
def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
    expected_idx = (  # indices, values, shape
        np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64),
        np.array([0, 3, 7, 1]),
        np.array([2, 2], dtype=np.int64))  # batch == 2, max_elems = 2

    expected_sp = (  # indices, values, shape
        np.array([[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64),
        np.array(["a", "b", "d", "c"], dtype="|S"),
        np.array([2, 13], dtype=np.int64))  # batch == 2, max_elems = 13

    original = [
        example(features=features({
            "val": bytes_feature([b"a", b"b"]),
            "idx": int64_feature([0, 3])
        })),
        example(features=features({
            "val": bytes_feature([b"c", b"d"]),
            "idx": int64_feature([7, 1])
        }))
    ]

    names = ["in1", "in2"]
    serialized = [m.SerializeToString() for m in original]

    expected_output = {
        "idx": expected_idx,
        "sp": expected_sp,
    }

    self._test(
        {
            "example_names": names,
            "serialized": tf.convert_to_tensor(serialized),
            "features": {
                "idx": tf.VarLenFeature(tf.int64),
                "sp": tf.SparseFeature("idx", "val", tf.string, 13),
            }
        },
        expected_output)
def testSerializedContainingSparseFeature(self):
    original = [
        example(features=features({
            "val": float_feature([3, 4]),
            "idx": int64_feature([5, 10])
        })),
        example(features=features({
            "val": float_feature([]),  # empty float list
            "idx": int64_feature([])
        })),
        example(features=features({
            "val": feature(),  # feature with nothing in it
            # missing idx feature
        })),
        example(features=features({
            "val": float_feature([1, 2, -1]),
            "idx": int64_feature([0, 9, 3])  # unsorted
        }))
    ]

    serialized = [m.SerializeToString() for m in original]

    expected_sp = (  # indices, values, shape
        np.array([[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64),
        np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32),
        np.array([4, 13], dtype=np.int64))  # batch == 4, max_elems = 13

    expected_output = {
        "sp": expected_sp,
    }

    self._test(
        {
            "serialized": tf.convert_to_tensor(serialized),
            "features": {
                "sp": tf.SparseFeature("idx", "val", tf.float32, 13)
            }
        },
        expected_output)
def test_all_values_present(self):
    columns = ['a', 'b', 'c', 'd', 'e']
    input_schema = dataset_schema.from_feature_spec({
        'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
        'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
        'c': tf.VarLenFeature(dtype=tf.string),
        'y': tf.SparseFeature('d', 'e', tf.float32, 10),
    })
    coder = csv_coder.CsvCoder(column_names=columns, schema=input_schema)
    # Column 'c' is specified as a string, so its value is not cast.
    self.assertEqual(
        coder.decode('a_value,1.0,0,1,12.0'),
        {
            'a': 'a_value',
            'b': 1.0,
            'c': ['0'],
            'y': ([12.0], [1])
        })
class ExampleProtoCoderTest(unittest.TestCase):

    _INPUT_SCHEMA = dataset_schema.from_feature_spec({
        'scalar_feature_1': tf.FixedLenFeature(shape=[], dtype=tf.int64),
        'scalar_feature_2': tf.FixedLenFeature(shape=[], dtype=tf.int64),
        'scalar_feature_3': tf.FixedLenFeature(shape=[], dtype=tf.float32),
        'varlen_feature_1': tf.VarLenFeature(dtype=tf.float32),
        '1d_vector_feature': tf.FixedLenFeature(shape=[1], dtype=tf.string),
        'varlen_feature_2': tf.VarLenFeature(dtype=tf.string),
        'sparse_feature': tf.SparseFeature('idx', 'value', tf.float32, 10),
    })

    def _assert_encode_decode(self, coder, expected_proto_text,
                              expected_decoded):
        example = tf.train.Example()
        text_format.Merge(expected_proto_text, example)
        data = example.SerializeToString()

        # Assert the data is decoded into the expected format.
        decoded = coder.decode(data)
        np.testing.assert_equal(expected_decoded, decoded)

        # Assert the decoded data can be encoded back into the original
        # proto.
        encoded = coder.encode(decoded)
        parsed_example = tf.train.Example()
        parsed_example.ParseFromString(encoded)
        self.assertEqual(example, parsed_example)

        # Assert the data can be decoded from the encoded string.
        decoded_again = coder.decode(encoded)
        np.testing.assert_equal(expected_decoded, decoded_again)

    def _assert_decode_encode(self, coder, expected_proto_text,
                              expected_decoded):
        example = tf.train.Example()
        text_format.Merge(expected_proto_text, example)

        # Assert the expected decoded data can be encoded into the expected
        # proto.
        encoded = coder.encode(expected_decoded)
        parsed_example = tf.train.Example()
        parsed_example.ParseFromString(encoded)
        self.assertEqual(example, parsed_example)

        # Assert the encoded data can be decoded into the original input.
        decoded = coder.decode(encoded)
        np.testing.assert_equal(expected_decoded, decoded)

        # Assert the decoded data can be encoded back into the expected
        # proto.
        encoded_again = coder.encode(decoded)
        parsed_example_again = tf.train.Example()
        parsed_example_again.ParseFromString(encoded_again)
        np.testing.assert_equal(example, parsed_example_again)

    def test_example_proto_coder(self):
        # We use a single coder and invoke multiple encodes and decodes on
        # it to make sure that cache consistency is implemented properly.
        coder = example_proto_coder.ExampleProtoCoder(self._INPUT_SCHEMA)

        # Python types.
        example_proto_text = """
        features {
          feature { key: "scalar_feature_1"
                    value { int64_list { value: [ 12 ] } } }
          feature { key: "varlen_feature_1"
                    value { float_list { value: [ 89.0 ] } } }
          feature { key: "scalar_feature_2"
                    value { int64_list { value: [ 12 ] } } }
          feature { key: "scalar_feature_3"
                    value { float_list { value: [ 1.0 ] } } }
          feature { key: "1d_vector_feature"
                    value { bytes_list { value: [ 'this is a ,text' ] } } }
          feature { key: "varlen_feature_2"
                    value { bytes_list { value: [ 'female' ] } } }
          feature { key: "value"
                    value { float_list { value: [ 12.0, 20.0 ] } } }
          feature { key: "idx"
                    value { int64_list { value: [ 1, 4 ] } } }
        }
        """
        expected_decoded = {
            'scalar_feature_1': 12,
            'scalar_feature_2': 12,
            'scalar_feature_3': 1.0,
            'varlen_feature_1': [89.0],
            '1d_vector_feature': ['this is a ,text'],
            'varlen_feature_2': ['female'],
            'sparse_feature': ([12.0, 20.0], [1, 4])
        }
        self._assert_encode_decode(coder, example_proto_text,
                                   expected_decoded)
        self._assert_decode_encode(coder, example_proto_text,
                                   expected_decoded)

        # Numpy types (with different values from above).
example_proto_text = """ features { feature { key: "scalar_feature_1" value { int64_list { value: [ 13 ] } } } feature { key: "varlen_feature_1" value { float_list { } } } feature { key: "scalar_feature_2" value { int64_list { value: [ 14 ] } } } feature { key: "scalar_feature_3" value { float_list { value: [ 2.0 ] } } } feature { key: "1d_vector_feature" value { bytes_list { value: [ 'this is another ,text' ] } } } feature { key: "varlen_feature_2" value { bytes_list { value: [ 'male' ] } } } feature { key: "value" value { float_list { value: [ 13.0, 21.0 ] } } } feature { key: "idx" value { int64_list { value: [ 2, 5 ] } } } } """ expected_decoded = { 'scalar_feature_1': np.array(13), 'scalar_feature_2': np.int32(14), 'scalar_feature_3': np.array(2.0), 'varlen_feature_1': np.array([]), '1d_vector_feature': np.array(['this is another ,text']), 'varlen_feature_2': np.array(['male']), 'sparse_feature': (np.array([13.0, 21.0]), np.array([2, 5])) } self._assert_encode_decode(coder, example_proto_text, expected_decoded) self._assert_decode_encode(coder, example_proto_text, expected_decoded) def test_example_proto_coder_picklable(self): coder = example_proto_coder.ExampleProtoCoder(self._INPUT_SCHEMA) example_proto_text = """ features { feature { key: "scalar_feature_1" value { int64_list { value: [ 12 ] } } } feature { key: "varlen_feature_1" value { float_list { value: [ 89.0 ] } } } feature { key: "scalar_feature_2" value { int64_list { value: [ 12 ] } } } feature { key: "scalar_feature_3" value { float_list { value: [ 2.0 ] } } } feature { key: "1d_vector_feature" value { bytes_list { value: [ 'this is a ,text' ] } } } feature { key: "varlen_feature_2" value { bytes_list { value: [ 'female' ] } } } feature { key: "value" value { float_list { value: [ 12.0, 20.0 ] } } } feature { key: "idx" value { int64_list { value: [ 1, 4 ] } } } } """ expected_decoded = { 'scalar_feature_1': 12, 'scalar_feature_2': 12, 'scalar_feature_3': 2.0, 'varlen_feature_1': [89.0], '1d_vector_feature': ['this is a ,text'], 'varlen_feature_2': ['female'], 'sparse_feature': ([12.0, 20.0], [1, 4]) } # Ensure we can pickle right away. coder = pickle.loads(pickle.dumps(coder)) self._assert_encode_decode(coder, example_proto_text, expected_decoded) self._assert_decode_encode(coder, example_proto_text, expected_decoded) # And after use. coder = pickle.loads(pickle.dumps(coder)) self._assert_encode_decode(coder, example_proto_text, expected_decoded) self._assert_decode_encode(coder, example_proto_text, expected_decoded)
def testMakeFeedDict(self):
    tensors = {
        'a': tf.placeholder(tf.int64),
        'b': tf.placeholder(tf.float32),
        'c': tf.placeholder(tf.float32),
        'd': tf.placeholder(tf.float32),
        'e': tf.sparse_placeholder(tf.string),
        'f': tf.sparse_placeholder(tf.float32)
    }
    schema = self.toSchema({
        'a': tf.FixedLenFeature(None, tf.int64),
        'b': tf.FixedLenFeature([], tf.float32),
        'c': tf.FixedLenFeature([1], tf.float32),
        'd': tf.FixedLenFeature([2, 2], tf.float32),
        'e': tf.VarLenFeature(tf.string),
        'f': tf.SparseFeature('idx', 'val', tf.float32, 10)
    })

    # Feed some dense and sparse values.
    instances = [{
        'a': 100,
        'b': 1.0,
        'c': [2.0],
        'd': [[1.0, 2.0], [3.0, 4.0]],
        'e': ['doe', 'a', 'deer'],
        'f': ([2, 4, 8], [10.0, 20.0, 30.0])
    }, {
        'a': 100,
        'b': 2.0,
        'c': [4.0],
        'd': [[5.0, 6.0], [7.0, 8.0]],
        'e': ['a', 'female', 'deer'],
        'f': ([], [])
    }]

    feed_dict = impl_helper.make_feed_dict(tensors, schema, instances)
    self.assertSetEqual(set(six.iterkeys(feed_dict)),
                        set(six.itervalues(tensors)))
    self.assertAllEqual(feed_dict[tensors['a']], [100, 100])
    self.assertAllEqual(feed_dict[tensors['b']], [1.0, 2.0])
    self.assertAllEqual(feed_dict[tensors['c']], [[2.0], [4.0]])
    self.assertAllEqual(
        feed_dict[tensors['d']],
        [[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]])
    self.assertSparseValuesEqual(
        feed_dict[tensors['e']],
        tf.SparseTensorValue(
            indices=[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)],
            values=['doe', 'a', 'deer', 'a', 'female', 'deer'],
            dense_shape=(2, 3)))
    self.assertSparseValuesEqual(
        feed_dict[tensors['f']],
        tf.SparseTensorValue(indices=[(0, 2), (0, 4), (0, 8)],
                             values=[10.0, 20.0, 30.0],
                             dense_shape=(2, 10)))

    # Feed numpy versions of everything.
    instances = [{
        'a': np.int64(100),
        'b': np.array(1.0, np.float32),
        'c': np.array([2.0], np.float32),
        'd': np.array([[1.0, 2.0], [3.0, 4.0]], np.float32),
        'e': ['doe', 'a', 'deer'],
        'f': (np.array([2, 4, 8]), np.array([10.0, 20.0, 30.0])),
    }, {
        'a': np.int64(100),
        'b': np.array(2.0, np.float32),
        'c': np.array([4.0], np.float32),
        'd': np.array([[5.0, 6.0], [7.0, 8.0]], np.float32),
        'e': ['a', 'female', 'deer'],
        'f': (np.array([], np.int32), np.array([], np.float32))
    }]

    feed_dict = impl_helper.make_feed_dict(tensors, schema, instances)
    self.assertSetEqual(set(six.iterkeys(feed_dict)),
                        set(six.itervalues(tensors)))
    self.assertAllEqual(feed_dict[tensors['a']], [100, 100])
    self.assertAllEqual(feed_dict[tensors['b']], [1.0, 2.0])
    self.assertAllEqual(feed_dict[tensors['c']], [[2.0], [4.0]])
    self.assertAllEqual(
        feed_dict[tensors['d']],
        [[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]])
    self.assertSparseValuesEqual(
        feed_dict[tensors['e']],
        tf.SparseTensorValue(
            indices=[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)],
            values=['doe', 'a', 'deer', 'a', 'female', 'deer'],
            dense_shape=(2, 3)))
    self.assertSparseValuesEqual(
        feed_dict[tensors['f']],
        tf.SparseTensorValue(indices=[(0, 2), (0, 4), (0, 8)],
                             values=[10.0, 20.0, 30.0],
                             dense_shape=(2, 10)))

    # Feed some empty sparse values.
    instances = [{
        'a': 100,
        'b': 5.0,
        'c': [1.0],
        'd': [[1.0, 2.0], [3.0, 4.0]],
        'e': [],
        'f': ([], [])
    }]
    feed_dict = impl_helper.make_feed_dict(tensors, schema, instances)
    self.assertSparseValuesEqual(
        feed_dict[tensors['e']],
        tf.SparseTensorValue(indices=np.empty([0, 2], np.int64),
                             values=[],
                             dense_shape=(1, 0)))
    self.assertSparseValuesEqual(
        feed_dict[tensors['f']],
        tf.SparseTensorValue(indices=np.empty([0, 2], np.int64),
                             values=[],
                             dense_shape=(1, 10)))
import tensorflow as tf

from tensorflow_transform import test_case
from tensorflow_transform.coders import csv_coder
from tensorflow_transform.tf_metadata import dataset_schema

_COLUMNS = [
    'numeric1', 'text1', 'category1', 'idx', 'numeric2', 'value', 'numeric3'
]

_FEATURE_SPEC = {
    'numeric1': tf.FixedLenFeature([], tf.int64),
    'numeric2': tf.VarLenFeature(tf.float32),
    'numeric3': tf.FixedLenFeature([1], tf.int64),
    'text1': tf.FixedLenFeature([], tf.string),
    'category1': tf.VarLenFeature(tf.string),
    'y': tf.SparseFeature('idx', 'value', tf.float32, 10),
}

_ENCODE_DECODE_CASES = [
    dict(
        testcase_name='multiple_columns',
        columns=_COLUMNS,
        feature_spec=_FEATURE_SPEC,
        csv_line='12,"this is a ,text",categorical_value,1,89.0,12.0,20',
        instance={
            'category1': [b'categorical_value'],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': b'this is a ,text',
            'y': ([1], [12.0])
        }),
]
class TestCSVCoder(unittest.TestCase):

    _COLUMNS = [
        'numeric1', 'text1', 'category1', 'idx', 'numeric2', 'value',
        'numeric3'
    ]
    _INPUT_SCHEMA = dataset_schema.from_feature_spec({
        'numeric1': tf.FixedLenFeature(shape=[], dtype=tf.int64),
        'numeric2': tf.VarLenFeature(dtype=tf.float32),
        'numeric3': tf.FixedLenFeature(shape=[1], dtype=tf.int64),
        'text1': tf.FixedLenFeature(shape=[], dtype=tf.string),
        'category1': tf.VarLenFeature(dtype=tf.string),
        'y': tf.SparseFeature('idx', 'value', tf.float32, 10),
    })

    # Case format: (csv_line, value, multivalent, feature_spec).
    _ENCODE_DECODE_CASES = [
        # FixedLenFeature scalar int.
        ('12', 12, False, tf.FixedLenFeature(shape=[], dtype=tf.int64)),
        # FixedLenFeature scalar float without decimal point.
        ('12', 12, False, tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # FixedLenFeature length 1 vector int.
        ('12', [12], False, tf.FixedLenFeature(shape=[1], dtype=tf.int64)),
        # FixedLenFeature size 1 matrix int.
        ('12', [[12]], False,
         tf.FixedLenFeature(shape=[1, 1], dtype=tf.int64)),
        # FixedLenFeature unquoted text.
        ('this is unquoted text', 'this is unquoted text', False,
         tf.FixedLenFeature(shape=[], dtype=tf.string)),
        # FixedLenFeature quoted text.
        ('"this is a ,text"', 'this is a ,text', False,
         tf.FixedLenFeature(shape=[], dtype=tf.string)),
        # VarLenFeature text.
        ('a test', ['a test'], False, tf.VarLenFeature(dtype=tf.string)),
        # SparseFeature float, one value.
        ('5,2.0', ([5], [2.0]), False,
         tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature float, no values.
        (',', ([], []), False,
         tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # FixedLenFeature scalar int, multivalent.
        ('12', 12, True, tf.FixedLenFeature(shape=[], dtype=tf.int64)),
        # FixedLenFeature length 1 vector int, multivalent.
        ('12', [12], True, tf.FixedLenFeature(shape=[1], dtype=tf.int64)),
        # FixedLenFeature length 2 vector int, multivalent.
        ('12|14', [12, 14], True,
         tf.FixedLenFeature(shape=[2], dtype=tf.int64)),
        # FixedLenFeature size 1 matrix int, multivalent.
        ('12', [[12]], True,
         tf.FixedLenFeature(shape=[1, 1], dtype=tf.int64)),
        # FixedLenFeature size (2, 2) matrix int, multivalent.
        ('12|13|14|15', [[12, 13], [14, 15]], True,
         tf.FixedLenFeature(shape=[2, 2], dtype=tf.int64)),
    ]

    # Case format: (csv_line, error_type, error_msg, multivalent,
    # feature_spec).
    _DECODE_ERROR_CASES = [
        # FixedLenFeature scalar numeric missing value.
        ('', ValueError, r'expected a value on column \"x\"', False,
         tf.FixedLenFeature(shape=[], dtype=tf.int64)),
        # FixedLenFeature length 1 vector numeric missing value.
        ('', ValueError, r'expected a value on column \"x\"', False,
         tf.FixedLenFeature(shape=[1], dtype=tf.int64)),
        # FixedLenFeature length >1 vector, not multivalent.
        ('1', ValueError, r'FixedLenFeature \"x\" was not multivalent',
         False, tf.FixedLenFeature(shape=[2], dtype=tf.int64)),
        # FixedLenFeature scalar text missing value.
        ('', ValueError, r'expected a value on column \"x\"', False,
         tf.FixedLenFeature(shape=[], dtype=tf.string)),
        # SparseFeature with missing value but present index.
        ('5,', ValueError,
         r'SparseFeature \"x\" has indices and values of different lengths',
         False, tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature with missing index but present value.
        (',2.0', ValueError,
         r'SparseFeature \"x\" has indices and values of different lengths',
         False, tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature with negative index.
        ('-1,2.0', ValueError, r'has index -1 out of range', False,
         tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature with index equal to size.
        ('10,2.0', ValueError, r'has index 10 out of range', False,
         tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # SparseFeature with index greater than size.
        ('11,2.0', ValueError, r'has index 11 out of range', False,
         tf.SparseFeature('idx', 'value', tf.float32, 10)),
        # FixedLenFeature numeric with a non-numeric text value.
        ('test', ValueError, r'could not convert string to float: test',
         False, tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # FixedLenFeature scalar float, multivalent, too many values.
        ('1|2', ValueError,
         r'FixedLenFeature \"x\" got wrong number of values', True,
         tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # FixedLenFeature length 1 float, multivalent, too many values.
        ('1|2', ValueError,
         r'FixedLenFeature \"x\" got wrong number of values', True,
         tf.FixedLenFeature(shape=[1], dtype=tf.float32)),
        # FixedLenFeature length 2 float, multivalent, too few values.
        ('1', ValueError,
         r'FixedLenFeature \"x\" got wrong number of values', True,
         tf.FixedLenFeature(shape=[2], dtype=tf.float32)),
    ]

    _ENCODE_ERROR_CASES = [
        # FixedLenFeature length 2 vector, multivalent, with wrong number of
        # values.
        ([1, 2, 3], ValueError,
         r'FixedLenFeature \"x\" got wrong number of values', True,
         tf.FixedLenFeature(shape=[2], dtype=tf.string))
    ]

    _DECODE_ONLY_CASES = [
        # FixedLenFeature scalar float with decimal point.
        ('12.0', 12, False, tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # FixedLenFeature scalar float with quoted value.
        ('"12.0"', 12, False,
         tf.FixedLenFeature(shape=[], dtype=tf.float32)),
        # VarLenFeature text with missing value.
        ('', [], False, tf.VarLenFeature(dtype=tf.string)),
    ]

    longMessage = True

    def _msg_for_decode_case(self, csv_line, feature_spec):
        return 'While decoding "{csv_line}" with FeatureSpec {feature_spec}'.format(
            csv_line=csv_line, feature_spec=feature_spec)

    def _msg_for_encode_case(self, value, feature_spec):
        return 'While encoding {value} with FeatureSpec {feature_spec}'.format(
            value=value, feature_spec=feature_spec)

    def _assert_encode_decode(self, coder, data, expected_decoded):
        decoded = coder.decode(data)
        np.testing.assert_equal(decoded, expected_decoded)

        encoded = coder.encode(decoded)
        np.testing.assert_equal(encoded, data.encode('utf-8'))

        decoded_again = coder.decode(encoded)
        np.testing.assert_equal(decoded_again, expected_decoded)

    def test_csv_coder(self):
        data = '12,"this is a ,text",categorical_value,1,89.0,12.0,20'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        # Python types.
        expected_decoded = {
            'category1': ['categorical_value'],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1], [12.0])
        }
        self._assert_encode_decode(coder, data, expected_decoded)

        # Numpy types.
        expected_decoded = {
            'category1': np.array(['categorical_value']),
            'numeric1': np.array(12),
            'numeric2': np.array([89.0]),
            'numeric3': np.array([20]),
            'text1': np.array(['this is a ,text']),
            'y': (np.array(1), np.array([12.0]))
        }
        self._assert_encode_decode(coder, data, expected_decoded)

    def test_csv_coder_with_unicode(self):
        data = u'12,"this is a ,text",Hello κόσμε,1,89.0,12.0,20'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        # Python types.
        expected_decoded = {
            'category1': [u'Hello κόσμε'.encode('utf-8')],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1], [12.0])
        }
        self._assert_encode_decode(coder, data, expected_decoded)

        # Numpy types.
        expected_decoded = {
            'category1': np.array([u'Hello κόσμε'.encode('utf-8')]),
            'numeric1': np.array(12),
            'numeric2': np.array([89.0]),
            'numeric3': np.array([20]),
            'text1': np.array(['this is a ,text']),
            'y': (np.array(1), np.array([12.0]))
        }
        self._assert_encode_decode(coder, data, expected_decoded)

    def test_tsv_coder(self):
        data = '12\t"this is a \ttext"\tcategorical_value\t1\t89.0\t12.0\t20'
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA,
                                   delimiter='\t')
        expected_decoded = {
            'category1': ['categorical_value'],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a \ttext',
            'y': ([1], [12.0])
        }
        self._assert_encode_decode(coder, data, expected_decoded)

    def test_valency(self):
        data = ('11|12,"this is a ,text",categorical_value|other_value,1|3,'
                '89.0|91.0,12.0|15.0,20')
        feature_spec = self._INPUT_SCHEMA.as_feature_spec().copy()
        feature_spec['numeric1'] = tf.FixedLenFeature(shape=[2],
                                                      dtype=tf.int64)
        schema = dataset_schema.from_feature_spec(feature_spec)
        multivalent_columns = ['numeric1', 'numeric2', 'y']
        coder = csv_coder.CsvCoder(self._COLUMNS, schema,
                                   delimiter=',',
                                   secondary_delimiter='|',
                                   multivalent_columns=multivalent_columns)
        expected_decoded = {
            'category1': ['categorical_value|other_value'],
            'numeric1': [11, 12],
            'numeric2': [89.0, 91.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1, 3], [12.0, 15.0])
        }
        self._assert_encode_decode(coder, data, expected_decoded)

    # Test successful decoding with a single column.
    def testDecode(self):
        for csv_line, value, multivalent, feature_spec in (
                self._ENCODE_DECODE_CASES + self._DECODE_ONLY_CASES):
            schema = dataset_schema.from_feature_spec({'x': feature_spec})
            if isinstance(feature_spec, tf.SparseFeature):
                columns = [feature_spec.index_key, feature_spec.value_key]
            else:
                columns = 'x'
            if multivalent:
                coder = csv_coder.CsvCoder(columns, schema,
                                           secondary_delimiter='|',
                                           multivalent_columns=columns)
            else:
                coder = csv_coder.CsvCoder(columns, schema)
            np.testing.assert_equal(
                coder.decode(csv_line), {'x': value},
                self._msg_for_decode_case(csv_line, feature_spec))

    # Test decode errors with a single column.
    def testDecodeErrors(self):
        for csv_line, error_type, error_msg, multivalent, feature_spec in (
                self._DECODE_ERROR_CASES):
            schema = dataset_schema.from_feature_spec({'x': feature_spec})
            if isinstance(feature_spec, tf.SparseFeature):
                columns = [feature_spec.index_key, feature_spec.value_key]
            else:
                columns = 'x'
            with self.assertRaisesRegexp(error_type, error_msg,
                                         msg=self._msg_for_decode_case(
                                             csv_line, feature_spec)):
                # We don't distinguish between errors in the coder
                # constructor and in the decode method.
                if multivalent:
                    coder = csv_coder.CsvCoder(
                        columns, schema,
                        secondary_delimiter='|',
                        multivalent_columns=columns)
                else:
                    coder = csv_coder.CsvCoder(columns, schema)
                coder.decode(csv_line)

    # Test successful encoding with a single column.
    def testEncode(self):
        for csv_line, value, multivalent, feature_spec in (
                self._ENCODE_DECODE_CASES):
            schema = dataset_schema.from_feature_spec({'x': feature_spec})
            if isinstance(feature_spec, tf.SparseFeature):
                columns = [feature_spec.index_key, feature_spec.value_key]
            else:
                columns = 'x'
            if multivalent:
                coder = csv_coder.CsvCoder(columns, schema,
                                           secondary_delimiter='|',
                                           multivalent_columns=columns)
            else:
                coder = csv_coder.CsvCoder(columns, schema)
            self.assertEqual(coder.encode({'x': value}), csv_line,
                             msg=self._msg_for_encode_case(value,
                                                           feature_spec))

    # Test encode errors with a single column.
    def testEncodeErrors(self):
        for value, error_type, error_msg, multivalent, feature_spec in (
                self._ENCODE_ERROR_CASES):
            schema = dataset_schema.from_feature_spec({'x': feature_spec})
            if isinstance(feature_spec, tf.SparseFeature):
                columns = [feature_spec.index_key, feature_spec.value_key]
            else:
                columns = 'x'
            with self.assertRaisesRegexp(error_type, error_msg,
                                         msg=self._msg_for_encode_case(
                                             value, feature_spec)):
                if multivalent:
                    coder = csv_coder.CsvCoder(
                        columns, schema,
                        secondary_delimiter='|',
                        multivalent_columns=columns)
                else:
                    coder = csv_coder.CsvCoder(columns, schema)
                coder.encode({'x': value})

    def test_missing_data(self):
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)
        data = '12,,categorical_value,1,89.0,12.0,20'
        with self.assertRaisesRegexp(ValueError,
                                     'expected a value on column \"text1\"'):
            coder.decode(data)

    def test_bad_row(self):
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        # The data has more columns than expected.
        data = ('12,"this is a ,text",categorical_value,1,89.0,12.0,'
                '"oh no, I\'m an error",14')
        with self.assertRaisesRegexp(
                Exception, 'Columns do not match specified csv headers'):
            coder.decode(data)

        # The data has fewer columns than expected.
        data = '12,"this is a ,text",categorical_value"'
        with self.assertRaisesRegexp(
                Exception, 'Columns do not match specified csv headers'):
            coder.decode(data)

    def test_column_not_found(self):
        with self.assertRaisesRegexp(ValueError, 'Column not found: '):
            csv_coder.CsvCoder([], self._INPUT_SCHEMA)

    def test_picklable(self):
        encoded_data = '12,"this is a ,text",categorical_value,1,89.0,12.0,20'
        expected_decoded = {
            'category1': ['categorical_value'],
            'numeric1': 12,
            'numeric2': [89.0],
            'numeric3': [20],
            'text1': 'this is a ,text',
            'y': ([1], [12.0])
        }
        coder = csv_coder.CsvCoder(self._COLUMNS, self._INPUT_SCHEMA)

        # Ensure we can pickle right away.
        coder = pickle.loads(pickle.dumps(coder))
        self._assert_encode_decode(coder, encoded_data, expected_decoded)

        # And after use.
        coder = pickle.loads(pickle.dumps(coder))
        self._assert_encode_decode(coder, encoded_data, expected_decoded)

    def test_decode_errors(self):
        input_schema = dataset_schema.from_feature_spec({
            'b': tf.FixedLenFeature(shape=[], dtype=tf.float32),
            'a': tf.FixedLenFeature(shape=[], dtype=tf.string),
        })
        coder = csv_coder.CsvCoder(column_names=['a', 'b'],
                                   schema=input_schema)

        # Test bad csv.
        with self.assertRaisesRegexp(
                csv_coder.DecodeError,
                '\'int\' object has no attribute \'encode\': 123'):
            coder.decode(123)

        # Test extra column.
        with self.assertRaisesRegexp(
                csv_coder.DecodeError,
                'Columns do not match specified csv headers'):
            coder.decode('1,2,')

        # Test missing column.
        with self.assertRaisesRegexp(
                csv_coder.DecodeError,
                'Columns do not match specified csv headers'):
            coder.decode('a_value')

        # Test empty row.
        with self.assertRaisesRegexp(
                csv_coder.DecodeError,
                'Columns do not match specified csv headers'):
            coder.decode('')