def testSerializedContainingVarLenDenseLargerBatch(self, batch_size):
  np.random.seed(3456)
  # During parsing, data read from the serialized proto is stored in buffers.
  # For small batch sizes, a buffer will contain one minibatch entry.
  # For larger batch sizes, a buffer may contain several minibatch
  # entries. This test identified a bug where the code that copied
  # data out of the buffers and into the output tensors assumed each
  # buffer only contained one minibatch entry. The bug has since been fixed.
  truth_int = [i for i in range(batch_size)]
  truth_str = [[("foo%d" % i).encode(), ("bar%d" % i).encode()]
               for i in range(batch_size)]

  expected_str = copy.deepcopy(truth_str)

  # Delete some intermediate entries.
  for i in range(batch_size):
    col = 1
    if np.random.rand() < 0.25:
      # w.p. 25%, drop out the second entry.
      expected_str[i][col] = b"default"
      col -= 1
      truth_str[i].pop()
    if np.random.rand() < 0.25:
      # w.p. 25%, drop out the second entry (possibly again).
      expected_str[i][col] = b"default"
      truth_str[i].pop()

  expected_output = {
      # Batch size batch_size, 1 time step.
      "a": np.array(truth_int, dtype=np.int64).reshape(batch_size, 1),
      # Batch size batch_size, 2 time steps.
      "b": np.array(expected_str, dtype="|S").reshape(batch_size, 2),
  }

  original = [
      example(
          features=features({
              "a": int64_feature([truth_int[i]]),
              "b": bytes_feature(truth_str[i])
          })) for i in range(batch_size)
  ]

  serialized = [m.SerializeToString() for m in original]

  self._test(
      ops.convert_to_tensor(serialized, dtype=dtypes.string), {
          "a":
              parsing_ops.FixedLenSequenceFeature(
                  shape=(),
                  dtype=dtypes.int64,
                  allow_missing=True,
                  default_value=-1),
          "b":
              parsing_ops.FixedLenSequenceFeature(
                  shape=[],
                  dtype=dtypes.string,
                  allow_missing=True,
                  default_value="default"),
      },
      expected_values=expected_output,
      create_iterator_twice=True)

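# A minimal, self-contained sketch (not part of the test suite above) of the
# padding behavior this test relies on: with allow_missing=True,
# tf.io.parse_example pads every example's "b" feature to the longest length
# in the batch using default_value. The feature names here are illustrative.
import tensorflow as tf

examples = [
    tf.train.Example(features=tf.train.Features(feature={
        "b": tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[b"foo", b"bar"]))
    })),
    tf.train.Example(features=tf.train.Features(feature={
        "b": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"foo"]))
    })),
]
parsed = tf.io.parse_example(
    [e.SerializeToString() for e in examples],
    {"b": tf.io.FixedLenSequenceFeature(
        [], tf.string, allow_missing=True, default_value="default")})
# parsed["b"] has shape [2, 2]; the shorter row is padded with b"default".
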
def testSequenceExampleWithMultipleSizeFeatureLists(self):
  original = sequence_example(feature_lists=feature_lists({
      "a": feature_list([
          int64_feature([-1, 0, 1]),
          int64_feature([2, 3, 4]),
          int64_feature([5, 6, 7]),
          int64_feature([8, 9, 10]),
      ]),
      "b": feature_list([bytes_feature([b"r00", b"r01", b"r10", b"r11"])]),
      "c": feature_list([float_feature([3, 4]), float_feature([-1, 2])]),
  }))

  serialized = original.SerializeToString()

  expected_feature_list_output = {
      "a": np.array(
          [  # outer dimension is time.
              [[-1, 0, 1]],  # inside are 1x3 matrices
              [[2, 3, 4]],
              [[5, 6, 7]],
              [[8, 9, 10]]
          ],
          dtype=np.int64),
      "b": np.array(
          [  # outer dimension is time, inside are 2x2 matrices
              [[b"r00", b"r01"], [b"r10", b"r11"]]
          ],
          dtype=bytes),
      "c": np.array(
          [  # outer dimension is time, inside are 2-vectors
              [3, 4], [-1, 2]
          ],
          dtype=np.float32),
      "d": np.empty(shape=(0, 5), dtype=np.float32),  # empty_allowed_missing
  }

  self._test(
      {
          "example_name": "in1",
          "serialized": ops.convert_to_tensor(serialized),
          "sequence_features": {
              "a": parsing_ops.FixedLenSequenceFeature((1, 3), dtypes.int64),
              "b": parsing_ops.FixedLenSequenceFeature((2, 2), dtypes.string),
              "c": parsing_ops.FixedLenSequenceFeature((2,), dtypes.float32),
              "d": parsing_ops.FixedLenSequenceFeature(
                  (5,), dtypes.float32, allow_missing=True),
          }
      },
      expected_feat_list_values=expected_feature_list_output)

def testDecodeExampleWithBoundingBoxDense(self):
  num_bboxes = 10
  np_ymin = np.random.rand(num_bboxes, 1)
  np_xmin = np.random.rand(num_bboxes, 1)
  np_ymax = np.random.rand(num_bboxes, 1)
  np_xmax = np.random.rand(num_bboxes, 1)
  np_bboxes = np.hstack([np_ymin, np_xmin, np_ymax, np_xmax])

  example = example_pb2.Example(
      features=feature_pb2.Features(feature={
          'image/object/bbox/ymin': self._EncodedFloatFeature(np_ymin),
          'image/object/bbox/xmin': self._EncodedFloatFeature(np_xmin),
          'image/object/bbox/ymax': self._EncodedFloatFeature(np_ymax),
          'image/object/bbox/xmax': self._EncodedFloatFeature(np_xmax),
      }))
  serialized_example = example.SerializeToString()

  with self.test_session():
    serialized_example = array_ops.reshape(serialized_example, shape=[])

    keys_to_features = {
        'image/object/bbox/ymin':
            parsing_ops.FixedLenSequenceFeature(
                [], dtypes.float32, allow_missing=True),
        'image/object/bbox/xmin':
            parsing_ops.FixedLenSequenceFeature(
                [], dtypes.float32, allow_missing=True),
        'image/object/bbox/ymax':
            parsing_ops.FixedLenSequenceFeature(
                [], dtypes.float32, allow_missing=True),
        'image/object/bbox/xmax':
            parsing_ops.FixedLenSequenceFeature(
                [], dtypes.float32, allow_missing=True),
    }

    items_to_handlers = {
        'object/bbox':
            tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                          'image/object/bbox/'),
    }

    decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                 items_to_handlers)
    [tf_bboxes] = decoder.decode(serialized_example, ['object/bbox'])
    bboxes = tf_bboxes.eval()

    self.assertAllClose(np_bboxes, bboxes)

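# A hedged sketch of what the BoundingBox handler above computes, for
# intuition only (this is not the tfexample_decoder implementation): each
# coordinate is parsed as a [num_bboxes] vector and the four vectors are
# joined column-wise into a [num_bboxes, 4] tensor ordered
# (ymin, xmin, ymax, xmax), which is why the test compares against the
# np.hstack of the four columns.
import tensorflow as tf

def stack_bbox_coordinates(ymin, xmin, ymax, xmax):
  # Each argument: a float tensor of shape [num_bboxes].
  return tf.stack([ymin, xmin, ymax, xmax], axis=1)  # [num_bboxes, 4]
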
def testSerializedShapeMismatch(self):
  aname = "a"
  bname = "b"
  cname = "c"
  original = [
      example(features=features({
          cname: int64_feature([2]),
      })),
      example(features=features({
          aname: float_feature([1, 1]),
          bname: bytes_feature([b"b0_str", b"b1_str"]),
      })),
      example(features=features({
          aname: float_feature([-1, -1, 2, 2]),
          bname: bytes_feature([b"b1"]),
      })),
      example(features=features({
          aname: float_feature([]),
          cname: int64_feature([3]),
      })),
  ]

  serialized = [m.SerializeToString() for m in original]
  if context.executing_eagerly():
    self._test(
        ops.convert_to_tensor(serialized), {
            aname:
                parsing_ops.FixedLenSequenceFeature(
                    (2, 1),
                    dtype=dtypes.float32,
                    allow_missing=True,
                    default_value=[]),
            bname:
                parsing_ops.FixedLenSequenceFeature(
                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
        },
        expected_err=(errors_impl.InvalidArgumentError,
                      "Input to reshape is a tensor with 0 values"))
  else:
    self._test(
        ops.convert_to_tensor(serialized), {
            aname:
                parsing_ops.FixedLenSequenceFeature(
                    (2, 1),
                    dtype=dtypes.float32,
                    allow_missing=True,
                    default_value=[]),
            bname:
                parsing_ops.FixedLenSequenceFeature(
                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
        },
        expected_err=(ValueError,
                      "Cannot reshape a tensor with 0 elements to shape"))

def testCreateSequenceFeatureSpec(self):
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  sparse_id_col = fc.sparse_column_with_keys("id_column",
                                             ["marlo", "omar", "stringer"])
  weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                              "id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
  real_valued_col2 = fc.real_valued_column(
      "real_valued_default_column", dimension=5, default_value=3.0)
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_var_len_column", default_value=3.0, is_sparse=True)
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, real_valued_col1,
      real_valued_col2, real_valued_col3, real_valued_col4
  ])

  feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)

  expected_feature_spec = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[2], dtype=dtypes.float32, allow_missing=False),
      "real_valued_default_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[5], dtype=dtypes.float32, allow_missing=True),
      "real_valued_var_len_column":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_var_len_dense_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[], dtype=dtypes.float32, allow_missing=True,
              default_value=4.0),
  }

  self.assertDictEqual(expected_feature_spec, feature_spec)

def testRealValuedVarLenColumnDtypes(self):
  rvc = fc._real_valued_var_len_column("rvc", is_sparse=True)
  self.assertDictEqual(
      {"rvc": parsing_ops.VarLenFeature(dtype=dtypes.float32)}, rvc.config)

  rvc = fc._real_valued_var_len_column(
      "rvc", default_value=0, is_sparse=False)
  self.assertDictEqual(
      {
          "rvc": parsing_ops.FixedLenSequenceFeature(
              shape=[], dtype=dtypes.float32, allow_missing=True,
              default_value=0.0)
      }, rvc.config)

  rvc = fc._real_valued_var_len_column(
      "rvc", dtype=dtypes.int32, default_value=0, is_sparse=True)
  self.assertDictEqual(
      {"rvc": parsing_ops.VarLenFeature(dtype=dtypes.int32)}, rvc.config)

  with self.assertRaisesRegexp(TypeError,
                               "dtype must be convertible to float"):
    fc._real_valued_var_len_column(
        "rvc", dtype=dtypes.string, default_value="", is_sparse=True)

def _parse_example_spec(self):
  return {
      self.key:
          parsing_ops.FixedLenSequenceFeature(
              self.shape, self.dtype, allow_missing=True)
  }

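# A hedged usage sketch for the spec above: for a hypothetical sequence
# column with key "prices", shape (1,), and dtype float32, _parse_example_spec
# would yield a dict like the one below, which is the format
# tf.io.parse_single_sequence_example expects for sequence_features.
import tensorflow as tf

sequence_features = {
    "prices": tf.io.FixedLenSequenceFeature(
        (1,), tf.float32, allow_missing=True)
}
seq_example = tf.train.SequenceExample(
    feature_lists=tf.train.FeatureLists(feature_list={
        "prices": tf.train.FeatureList(feature=[
            tf.train.Feature(float_list=tf.train.FloatList(value=[1.0])),
            tf.train.Feature(float_list=tf.train.FloatList(value=[2.5])),
        ])
    }))
_, parsed = tf.io.parse_single_sequence_example(
    seq_example.SerializeToString(), sequence_features=sequence_features)
# parsed["prices"] is a [num_steps, 1] float32 tensor: [[1.0], [2.5]].
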
def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
  original = example(features=features({
      "c": float_feature([3, 4]),
      "d": float_feature([0.0, 1.0]),
      "val": bytes_feature([b"a", b"b"]),
      "idx": int64_feature([0, 3]),
      "st_a": float_feature([3.0, 4.0])
  }))

  serialized = original.SerializeToString()

  expected_st_a = (
      np.array([[0], [1]], dtype=np.int64),  # indices
      np.array([3.0, 4.0], dtype=np.float32),  # values
      np.array([2], dtype=np.int64))  # shape: max_values = 2

  expected_sp = (  # indices, values, shape
      np.array([[0], [3]], dtype=np.int64),
      np.array(["a", "b"], dtype="|S"),
      np.array([13], dtype=np.int64))  # max_values = 13

  a_default = [1, 2, 3]
  b_default = np.random.rand(3, 3).astype(bytes)
  expected_output = {
      "st_a": expected_st_a,
      "sp": expected_sp,
      "a": [a_default],
      "b": b_default,
      "c": np.array([3, 4], dtype=np.float32),
      "d": np.array([0.0, 1.0], dtype=np.float32),
  }

  self._test(
      {
          "serialized": ops.convert_to_tensor(serialized),
          "features": {
              "st_a": parsing_ops.VarLenFeature(dtypes.float32),
              "sp": parsing_ops.SparseFeature(
                  ["idx"], "val", dtypes.string, [13]),
              "a": parsing_ops.FixedLenFeature(
                  (1, 3), dtypes.int64, default_value=a_default),
              "b": parsing_ops.FixedLenFeature(
                  (3, 3), dtypes.string, default_value=b_default),
              # Feature "c" must be provided, since it has no default_value.
              "c": parsing_ops.FixedLenFeature(2, dtypes.float32),
              "d": parsing_ops.FixedLenSequenceFeature(
                  [], dtypes.float32, allow_missing=True)
          }
      },
      expected_output)

def testCreateFeatureSpec_ExperimentalColumns(self):
  real_valued_col0 = fc._real_valued_var_len_column(
      "real_valued_column0", is_sparse=True)
  real_valued_col1 = fc._real_valued_var_len_column(
      "real_valued_column1", dtype=dtypes.int64, default_value=0,
      is_sparse=False)
  feature_columns = set([real_valued_col0, real_valued_col1])
  expected_config = {
      "real_valued_column0":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenSequenceFeature(
              [], dtype=dtypes.int64, allow_missing=True, default_value=0),
  }

  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)

def testSequenceExampleListWithInconsistentDataFails(self):
  original = sequence_example(feature_lists=feature_lists({
      "a": feature_list([int64_feature([-1, 0]), float_feature([2, 3])])
  }))

  serialized = original.SerializeToString()

  self._test(
      {
          "example_name": "in1",
          "serialized": ops.convert_to_tensor(serialized),
          "sequence_features": {
              "a": parsing_ops.FixedLenSequenceFeature((2,), dtypes.int64)
          }
      },
      expected_err=(errors_impl.OpError,
                    "Feature list: a, Index: 1."
                    " Data types don't match. Expected type: int64"))

def testSequenceExampleListWithWrongShapeFails(self):
  original = sequence_example(feature_lists=feature_lists({
      "a": feature_list([int64_feature([2, 3]), int64_feature([2, 3, 4])]),
  }))

  serialized = original.SerializeToString()

  self._test(
      {
          "example_name": "in1",
          "serialized": ops.convert_to_tensor(serialized),
          "sequence_features": {
              "a": parsing_ops.FixedLenSequenceFeature((2,), dtypes.int64)
          }
      },
      expected_err=(errors_impl.OpError,
                    r"Name: in1, Key: a, Index: 1."
                    r" Number of int64 values != expected."
                    r" values size: 3 but output shape: \[2\]"))

def testCreateFeatureSpec_RealValuedColumnWithDefaultValue(self):
  real_valued_col1 = fc.real_valued_column(
      "real_valued_column1", default_value=2)
  real_valued_col2 = fc.real_valued_column(
      "real_valued_column2", 5, default_value=4)
  real_valued_col3 = fc.real_valued_column(
      "real_valued_column3", default_value=[8])
  real_valued_col4 = fc.real_valued_column(
      "real_valued_column4", 3, default_value=[1, 0, 6])
  real_valued_col5 = fc._real_valued_var_len_column(
      "real_valued_column5", default_value=2, is_sparse=True)
  real_valued_col6 = fc._real_valued_var_len_column(
      "real_valued_column6", dtype=dtypes.int64, default_value=1,
      is_sparse=False)
  feature_columns = [
      real_valued_col1, real_valued_col2, real_valued_col3, real_valued_col4,
      real_valued_col5, real_valued_col6
  ]
  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertEqual(6, len(config))
  self.assertDictEqual(
      {
          "real_valued_column1":
              parsing_ops.FixedLenFeature(
                  [1], dtype=dtypes.float32, default_value=[2.]),
          "real_valued_column2":
              parsing_ops.FixedLenFeature(
                  [5], dtype=dtypes.float32,
                  default_value=[4., 4., 4., 4., 4.]),
          "real_valued_column3":
              parsing_ops.FixedLenFeature(
                  [1], dtype=dtypes.float32, default_value=[8.]),
          "real_valued_column4":
              parsing_ops.FixedLenFeature(
                  [3], dtype=dtypes.float32, default_value=[1., 0., 6.]),
          "real_valued_column5":
              parsing_ops.VarLenFeature(dtype=dtypes.float32),
          "real_valued_column6":
              parsing_ops.FixedLenSequenceFeature(
                  [], dtype=dtypes.int64, allow_missing=True,
                  default_value=1)
      }, config)

def testSequenceExampleWithMissingFeatureListFails(self):
  original = sequence_example(feature_lists=feature_lists({}))

  # Test fails because we didn't add:
  #  feature_list_dense_defaults = {"a": None}
  self._test(
      {
          "example_name": "in1",
          "serialized": ops.convert_to_tensor(original.SerializeToString()),
          "sequence_features": {
              "a": parsing_ops.FixedLenSequenceFeature((2,), dtypes.int64)
          }
      },
      expected_err=(
          errors_impl.OpError,
          "Name: in1, Feature list 'a' is required but could not be found."
          " Did you mean to include it in"
          " feature_list_dense_missing_assumed_empty or"
          " feature_list_dense_defaults?"))

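# A hedged counterpart to the failing test above: marking the feature list
# as optional with allow_missing=True lets parsing succeed even when the
# SequenceExample omits "a" entirely, yielding an empty [0, 2] tensor instead
# of the "required but could not be found" error.
import tensorflow as tf

empty = tf.train.SequenceExample()
_, parsed = tf.io.parse_single_sequence_example(
    empty.SerializeToString(),
    sequence_features={
        "a": tf.io.FixedLenSequenceFeature((2,), tf.int64, allow_missing=True)
    })
# parsed["a"].shape == (0, 2)
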
def testVaryingFieldsInGenerator(self):
  def simple_generator():
    for i in range(2):
      yield {"value": i, "seqlen_value": np.ones((i, 1))}

  simple_features = {
      "value": parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32),
      "seqlen_value": parsing_ops.FixedLenSequenceFeature(
          shape=[1], dtype=dtypes.float32, allow_missing=True),
      "empty_value": parsing_ops.FixedLenFeature(
          default_value=[-1, -2], dtype=dtypes.int32, shape=[2])
  }
  tensors = python_input.python_input(simple_generator, simple_features)
  self.assertEqual(
      set(["value", "seqlen_value", "empty_value"]), set(tensors.keys()))
  self.assertEqual(dtypes.int32, tensors["value"].dtype)
  self.assertEqual((), tensors["value"].shape)
  self.assertEqual(dtypes.float32, tensors["seqlen_value"].dtype)
  self.assertEqual([None, 1], tensors["seqlen_value"].shape.as_list())
  self.assertEqual(dtypes.int32, tensors["empty_value"].dtype)
  self.assertEqual([2], tensors["empty_value"].shape)

  with self.test_session() as sess:
    r1 = sess.run(tensors)
    self.assertAllEqual(0, r1["value"])
    self.assertAllEqual(np.ones((0, 1)), r1["seqlen_value"])
    self.assertAllEqual([-1, -2], r1["empty_value"])

    r2 = sess.run(tensors)
    self.assertAllEqual(1, r2["value"])
    self.assertAllEqual([[1]], r2["seqlen_value"])
    self.assertAllEqual([-1, -2], r2["empty_value"])

    with self.assertRaisesOpError("Iteration finished"):
      sess.run(tensors)

def testSequenceExampleListWithWrongSparseDataTypeFails(self):
  original = sequence_example(feature_lists=feature_lists({
      "a": feature_list([
          int64_feature([3, 4]),
          int64_feature([1, 2]),
          float_feature([2.0, 3.0])
      ])
  }))

  serialized = original.SerializeToString()

  self._test(
      {
          "example_name": "in1",
          "serialized": ops.convert_to_tensor(serialized),
          "sequence_features": {
              "a": parsing_ops.FixedLenSequenceFeature((2,), dtypes.int64)
          }
      },
      expected_err=(errors_impl.OpError,
                    "Name: in1, Feature list: a, Index: 2."
                    " Data types don't match. Expected type: int64"
                    " Feature is: float_list"))

def parse_from_sequence_example(serialized,
                                list_size,
                                context_feature_spec=None,
                                example_feature_spec=None):
  """Parses SequenceExamples to feature maps.

  Args:
    serialized: (Tensor) A string Tensor for a batch of serialized
      SequenceExamples.
    list_size: (int) The number of required frames in a SequenceExample. This
      is needed to normalize output tensor shapes across batches.
    context_feature_spec: (dict) A mapping from feature keys to
      `FixedLenFeature` or `VarLenFeature` values for context.
    example_feature_spec: (dict) A mapping from feature keys to
      `FixedLenFeature` or `VarLenFeature` values for the list of examples.
      These features are stored in the `feature_lists` field in
      SequenceExample. `FixedLenFeature` is translated to
      `FixedLenSequenceFeature` to parse the SequenceExample. Note that
      missing values in the middle of a `feature_list` are not allowed for
      frames.

  Returns:
    A mapping from feature keys to `Tensor`s or `SparseTensor`s.
  """
  # Convert `FixedLenFeature` in `example_feature_spec` to
  # `FixedLenSequenceFeature` to parse the `feature_lists` in SequenceExample.
  # TODO(xuanhui): Handle missing feature_list since allow_missing=True.
  fixed_len_sequence_features = {
      k: parsing_ops.FixedLenSequenceFeature(
          s.shape, s.dtype, allow_missing=True)
      for k, s in six.iteritems(example_feature_spec)
      if isinstance(s, parsing_ops.FixedLenFeature)
  }
  sequence_features = example_feature_spec.copy()
  sequence_features.update(fixed_len_sequence_features)
  context, examples, _ = parsing_ops.parse_sequence_example(
      serialized,
      context_features=context_feature_spec,
      sequence_features=sequence_features)

  features = {}
  features.update(context)
  # Slice or pad example features to normalize the tensor shape:
  # [batch_size, num_frames, ...] --> [batch_size, list_size, ...]
  for k, t in six.iteritems(examples):
    # Old shape: [batch_size, num_frames, ...]
    shape = array_ops.unstack(array_ops.shape(t))
    ndims = len(shape)
    num_frames = shape[1]
    # New shape: [batch_size, list_size, ...]
    new_shape = array_ops.concat([[shape[0], list_size], shape[2:]], 0)

    def slice_fn(t=t, ndims=ndims, new_shape=new_shape):
      """Slices the tensor."""
      if isinstance(t, sparse_tensor.SparseTensor):
        return sparse_ops.sparse_slice(t, [0] * ndims,
                                       math_ops.to_int64(new_shape))
      else:
        return array_ops.slice(t, [0] * ndims, new_shape)

    def pad_fn(k=k,
               t=t,
               ndims=ndims,
               num_frames=num_frames,
               new_shape=new_shape):
      """Pads the tensor."""
      if isinstance(t, sparse_tensor.SparseTensor):
        return sparse_ops.sparse_reset_shape(t, new_shape)
      else:
        # Padding is an n x 2 tensor, where n is the ndims or rank of the
        # padded tensor.
        paddings = array_ops.stack([[0, 0], [0, list_size - num_frames]] +
                                   [[0, 0]] * (ndims - 2))
        return array_ops.pad(
            t,
            paddings,
            constant_values=array_ops.squeeze(
                example_feature_spec[k].default_value[0]))

    tensor = control_flow_ops.cond(num_frames > list_size, slice_fn, pad_fn)
    # Infer the static shape for the dense Tensor.
    if not isinstance(tensor, sparse_tensor.SparseTensor):
      static_shape = t.get_shape().as_list()
      static_shape[1] = list_size
      tensor.set_shape(static_shape)
    features[k] = tensor
  return features

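# A hedged usage sketch for parse_from_sequence_example above.
# `serialized_batch` is an assumed 1-D string Tensor of serialized
# SequenceExamples, and the feature keys are illustrative. Every per-example
# feature comes back with a fixed second dimension of list_size, sliced or
# padded as described above.
def example_usage(serialized_batch):
  features = parse_from_sequence_example(
      serialized_batch,
      list_size=10,
      context_feature_spec={
          "query_length": parsing_ops.FixedLenFeature(
              [1], dtypes.int64, default_value=[0]),
      },
      example_feature_spec={
          "utility": parsing_ops.FixedLenFeature(
              [1], dtypes.float32, default_value=[0.0]),
      })
  # features["query_length"]: [batch_size, 1]
  # features["utility"]: [batch_size, 10, 1]
  return features
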
def testSerializedContainingVarLenDense(self):
  aname = "a"
  bname = "b"
  cname = "c"
  dname = "d"
  original = [
      example(features=features({
          cname: int64_feature([2]),
      })),
      example(features=features({
          aname: float_feature([1, 1]),
          bname: bytes_feature([b"b0_str", b"b1_str"]),
      })),
      example(features=features({
          aname: float_feature([-1, -1, 2, 2]),
          bname: bytes_feature([b"b1"]),
      })),
      example(features=features({
          aname: float_feature([]),
          cname: int64_feature([3]),
      })),
  ]

  serialized = [m.SerializeToString() for m in original]

  expected_output = {
      aname: np.array(
          [
              [0, 0, 0, 0],
              [1, 1, 0, 0],
              [-1, -1, 2, 2],
              [0, 0, 0, 0],
          ],
          dtype=np.float32).reshape(4, 2, 2, 1),
      bname: np.array(
          [["", ""], ["b0_str", "b1_str"], ["b1", ""], ["", ""]],
          dtype=bytes).reshape(4, 2, 1, 1, 1),
      cname: np.array([2, 0, 0, 3], dtype=np.int64).reshape(4, 1),
      dname: np.empty(shape=(4, 0), dtype=bytes),
  }

  self._test(
      ops.convert_to_tensor(serialized), {
          aname: parsing_ops.FixedLenSequenceFeature(
              (2, 1), dtype=dtypes.float32, allow_missing=True),
          bname: parsing_ops.FixedLenSequenceFeature(
              (1, 1, 1), dtype=dtypes.string, allow_missing=True),
          cname: parsing_ops.FixedLenSequenceFeature(
              shape=[], dtype=dtypes.int64, allow_missing=True),
          dname: parsing_ops.FixedLenSequenceFeature(
              shape=[], dtype=dtypes.string, allow_missing=True),
      },
      expected_values=expected_output)

  # Test with padding values.
  expected_output_custom_padding = dict(expected_output)
  expected_output_custom_padding[aname] = np.array(
      [
          [-2, -2, -2, -2],
          [1, 1, -2, -2],
          [-1, -1, 2, 2],
          [-2, -2, -2, -2],
      ],
      dtype=np.float32).reshape(4, 2, 2, 1)

  self._test(
      ops.convert_to_tensor(serialized), {
          aname: parsing_ops.FixedLenSequenceFeature(
              (2, 1), dtype=dtypes.float32, allow_missing=True,
              default_value=-2.0),
          bname: parsing_ops.FixedLenSequenceFeature(
              (1, 1, 1), dtype=dtypes.string, allow_missing=True),
          cname: parsing_ops.FixedLenSequenceFeature(
              shape=[], dtype=dtypes.int64, allow_missing=True),
          dname: parsing_ops.FixedLenSequenceFeature(
              shape=[], dtype=dtypes.string, allow_missing=True),
      },
      expected_output_custom_padding)

  # Change number of required values so the inputs are not a
  # multiple of this size.
  self._test(
      ops.convert_to_tensor(serialized), {
          aname: parsing_ops.FixedLenSequenceFeature(
              (2, 1), dtype=dtypes.float32, allow_missing=True),
          bname: parsing_ops.FixedLenSequenceFeature(
              (2, 1, 1), dtype=dtypes.string, allow_missing=True),
      },
      expected_err=(
          errors_impl.OpError, "Key: b, Index: 2. "
          "Number of bytes values is not a multiple of stride length."))

  self._test(
      ops.convert_to_tensor(serialized), {
          aname: parsing_ops.FixedLenSequenceFeature(
              (2, 1), dtype=dtypes.float32, allow_missing=True,
              default_value=[]),
          bname: parsing_ops.FixedLenSequenceFeature(
              (2, 1, 1), dtype=dtypes.string, allow_missing=True),
      },
      expected_err=(ValueError,
                    "Cannot reshape a tensor with 0 elements to shape"))

  self._test(
      ops.convert_to_tensor(serialized), {
          aname: parsing_ops.FixedLenFeature(
              (None, 2, 1), dtype=dtypes.float32),
          bname: parsing_ops.FixedLenSequenceFeature(
              (2, 1, 1), dtype=dtypes.string, allow_missing=True),
      },
      expected_err=(ValueError,
                    "First dimension of shape for feature a unknown. "
                    "Consider using FixedLenSequenceFeature."))

  self._test(
      ops.convert_to_tensor(serialized), {
          cname: parsing_ops.FixedLenFeature(
              (1, None), dtype=dtypes.int64, default_value=[[1]]),
      },
      expected_err=(ValueError,
                    "All dimensions of shape for feature c need to be known "
                    r"but received \(1, None\)."))

  self._test(
      ops.convert_to_tensor(serialized), {
          aname: parsing_ops.FixedLenSequenceFeature(
              (2, 1), dtype=dtypes.float32, allow_missing=True),
          bname: parsing_ops.FixedLenSequenceFeature(
              (1, 1, 1), dtype=dtypes.string, allow_missing=True),
          cname: parsing_ops.FixedLenSequenceFeature(
              shape=[], dtype=dtypes.int64, allow_missing=False),
          dname: parsing_ops.FixedLenSequenceFeature(
              shape=[], dtype=dtypes.string, allow_missing=True),
      },
      expected_err=(ValueError,
                    "Unsupported: FixedLenSequenceFeature requires "
                    "allow_missing to be True."))

def ReplayDataset(replay_stream, max_sequence_length=200, name=None):
  """Creates a `tf.data.Dataset` from an `ay.contrib.rl.ReplayStream` instance.

  Arguments:
    replay_stream: An `ay.contrib.rl.ReplayStream` instance. Must implement
      `replay_stream.read`. The method is called as
      `replay_stream.read(limit=max_sequence_length)` each time an instance
      is requested by the dataset. It should return `None` or raise a
      `tf.errors.OutOfRangeError` when the stream is done and execution of
      the dataset should stop; otherwise, `replay_stream.read` should always
      return a `tf.SequenceExample` proto.
    max_sequence_length: (int) The maximum replay sequence length. Shorter
      sequences are padded and longer ones are truncated to this length.
    name: (string) Optional name scope for the dataset ops.

  Returns:
    A `tf.data.Dataset`.

  Raises:
    A `tf.errors.OutOfRangeError` when the stream returns `None` or raises
    `tf.errors.OutOfRangeError`.
  """
  assert_utils.assert_true(
      isinstance(replay_stream, streams.ReplayStream),
      '`replay_stream` must be an instance of `ay.contrib.rl.ReplayStream`')

  with ops.name_scope(name or 'replay_dataset'):
    state_shape = list(replay_stream.state_shape)
    state_dtype = replay_stream.state_dtype
    action_shape = list(replay_stream.action_shape)
    action_dtype = replay_stream.action_dtype
    action_value_shape = list(replay_stream.action_value_shape)
    action_value_dtype = replay_stream.action_value_dtype
    reward_shape = list(replay_stream.reward_shape)
    reward_dtype = replay_stream.reward_dtype

    replay_dtypes = {
        'state': state_dtype,
        'next_state': state_dtype,
        'action': action_dtype,
        'action_value': action_value_dtype,
        'reward': reward_dtype,
        'terminal': dtypes.bool,
        'sequence_length': dtypes.int32,
    }
    if replay_stream.with_values:
      replay_dtypes['value'] = reward_dtype

    def convert_to_safe_feature_type(dtype):
      return type_utils.safe_tf_dtype(serialize.type_to_feature[dtype][-1])

    replay_features = {
        'state': parsing_ops.FixedLenSequenceFeature(
            shape=state_shape,
            dtype=convert_to_safe_feature_type(state_dtype)),
        'next_state': parsing_ops.FixedLenSequenceFeature(
            shape=state_shape,
            dtype=convert_to_safe_feature_type(state_dtype)),
        'action': parsing_ops.FixedLenSequenceFeature(
            shape=action_shape,
            dtype=convert_to_safe_feature_type(action_dtype)),
        'action_value': parsing_ops.FixedLenSequenceFeature(
            shape=action_value_shape,
            dtype=convert_to_safe_feature_type(action_value_dtype)),
        'reward': parsing_ops.FixedLenSequenceFeature(
            shape=reward_shape,
            dtype=convert_to_safe_feature_type(reward_dtype)),
        'terminal': parsing_ops.FixedLenSequenceFeature(
            shape=[], dtype=convert_to_safe_feature_type(dtypes.bool)),
        'sequence_length': parsing_ops.FixedLenSequenceFeature(
            shape=[], dtype=convert_to_safe_feature_type(dtypes.int32)),
    }
    if replay_stream.with_values:
      replay_features['value'] = parsing_ops.FixedLenSequenceFeature(
          shape=reward_shape,
          dtype=convert_to_safe_feature_type(reward_dtype))

    def convert_and_fix_dtypes(replay):
      """Casts dtypes back to their original types."""
      fixed_replay = {}
      for k, v in replay.items():
        fixed_replay[k] = math_ops.cast(v, dtype=replay_dtypes[k])
      return fixed_replay

    def generator():
      """Yields serialized protos from the `ay.contrib.rl.ReplayStream`."""
      while True:
        replay_example = None
        try:
          replay_example = replay_stream.read(limit=max_sequence_length)
        except:  # pylint: disable=bare-except
          # An empty string is used as a sentinel for "stream exhausted".
          yield ''
        else:
          yield replay_example.SerializeToString()

    def serialize_map(replay_example_str):
      """Parses each example string to `tf.Tensor`s."""
      try:
        assert_op = control_flow_ops.Assert(
            replay_example_str != '', [replay_example_str])
        with ops.control_dependencies([assert_op]):
          _, replay = parsing_ops.parse_single_sequence_example(
              replay_example_str, sequence_features=replay_features)
      except errors_impl.InvalidArgumentError:
        # Signal the end of the dataset. Note that `OutOfRangeError` requires
        # the (node_def, op, message) constructor arguments.
        raise errors_impl.OutOfRangeError(
            None, None, 'Reached the end of the replay stream.')
      return convert_and_fix_dtypes(replay)

    def pad_or_truncate_map(replay):
      """Truncates or pads replays."""
      with_values = 'value' in replay
      if with_values:
        replay = experience.ReplayWithValues(**replay)
      else:
        replay = experience.Replay(**replay)

      sequence_length = math_ops.minimum(
          max_sequence_length, replay.sequence_length)
      sequence_length.set_shape([1])

      state = sequence_utils.pad_or_truncate(
          replay.state, max_sequence_length, axis=0, pad_value=0)
      state.set_shape([max_sequence_length] + state_shape)

      next_state = sequence_utils.pad_or_truncate(
          replay.next_state, max_sequence_length, axis=0, pad_value=0)
      next_state.set_shape([max_sequence_length] + state_shape)

      action = sequence_utils.pad_or_truncate(
          replay.action, max_sequence_length, axis=0, pad_value=0)
      action.set_shape([max_sequence_length] + action_shape)

      action_value = sequence_utils.pad_or_truncate(
          replay.action_value, max_sequence_length, axis=0, pad_value=0)
      action_value.set_shape([max_sequence_length] + action_value_shape)

      reward = sequence_utils.pad_or_truncate(
          replay.reward, max_sequence_length, axis=0, pad_value=0)
      reward.set_shape([max_sequence_length] + reward_shape)

      terminal = sequence_utils.pad_or_truncate(
          replay.terminal, max_sequence_length, axis=0,
          pad_value=ops.convert_to_tensor(False))
      terminal.set_shape([max_sequence_length])

      if with_values:
        value = sequence_utils.pad_or_truncate(
            replay.value, max_sequence_length, axis=0, pad_value=0)
        value.set_shape([max_sequence_length] + reward_shape)
        return experience.ReplayWithValues(
            state=state,
            next_state=next_state,
            action=action,
            action_value=action_value,
            value=value,
            reward=reward,
            terminal=terminal,
            sequence_length=sequence_length)

      return experience.Replay(
          state=state,
          next_state=next_state,
          action=action,
          action_value=action_value,
          reward=reward,
          terminal=terminal,
          sequence_length=sequence_length)

    dataset = dataset_ops.Dataset.from_generator(generator, dtypes.string)
    dataset = dataset.map(serialize_map)
    return dataset.map(pad_or_truncate_map)

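# A hedged usage sketch for ReplayDataset. `my_stream` is an assumed
# ay.contrib.rl.ReplayStream; the iteration pattern below is standard TF 1.x
# tf.data usage, not code from this library.
def example_replay_usage(my_stream):
  dataset = ReplayDataset(my_stream, max_sequence_length=100)
  iterator = dataset.make_one_shot_iterator()
  replay = iterator.get_next()  # an experience.Replay[WithValues] tuple
  return replay
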
def testSequenceExampleWithSparseAndDenseFeatureLists(self):
  original = sequence_example(feature_lists=feature_lists({
      "a": feature_list([int64_feature([3, 4]), int64_feature([1, 0])]),
      "st_a": feature_list([
          float_feature([3.0, 4.0]),
          float_feature([5.0]),
          float_feature([])
      ]),
      "st_b": feature_list([
          bytes_feature([b"a"]),
          bytes_feature([]),
          bytes_feature([]),
          bytes_feature([b"b", b"c"])
      ])
  }))

  serialized = original.SerializeToString()

  expected_st_a = (
      np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64),  # indices
      np.array([3.0, 4.0, 5.0], dtype=np.float32),  # values
      np.array([3, 2], dtype=np.int64))  # shape: num_time = 3, max_feat = 2

  expected_st_b = (
      np.array([[0, 0], [3, 0], [3, 1]], dtype=np.int64),  # indices
      np.array(["a", "b", "c"], dtype="|S"),  # values
      np.array([4, 2], dtype=np.int64))  # shape: num_time = 4, max_feat = 2

  expected_st_c = (
      np.empty((0, 2), dtype=np.int64),  # indices
      np.empty((0,), dtype=np.int64),  # values
      np.array([0, 0], dtype=np.int64))  # shape: num_time = 0, max_feat = 0

  expected_feature_list_output = {
      "a": np.array([[3, 4], [1, 0]], dtype=np.int64),
      "st_a": expected_st_a,
      "st_b": expected_st_b,
      "st_c": expected_st_c,
  }

  self._test(
      {
          "example_name": "in1",
          "serialized": ops.convert_to_tensor(serialized),
          "sequence_features": {
              "st_a": parsing_ops.VarLenFeature(dtypes.float32),
              "st_b": parsing_ops.VarLenFeature(dtypes.string),
              "st_c": parsing_ops.VarLenFeature(dtypes.int64),
              "a": parsing_ops.FixedLenSequenceFeature((2,), dtypes.int64),
          }
      },
      expected_feat_list_values=expected_feature_list_output)

def testCreateFeatureSpec(self):
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_column3", is_sparse=True)
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_column4", dtype=dtypes.int64, default_value=0,
      is_sparse=False)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
  b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
  one_hot_col = fc.one_hot_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_one_hot", hash_bucket_size=100))
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)
  feature_columns = set([
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
      cross_col, one_hot_col, scattered_embedding_col
  ])
  expected_config = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column":
          parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column":
          parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column3":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column4":
          parsing_ops.FixedLenSequenceFeature(
              [], dtype=dtypes.int64, allow_missing=True, default_value=0),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot":
          parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column":
          parsing_ops.VarLenFeature(dtypes.string),
  }

  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)

  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(i): val for i, val in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)

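# A hedged sketch connecting the spec above to parsing: the dict returned by
# create_feature_spec_for_parsing is in the format that
# parsing_ops.parse_example expects, so a batch can be parsed with it
# directly. `serialized_batch` is an assumed 1-D string Tensor of serialized
# Examples; the helper name is illustrative.
def example_parse_with_spec(serialized_batch, feature_columns):
  spec = fc.create_feature_spec_for_parsing(feature_columns)
  return parsing_ops.parse_example(serialized_batch, spec)
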
def testSerializedContainingVarLenDense(self):
  aname = "a"
  bname = "b"
  cname = "c"
  dname = "d"
  original = [
      example(features=features({
          cname: int64_feature([2]),
      })),
      example(features=features({
          aname: float_feature([1, 1]),
          bname: bytes_feature([b"b0_str", b"b1_str"]),
      })),
      example(features=features({
          aname: float_feature([-1, -1, 2, 2]),
          bname: bytes_feature([b"b1"]),
      })),
      example(features=features({
          aname: float_feature([]),
          cname: int64_feature([3]),
      })),
  ]

  expected_outputs = [
      {
          aname: np.empty(shape=(0, 2, 1), dtype=np.int64),
          bname: np.empty(shape=(0, 1, 1, 1), dtype=bytes),
          cname: np.array([2], dtype=np.int64),
          dname: np.empty(shape=(0,), dtype=bytes)
      },
      {
          aname: np.array([[[1], [1]]], dtype=np.float32),
          bname: np.array(
              ["b0_str", "b1_str"], dtype=bytes).reshape(2, 1, 1, 1),
          cname: np.empty(shape=(0,), dtype=np.int64),
          dname: np.empty(shape=(0,), dtype=bytes)
      },
      {
          aname: np.array([[[-1], [-1]], [[2], [2]]], dtype=np.float32),
          bname: np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1),
          cname: np.empty(shape=(0,), dtype=np.int64),
          dname: np.empty(shape=(0,), dtype=bytes)
      },
      {
          aname: np.empty(shape=(0, 2, 1), dtype=np.int64),
          bname: np.empty(shape=(0, 1, 1, 1), dtype=bytes),
          cname: np.array([3], dtype=np.int64),
          dname: np.empty(shape=(0,), dtype=bytes)
      },
  ]

  for proto, expected_output in zip(original, expected_outputs):
    self._test(
        {
            "serialized": ops.convert_to_tensor(proto.SerializeToString()),
            "features": {
                aname: parsing_ops.FixedLenSequenceFeature(
                    (2, 1), dtype=dtypes.float32, allow_missing=True),
                bname: parsing_ops.FixedLenSequenceFeature(
                    (1, 1, 1), dtype=dtypes.string, allow_missing=True),
                cname: parsing_ops.FixedLenSequenceFeature(
                    shape=[], dtype=dtypes.int64, allow_missing=True),
                dname: parsing_ops.FixedLenSequenceFeature(
                    shape=[], dtype=dtypes.string, allow_missing=True),
            }
        }, expected_output)

  # Test with padding values.
  # NOTE(mrry): Since we parse a single example at a time, the fixed-length
  # sequences will not be padded, and the padding value will be ignored.
  for proto, expected_output in zip(original, expected_outputs):
    self._test(
        {
            "serialized": ops.convert_to_tensor(proto.SerializeToString()),
            "features": {
                aname: parsing_ops.FixedLenSequenceFeature(
                    (2, 1), dtype=dtypes.float32, allow_missing=True),
                bname: parsing_ops.FixedLenSequenceFeature(
                    (1, 1, 1), dtype=dtypes.string, allow_missing=True),
                cname: parsing_ops.FixedLenSequenceFeature(
                    shape=[], dtype=dtypes.int64, allow_missing=True),
                dname: parsing_ops.FixedLenSequenceFeature(
                    shape=[], dtype=dtypes.string, allow_missing=True),
            }
        }, expected_output)

  # Change number of required values so the inputs are not a
  # multiple of this size.
  self._test(
      {
          "serialized": ops.convert_to_tensor(
              original[2].SerializeToString()),
          "features": {
              aname: parsing_ops.FixedLenSequenceFeature(
                  (2, 1), dtype=dtypes.float32, allow_missing=True),
              bname: parsing_ops.FixedLenSequenceFeature(
                  (2, 1, 1), dtype=dtypes.string, allow_missing=True),
          }
      },
      # TODO(mrry): Consider matching the `tf.parse_example()` error message.
      expected_err=(errors_impl.OpError, "Key: b."))

  self._test(
      {
          "serialized": ops.convert_to_tensor(""),
          "features": {
              aname: parsing_ops.FixedLenSequenceFeature(
                  (2, 1), dtype=dtypes.float32, allow_missing=True,
                  default_value=[]),
              bname: parsing_ops.FixedLenSequenceFeature(
                  (2, 1, 1), dtype=dtypes.string, allow_missing=True),
          }
      },
      expected_err=(ValueError,
                    "Cannot reshape a tensor with 0 elements to shape"))

  self._test(
      {
          "serialized": ops.convert_to_tensor(""),
          "features": {
              aname: parsing_ops.FixedLenFeature(
                  (None, 2, 1), dtype=dtypes.float32),
              bname: parsing_ops.FixedLenSequenceFeature(
                  (2, 1, 1), dtype=dtypes.string, allow_missing=True),
          }
      },
      expected_err=(ValueError,
                    "First dimension of shape for feature a unknown. "
                    "Consider using FixedLenSequenceFeature."))

  self._test(
      {
          "serialized": ops.convert_to_tensor(""),
          "features": {
              cname: parsing_ops.FixedLenFeature(
                  (1, None), dtype=dtypes.int64, default_value=[[1]]),
          }
      },
      expected_err=(ValueError,
                    "All dimensions of shape for feature c need to be known "
                    r"but received \(1, None\)."))

  self._test(
      {
          "serialized": ops.convert_to_tensor(""),
          "features": {
              aname: parsing_ops.FixedLenSequenceFeature(
                  (2, 1), dtype=dtypes.float32, allow_missing=True),
              bname: parsing_ops.FixedLenSequenceFeature(
                  (1, 1, 1), dtype=dtypes.string, allow_missing=True),
              cname: parsing_ops.FixedLenSequenceFeature(
                  shape=[], dtype=dtypes.int64, allow_missing=False),
              dname: parsing_ops.FixedLenSequenceFeature(
                  shape=[], dtype=dtypes.string, allow_missing=True),
          }
      },
      expected_err=(ValueError,
                    "Unsupported: FixedLenSequenceFeature requires "
                    "allow_missing to be True."))