def testSerializedContainingVarLenDenseLargerBatch(self, batch_size):
    """Checks var-len dense parsing when rows randomly lose trailing values.

    Regression coverage: while parsing, data read from the serialized proto
    is staged in buffers.  With a small batch a buffer holds one minibatch
    entry, but with a larger batch a single buffer may hold several.  The
    code copying data out of the buffers into the output tensors used to
    assume one entry per buffer; that bug has since been fixed.

    Args:
      batch_size: Number of examples to build, serialize and parse.
    """
    # Seed the RNG so the random drop-outs below are reproducible.
    np.random.seed(3456)
    int_values = list(range(batch_size))
    str_values = [[("foo%d" % idx).encode(), ("bar%d" % idx).encode()]
                  for idx in range(batch_size)]

    # Expected parse result for "b": starts as a copy of the inputs and gets
    # b"default" in every slot whose input value is removed below.
    padded_str = copy.deepcopy(str_values)

    for padded_row, input_row in zip(padded_str, str_values):
      slot = 1
      if np.random.rand() < 0.25:
        # w.p. 25%, remove the last input value; slot 1 becomes padding.
        padded_row[slot] = b"default"
        slot -= 1
        input_row.pop()
      if np.random.rand() < 0.25:
        # w.p. 25%, remove whichever input value is currently last as well.
        padded_row[slot] = b"default"
        input_row.pop()

    # "a": batch_size rows, 1 time step.  "b": batch_size rows, 2 time steps.
    expected_a = np.array(int_values, dtype=np.int64).reshape(batch_size, 1)
    expected_b = np.array(padded_str, dtype="|S").reshape(batch_size, 2)
    expected_output = {"a": expected_a, "b": expected_b}

    serialized = []
    for int_value, str_row in zip(int_values, str_values):
      proto = example(features=features({
          "a": int64_feature([int_value]),
          "b": bytes_feature(str_row),
      }))
      serialized.append(proto.SerializeToString())

    feature_spec = {
        "a":
            parsing_ops.FixedLenSequenceFeature(
                shape=(),
                dtype=dtypes.int64,
                allow_missing=True,
                default_value=-1),
        "b":
            parsing_ops.FixedLenSequenceFeature(
                shape=[],
                dtype=dtypes.string,
                allow_missing=True,
                default_value="default"),
    }

    self._test(
        ops.convert_to_tensor(serialized, dtype=dtypes.string),
        feature_spec,
        expected_values=expected_output,
        create_iterator_twice=True)
예제 #2
0
  def testSequenceExampleWithMultipleSizeFeatureLists(self):
    """Parses one SequenceExample whose feature lists differ in length.

    Feature lists "a", "b" and "c" hold 4, 1 and 2 steps respectively, while
    "d" is absent from the proto and is requested with allow_missing=True.
    """
    int_steps = feature_list([
        int64_feature([-1, 0, 1]),
        int64_feature([2, 3, 4]),
        int64_feature([5, 6, 7]),
        int64_feature([8, 9, 10]),
    ])
    bytes_steps = feature_list(
        [bytes_feature([b"r00", b"r01", b"r10", b"r11"])])
    float_steps = feature_list(
        [float_feature([3, 4]), float_feature([-1, 2])])
    proto = sequence_example(feature_lists=feature_lists({
        "a": int_steps,
        "b": bytes_steps,
        "c": float_steps,
    }))
    serialized = proto.SerializeToString()

    # In every expected array the outermost dimension is time.
    expected_a = np.array(
        [[[-1, 0, 1]], [[2, 3, 4]], [[5, 6, 7]], [[8, 9, 10]]],
        dtype=np.int64)  # four steps, each a 1x3 matrix
    expected_b = np.array(
        [[[b"r00", b"r01"], [b"r10", b"r11"]]],
        dtype=bytes)  # a single step holding a 2x2 matrix
    expected_c = np.array(
        [[3, 4], [-1, 2]], dtype=np.float32)  # two steps of 2-vectors
    # "d" is not in the input, so zero steps of 5-vectors are expected.
    expected_d = np.empty(shape=(0, 5), dtype=np.float32)
    expected_feature_list_output = {
        "a": expected_a,
        "b": expected_b,
        "c": expected_c,
        "d": expected_d,
    }

    sequence_features = {
        "a":
            parsing_ops.FixedLenSequenceFeature(
                shape=(1, 3), dtype=dtypes.int64),
        "b":
            parsing_ops.FixedLenSequenceFeature(
                shape=(2, 2), dtype=dtypes.string),
        "c":
            parsing_ops.FixedLenSequenceFeature(
                shape=(2,), dtype=dtypes.float32),
        "d":
            parsing_ops.FixedLenSequenceFeature(
                shape=(5,), dtype=dtypes.float32, allow_missing=True),
    }

    parse_kwargs = {
        "example_name": "in1",
        "serialized": ops.convert_to_tensor(serialized),
        "sequence_features": sequence_features,
    }
    self._test(
        parse_kwargs, expected_feat_list_values=expected_feature_list_output)
    def testDecodeExampleWithBoundingBoxDense(self):
        """Decodes four dense coordinate features into one bounding-box tensor.

        Random ymin/xmin/ymax/xmax columns are encoded into an Example, decoded
        through a BoundingBox item handler, and compared against the columns
        stacked side by side.
        """
        num_bboxes = 10
        # Feature keys, in the column order (ymin, xmin, ymax, xmax) used for
        # the expected array.
        bbox_keys = (
            'image/object/bbox/ymin',
            'image/object/bbox/xmin',
            'image/object/bbox/ymax',
            'image/object/bbox/xmax',
        )
        # One (num_bboxes, 1) column of random values per coordinate.
        columns = [np.random.rand(num_bboxes, 1) for _ in bbox_keys]
        np_bboxes = np.hstack(columns)

        encoded = {}
        for key, column in zip(bbox_keys, columns):
            encoded[key] = self._EncodedFloatFeature(column)
        example = example_pb2.Example(
            features=feature_pb2.Features(feature=encoded))
        serialized_example = example.SerializeToString()

        with self.test_session():
            # Reshape to a scalar string tensor before handing it to the decoder.
            serialized_example = array_ops.reshape(
                serialized_example, shape=[])

            # Every coordinate is parsed as a variable-length dense float list.
            keys_to_features = {}
            for key in bbox_keys:
                keys_to_features[key] = parsing_ops.FixedLenSequenceFeature(
                    [], dtypes.float32, allow_missing=True)

            bbox_handler = tfexample_decoder.BoundingBox(
                ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/')
            items_to_handlers = {'object/bbox': bbox_handler}

            decoder = tfexample_decoder.TFExampleDecoder(
                keys_to_features, items_to_handlers)
            (tf_bboxes,) = decoder.decode(serialized_example, ['object/bbox'])
            bboxes = tf_bboxes.eval()

        self.assertAllClose(np_bboxes, bboxes)
예제 #4
0
    def testSerializedShapeMismatch(self):
        """Expects parsing to fail when feature "a" has an empty default_value.

        The same feature spec is used in eager and graph mode; only the
        expected exception type and message differ between the two.
        """
        aname = "a"
        bname = "b"
        cname = "c"
        protos = [
            example(features=features({
                cname: int64_feature([2]),
            })),
            example(features=features({
                aname: float_feature([1, 1]),
                bname: bytes_feature([b"b0_str", b"b1_str"]),
            })),
            example(features=features({
                aname: float_feature([-1, -1, 2, 2]),
                bname: bytes_feature([b"b1"]),
            })),
            example(features=features({
                aname: float_feature([]),
                cname: int64_feature([3]),
            })),
        ]
        serialized = [proto.SerializeToString() for proto in protos]

        feature_spec = {
            aname:
            parsing_ops.FixedLenSequenceFeature(
                shape=(2, 1),
                dtype=dtypes.float32,
                allow_missing=True,
                default_value=[]),
            bname:
            parsing_ops.FixedLenSequenceFeature(
                shape=(2, 1, 1), dtype=dtypes.string, allow_missing=True),
        }

        if context.executing_eagerly():
            expected_err = (errors_impl.InvalidArgumentError,
                            "Input to reshape is a tensor with 0 values")
        else:
            expected_err = (
                ValueError,
                "Cannot reshape a tensor with 0 elements to shape")

        self._test(
            ops.convert_to_tensor(serialized),
            feature_spec,
            expected_err=expected_err)
예제 #5
0
  def testCreateSequenceFeatureSpec(self):
    """Checks the sequence parsing spec built from a mixed set of columns.

    Sparse, embedding and weighted columns are expected to map to
    VarLenFeature entries; real-valued columns are expected to map to
    FixedLenSequenceFeature entries (or VarLenFeature when is_sparse=True).
    """
    hashed_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_source = fc.sparse_column_with_hash_bucket(
        "sparse_column_for_embedding", hash_bucket_size=10)
    embedding_col = fc.embedding_column(embedding_source, dimension=4)
    keyed_col = fc.sparse_column_with_keys(
        "id_column", ["marlo", "omar", "stringer"])
    weighted_col = fc.weighted_sparse_column(keyed_col, "id_weights_column")
    fixed_col = fc.real_valued_column("real_valued_column", dimension=2)
    fixed_default_col = fc.real_valued_column(
        "real_valued_default_column", dimension=5, default_value=3.0)
    var_len_sparse_col = fc._real_valued_var_len_column(
        "real_valued_var_len_column", default_value=3.0, is_sparse=True)
    var_len_dense_col = fc._real_valued_var_len_column(
        "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

    feature_columns = {
        hashed_col, embedding_col, weighted_col, fixed_col,
        fixed_default_col, var_len_sparse_col, var_len_dense_col,
    }

    feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)

    string_var_len = parsing_ops.VarLenFeature(dtype=dtypes.string)
    float_var_len = parsing_ops.VarLenFeature(dtype=dtypes.float32)
    expected_feature_spec = {
        "sparse_column": string_var_len,
        "sparse_column_for_embedding": string_var_len,
        "id_column": string_var_len,
        "id_weights_column": float_var_len,
        # No default value on the column: allow_missing=False is expected.
        "real_valued_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[2], dtype=dtypes.float32, allow_missing=False),
        # Column declares a default value: allow_missing=True is expected.
        "real_valued_default_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[5], dtype=dtypes.float32, allow_missing=True),
        "real_valued_var_len_column": float_var_len,
        "real_valued_var_len_dense_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[],
                dtype=dtypes.float32,
                allow_missing=True,
                default_value=4.0),
    }

    self.assertDictEqual(expected_feature_spec, feature_spec)
예제 #6
0
  def testRealValuedVarLenColumnDtypes(self):
    """Checks the parsing config of var-len real-valued columns per dtype."""
    # Sparse column without an explicit dtype: float32 VarLenFeature expected.
    sparse_float = fc._real_valued_var_len_column("rvc", is_sparse=True)
    expected_sparse_float = {
        "rvc": parsing_ops.VarLenFeature(dtype=dtypes.float32)
    }
    self.assertDictEqual(expected_sparse_float, sparse_float.config)

    # Dense column: the integer default 0 is expected back as 0.0.
    dense_float = fc._real_valued_var_len_column(
        "rvc", default_value=0, is_sparse=False)
    expected_dense_float = {
        "rvc":
            parsing_ops.FixedLenSequenceFeature(
                shape=[],
                dtype=dtypes.float32,
                allow_missing=True,
                default_value=0.0)
    }
    self.assertDictEqual(expected_dense_float, dense_float.config)

    # Sparse column with an explicit int32 dtype.
    sparse_int = fc._real_valued_var_len_column(
        "rvc", dtype=dtypes.int32, default_value=0, is_sparse=True)
    expected_sparse_int = {
        "rvc": parsing_ops.VarLenFeature(dtype=dtypes.int32)
    }
    self.assertDictEqual(expected_sparse_int, sparse_int.config)

    # A string dtype is expected to be rejected.
    with self.assertRaisesRegexp(TypeError,
                                 "dtype must be convertible to float"):
      fc._real_valued_var_len_column(
          "rvc", dtype=dtypes.string, default_value="", is_sparse=True)
 def _parse_example_spec(self):
     """Returns the parsing spec for this column.

     The result maps `self.key` to a `FixedLenSequenceFeature` built from
     `self.shape` and `self.dtype` with `allow_missing=True`.
     """
     key = self.key
     sequence_feature = parsing_ops.FixedLenSequenceFeature(
         shape=self.shape, dtype=self.dtype, allow_missing=True)
     return {key: sequence_feature}
    def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
        """Parses one Example mixing VarLen, Sparse and dense feature specs.

        Features "a" and "b" are absent from the proto, so their default
        values are the expected outputs.
        """
        proto = example(features=features({
            "c": float_feature([3, 4]),
            "d": float_feature([0.0, 1.0]),
            "val": bytes_feature([b"a", b"b"]),
            "idx": int64_feature([0, 3]),
            "st_a": float_feature([3.0, 4.0])
        }))
        serialized = proto.SerializeToString()

        # Sparse expectations are (indices, values, dense shape) triples.
        st_a_indices = np.array([[0], [1]], dtype=np.int64)
        st_a_values = np.array([3.0, 4.0], dtype=np.float32)
        st_a_shape = np.array([2], dtype=np.int64)  # max_values = 2
        expected_st_a = (st_a_indices, st_a_values, st_a_shape)

        sp_indices = np.array([[0], [3]], dtype=np.int64)
        sp_values = np.array(["a", "b"], dtype="|S")
        sp_shape = np.array([13], dtype=np.int64)  # max_values = 13
        expected_sp = (sp_indices, sp_values, sp_shape)

        a_default = [1, 2, 3]
        b_default = np.random.rand(3, 3).astype(bytes)
        expected_output = {
            "st_a": expected_st_a,
            "sp": expected_sp,
            "a": [a_default],
            "b": b_default,
            "c": np.array([3, 4], dtype=np.float32),
            "d": np.array([0.0, 1.0], dtype=np.float32),
        }

        feature_spec = {
            "st_a":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
            "sp":
            parsing_ops.SparseFeature(["idx"], "val", dtypes.string, [13]),
            "a":
            parsing_ops.FixedLenFeature(
                shape=(1, 3), dtype=dtypes.int64, default_value=a_default),
            "b":
            parsing_ops.FixedLenFeature(
                shape=(3, 3), dtype=dtypes.string, default_value=b_default),
            # Feature "c" must be provided, since it has no default_value.
            "c":
            parsing_ops.FixedLenFeature(shape=2, dtype=dtypes.float32),
            "d":
            parsing_ops.FixedLenSequenceFeature(
                shape=[], dtype=dtypes.float32, allow_missing=True),
        }

        parse_kwargs = {
            "serialized": ops.convert_to_tensor(serialized),
            "features": feature_spec,
        }
        self._test(parse_kwargs, expected_output)
  def testCreateFeatureSpec_ExperimentalColumns(self):
    """Checks the parsing spec for sparse and dense var-len real columns."""
    sparse_col = fc._real_valued_var_len_column(
        "real_valued_column0", is_sparse=True)
    dense_col = fc._real_valued_var_len_column(
        "real_valued_column1",
        dtype=dtypes.int64,
        default_value=0,
        is_sparse=False)
    feature_columns = {sparse_col, dense_col}

    expected_sparse = parsing_ops.VarLenFeature(dtype=dtypes.float32)
    expected_dense = parsing_ops.FixedLenSequenceFeature(
        shape=[], dtype=dtypes.int64, allow_missing=True, default_value=0)
    expected_config = {
        "real_valued_column0": expected_sparse,
        "real_valued_column1": expected_dense,
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)
예제 #10
0
  def testSequenceExampleListWithInconsistentDataFails(self):
    """Expects an error when one feature list mixes int64 and float steps."""
    # Step 0 holds int64 values; step 1 holds floats.
    mixed_steps = feature_list(
        [int64_feature([-1, 0]), float_feature([2, 3])])
    proto = sequence_example(
        feature_lists=feature_lists({"a": mixed_steps}))
    serialized = proto.SerializeToString()

    sequence_features = {
        "a": parsing_ops.FixedLenSequenceFeature(
            shape=(2,), dtype=dtypes.int64)
    }
    expected_message = ("Feature list: a, Index: 1."
                        "  Data types don't match. Expected type: int64")

    parse_kwargs = {
        "example_name": "in1",
        "serialized": ops.convert_to_tensor(serialized),
        "sequence_features": sequence_features,
    }
    self._test(
        parse_kwargs, expected_err=(errors_impl.OpError, expected_message))
예제 #11
0
  def testSequenceExampleListWithWrongShapeFails(self):
    """Expects an error when a step has 3 values but the spec shape is (2,)."""
    # Step 0 matches the requested shape; step 1 carries one value too many.
    uneven_steps = feature_list(
        [int64_feature([2, 3]), int64_feature([2, 3, 4])])
    proto = sequence_example(
        feature_lists=feature_lists({"a": uneven_steps}))
    serialized = proto.SerializeToString()

    sequence_features = {
        "a": parsing_ops.FixedLenSequenceFeature(
            shape=(2,), dtype=dtypes.int64)
    }
    # Written as a regular expression, hence the escaped brackets.
    expected_pattern = (r"Name: in1, Key: a, Index: 1."
                        r"  Number of int64 values != expected."
                        r"  values size: 3 but output shape: \[2\]")

    parse_kwargs = {
        "example_name": "in1",
        "serialized": ops.convert_to_tensor(serialized),
        "sequence_features": sequence_features,
    }
    self._test(
        parse_kwargs, expected_err=(errors_impl.OpError, expected_pattern))
예제 #12
0
 def testCreateFeatureSpec_RealValuedColumnWithDefaultValue(self):
   """Checks the parsing spec of real-valued columns that declare defaults.

   Scalar defaults are expected to be repeated to the column dimension and
   integer defaults on float32 columns are expected back as floats.
   """
   scalar_default_col = fc.real_valued_column(
       "real_valued_column1", default_value=2)
   broadcast_default_col = fc.real_valued_column(
       "real_valued_column2", dimension=5, default_value=4)
   list_default_col = fc.real_valued_column(
       "real_valued_column3", default_value=[8])
   vector_default_col = fc.real_valued_column(
       "real_valued_column4", dimension=3, default_value=[1, 0, 6])
   var_len_sparse_col = fc._real_valued_var_len_column(
       "real_valued_column5", default_value=2, is_sparse=True)
   var_len_dense_col = fc._real_valued_var_len_column(
       "real_valued_column6",
       dtype=dtypes.int64,
       default_value=1,
       is_sparse=False)
   feature_columns = [
       scalar_default_col, broadcast_default_col, list_default_col,
       vector_default_col, var_len_sparse_col, var_len_dense_col,
   ]

   config = fc.create_feature_spec_for_parsing(feature_columns)
   self.assertEqual(6, len(config))

   expected_config = {
       "real_valued_column1":
           parsing_ops.FixedLenFeature(
               shape=[1], dtype=dtypes.float32, default_value=[2.]),
       "real_valued_column2":
           parsing_ops.FixedLenFeature(
               shape=[5],
               dtype=dtypes.float32,
               default_value=[4., 4., 4., 4., 4.]),
       "real_valued_column3":
           parsing_ops.FixedLenFeature(
               shape=[1], dtype=dtypes.float32, default_value=[8.]),
       "real_valued_column4":
           parsing_ops.FixedLenFeature(
               shape=[3], dtype=dtypes.float32, default_value=[1., 0., 6.]),
       "real_valued_column5":
           parsing_ops.VarLenFeature(dtype=dtypes.float32),
       "real_valued_column6":
           parsing_ops.FixedLenSequenceFeature(
               shape=[],
               dtype=dtypes.int64,
               allow_missing=True,
               default_value=1),
   }
   self.assertDictEqual(expected_config, config)
예제 #13
0
  def testSequenceExampleWithMissingFeatureListFails(self):
    """Expects an error when a requested feature list is not in the proto."""
    # The SequenceExample carries no feature lists at all.
    proto = sequence_example(feature_lists=feature_lists({}))

    sequence_features = {
        "a": parsing_ops.FixedLenSequenceFeature(
            shape=(2,), dtype=dtypes.int64)
    }
    # Parsing is expected to fail because we didn't add:
    #  feature_list_dense_defaults = {"a": None}
    expected_message = (
        "Name: in1, Feature list 'a' is required but could not be found."
        "  Did you mean to include it in"
        " feature_list_dense_missing_assumed_empty or"
        " feature_list_dense_defaults?")

    parse_kwargs = {
        "example_name": "in1",
        "serialized": ops.convert_to_tensor(proto.SerializeToString()),
        "sequence_features": sequence_features,
    }
    self._test(
        parse_kwargs, expected_err=(errors_impl.OpError, expected_message))
예제 #14
0
    def testVaryingFieldsInGenerator(self):
        """Feeds python_input from a generator whose sequence length varies.

        The generator yields two items with "seqlen_value" of shape (0, 1)
        and (1, 1); "empty_value" is never yielded and is expected to take
        its declared default.
        """
        def simple_generator():
            step = 0
            while step < 2:
                yield {"value": step, "seqlen_value": np.ones((step, 1))}
                step += 1

        simple_features = {
            "value":
            parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32),
            "seqlen_value":
            parsing_ops.FixedLenSequenceFeature(
                shape=[1], dtype=dtypes.float32, allow_missing=True),
            "empty_value":
            parsing_ops.FixedLenFeature(
                default_value=[-1, -2], dtype=dtypes.int32, shape=[2]),
        }
        tensors = python_input.python_input(simple_generator, simple_features)

        # Static dtype and shape checks on the returned tensors.
        self.assertEqual({"value", "seqlen_value", "empty_value"},
                         set(tensors.keys()))
        value_tensor = tensors["value"]
        self.assertEqual(dtypes.int32, value_tensor.dtype)
        self.assertEqual((), value_tensor.shape)
        seqlen_tensor = tensors["seqlen_value"]
        self.assertEqual(dtypes.float32, seqlen_tensor.dtype)
        self.assertEqual([None, 1], seqlen_tensor.shape.as_list())
        empty_tensor = tensors["empty_value"]
        self.assertEqual(dtypes.int32, empty_tensor.dtype)
        self.assertEqual([2], empty_tensor.shape)

        with self.test_session() as sess:
            # First run corresponds to the generator's first item.
            first = sess.run(tensors)
            self.assertAllEqual(0, first["value"])
            self.assertAllEqual(np.ones((0, 1)), first["seqlen_value"])
            self.assertAllEqual([-1, -2], first["empty_value"])

            # Second run corresponds to the generator's second item.
            second = sess.run(tensors)
            self.assertAllEqual(1, second["value"])
            self.assertAllEqual([[1]], second["seqlen_value"])
            self.assertAllEqual([-1, -2], second["empty_value"])

            # The generator is exhausted, so a third run is expected to raise.
            with self.assertRaisesOpError("Iteration finished"):
                sess.run(tensors)
예제 #15
0
  def testSequenceExampleListWithWrongSparseDataTypeFails(self):
    """Expects an error when the third step of an int64 list holds floats."""
    # Steps 0 and 1 are int64; step 2 is a float list.
    steps = feature_list([
        int64_feature([3, 4]),
        int64_feature([1, 2]),
        float_feature([2.0, 3.0]),
    ])
    proto = sequence_example(feature_lists=feature_lists({"a": steps}))
    serialized = proto.SerializeToString()

    sequence_features = {
        "a": parsing_ops.FixedLenSequenceFeature(
            shape=(2,), dtype=dtypes.int64)
    }
    expected_message = ("Name: in1, Feature list: a, Index: 2."
                        "  Data types don't match. Expected type: int64"
                        "  Feature is: float_list")

    parse_kwargs = {
        "example_name": "in1",
        "serialized": ops.convert_to_tensor(serialized),
        "sequence_features": sequence_features,
    }
    self._test(
        parse_kwargs, expected_err=(errors_impl.OpError, expected_message))
예제 #16
0
def parse_from_sequence_example(serialized,
                                list_size,
                                context_feature_spec=None,
                                example_feature_spec=None):
    """Parses serialized SequenceExamples into a single feature map.

    Context features are returned as parsed.  Every per-frame feature is
    sliced or padded along its second dimension so that it has exactly
    `list_size` frames.

    Args:
      serialized: (Tensor) A string Tensor for a batch of serialized
        SequenceExample.
      list_size: (int) Number of frames every output feature should have.
        This normalizes output tensor shapes across batches.
      context_feature_spec: (dict) A mapping from feature keys to
        `FixedLenFeature` or `VarLenFeature` values for context.
      example_feature_spec: (dict) A mapping from feature keys to
        `FixedLenFeature` or `VarLenFeature` values for the list of examples,
        which are stored in the `feature_lists` field of SequenceExample.
        Each `FixedLenFeature` is translated to a `FixedLenSequenceFeature`
        for parsing.  Note that no missing value in the middle of a
        `feature_list` is allowed for frames.

    Returns:
      A mapping from feature keys to `Tensor` or `SparseTensor`.
    """
    # Swap each `FixedLenFeature` for a `FixedLenSequenceFeature` of the same
    # shape and dtype so it can be read from `feature_lists`.
    # TODO(xuanhui): Handle missing feature_list since allow_missing=True.
    converted_specs = {}
    for key, spec in six.iteritems(example_feature_spec):
        if isinstance(spec, parsing_ops.FixedLenFeature):
            converted_specs[key] = parsing_ops.FixedLenSequenceFeature(
                spec.shape, spec.dtype, allow_missing=True)
    sequence_features = example_feature_spec.copy()
    sequence_features.update(converted_specs)

    context, examples, _ = parsing_ops.parse_sequence_example(
        serialized,
        context_features=context_feature_spec,
        sequence_features=sequence_features)

    features = dict(context)
    # Normalize every example feature from [batch_size, num_frames, ...] to
    # [batch_size, list_size, ...] by slicing or padding.
    for name, parsed in six.iteritems(examples):
        # Current dynamic shape: [batch_size, num_frames, ...]
        dims = array_ops.unstack(array_ops.shape(parsed))
        rank = len(dims)
        frames = dims[1]
        # Target dynamic shape: [batch_size, list_size, ...]
        target_shape = array_ops.concat(
            [[dims[0], list_size], dims[2:]], 0)

        # Loop variables are bound as default arguments so each closure keeps
        # the values of the iteration that created it.
        def slice_fn(parsed=parsed, rank=rank, target_shape=target_shape):
            """Keeps only the leading `list_size` frames."""
            origin = [0] * rank
            if isinstance(parsed, sparse_tensor.SparseTensor):
                return sparse_ops.sparse_slice(
                    parsed, origin, math_ops.to_int64(target_shape))
            return array_ops.slice(parsed, origin, target_shape)

        def pad_fn(name=name,
                   parsed=parsed,
                   rank=rank,
                   frames=frames,
                   target_shape=target_shape):
            """Extends the frame dimension up to `list_size`."""
            if isinstance(parsed, sparse_tensor.SparseTensor):
                return sparse_ops.sparse_reset_shape(parsed, target_shape)
            # `paddings` has one [before, after] row per dimension of the
            # padded tensor; only the frame dimension grows.
            paddings = array_ops.stack(
                [[0, 0], [0, list_size - frames]] + [[0, 0]] * (rank - 2))
            # Dense features are padded with the first element of the
            # default value declared in the caller's spec.
            fill_value = array_ops.squeeze(
                example_feature_spec[name].default_value[0])
            return array_ops.pad(parsed, paddings, constant_values=fill_value)

        tensor = control_flow_ops.cond(frames > list_size, slice_fn, pad_fn)
        # Record the static shape for dense tensors.
        if not isinstance(tensor, sparse_tensor.SparseTensor):
            static_shape = parsed.get_shape().as_list()
            static_shape[1] = list_size
            tensor.set_shape(static_shape)
        features[name] = tensor
    return features
예제 #17
0
  def testSerializedContainingVarLenDense(self):
    """Parses var-len dense features and checks several invalid-spec errors.

    Four examples are serialized.  The first two checks compare parsed
    output (with and without a custom padding value for "a"); the remaining
    checks expect specific errors for specs that do not fit the data or are
    unsupported.
    """
    aname = "a"
    bname = "b"
    cname = "c"
    dname = "d"
    protos = [
        example(features=features({
            cname: int64_feature([2]),
        })),
        example(features=features({
            aname: float_feature([1, 1]),
            bname: bytes_feature([b"b0_str", b"b1_str"]),
        })),
        example(features=features({
            aname: float_feature([-1, -1, 2, 2]),
            bname: bytes_feature([b"b1"]),
        })),
        example(features=features({
            aname: float_feature([]),
            cname: int64_feature([3]),
        })),
    ]
    serialized = [proto.SerializeToString() for proto in protos]

    def all_dense_specs():
      """Builds fresh allow_missing specs for features a, b, c and d."""
      return {
          aname:
              parsing_ops.FixedLenSequenceFeature(
                  shape=(2, 1), dtype=dtypes.float32, allow_missing=True),
          bname:
              parsing_ops.FixedLenSequenceFeature(
                  shape=(1, 1, 1), dtype=dtypes.string, allow_missing=True),
          cname:
              parsing_ops.FixedLenSequenceFeature(
                  shape=[], dtype=dtypes.int64, allow_missing=True),
          dname:
              parsing_ops.FixedLenSequenceFeature(
                  shape=[], dtype=dtypes.string, allow_missing=True),
      }

    def mismatched_b_specs(a_spec):
      """Pairs `a_spec` with a spec for b whose shape is (2, 1, 1)."""
      return {
          aname:
              a_spec,
          bname:
              parsing_ops.FixedLenSequenceFeature(
                  shape=(2, 1, 1), dtype=dtypes.string, allow_missing=True),
      }

    expected_a = np.array(
        [
            [0, 0, 0, 0],
            [1, 1, 0, 0],
            [-1, -1, 2, 2],
            [0, 0, 0, 0],
        ],
        dtype=np.float32).reshape(4, 2, 2, 1)
    expected_b = np.array(
        [["", ""], ["b0_str", "b1_str"], ["b1", ""], ["", ""]],
        dtype=bytes).reshape(4, 2, 1, 1, 1)
    expected_c = np.array([2, 0, 0, 3], dtype=np.int64).reshape(4, 1)
    expected_d = np.empty(shape=(4, 0), dtype=bytes)
    expected_output = {
        aname: expected_a,
        bname: expected_b,
        cname: expected_c,
        dname: expected_d,
    }

    self._test(
        ops.convert_to_tensor(serialized),
        all_dense_specs(),
        expected_values=expected_output)

    # Test with padding values.
    expected_output_custom_padding = dict(expected_output)
    expected_output_custom_padding[aname] = np.array(
        [
            [-2, -2, -2, -2],
            [1, 1, -2, -2],
            [-1, -1, 2, 2],
            [-2, -2, -2, -2],
        ],
        dtype=np.float32).reshape(4, 2, 2, 1)

    custom_padding_specs = all_dense_specs()
    custom_padding_specs[aname] = parsing_ops.FixedLenSequenceFeature(
        shape=(2, 1),
        dtype=dtypes.float32,
        allow_missing=True,
        default_value=-2.0)
    self._test(
        ops.convert_to_tensor(serialized), custom_padding_specs,
        expected_output_custom_padding)

    # Change number of required values so the inputs are not a
    # multiple of this size.
    plain_a_spec = parsing_ops.FixedLenSequenceFeature(
        shape=(2, 1), dtype=dtypes.float32, allow_missing=True)
    self._test(
        ops.convert_to_tensor(serialized),
        mismatched_b_specs(plain_a_spec),
        expected_err=(
            errors_impl.OpError, "Key: b, Index: 2.  "
            "Number of bytes values is not a multiple of stride length."))

    # An empty default_value for "a" is expected to be rejected.
    empty_default_a_spec = parsing_ops.FixedLenSequenceFeature(
        shape=(2, 1),
        dtype=dtypes.float32,
        allow_missing=True,
        default_value=[])
    self._test(
        ops.convert_to_tensor(serialized),
        mismatched_b_specs(empty_default_a_spec),
        expected_err=(ValueError,
                      "Cannot reshape a tensor with 0 elements to shape"))

    # A FixedLenFeature with an unknown first dimension is expected to fail.
    unknown_first_dim_a_spec = parsing_ops.FixedLenFeature(
        shape=(None, 2, 1), dtype=dtypes.float32)
    self._test(
        ops.convert_to_tensor(serialized),
        mismatched_b_specs(unknown_first_dim_a_spec),
        expected_err=(ValueError,
                      "First dimension of shape for feature a unknown. "
                      "Consider using FixedLenSequenceFeature."))

    # A FixedLenFeature with an unknown inner dimension is expected to fail.
    unknown_inner_dim_specs = {
        cname:
            parsing_ops.FixedLenFeature(
                shape=(1, None), dtype=dtypes.int64, default_value=[[1]]),
    }
    self._test(
        ops.convert_to_tensor(serialized),
        unknown_inner_dim_specs,
        expected_err=(ValueError,
                      "All dimensions of shape for feature c need to be known "
                      r"but received \(1, None\)."))

    # allow_missing=False on "c" is expected to be reported as unsupported.
    required_c_specs = all_dense_specs()
    required_c_specs[cname] = parsing_ops.FixedLenSequenceFeature(
        shape=[], dtype=dtypes.int64, allow_missing=False)
    self._test(
        ops.convert_to_tensor(serialized),
        required_c_specs,
        expected_err=(ValueError,
                      "Unsupported: FixedLenSequenceFeature requires "
                      "allow_missing to be True."))
예제 #18
0
파일: dataset.py 프로젝트: wenkesj/alchemy
def ReplayDataset(replay_stream, max_sequence_length=200, name=None):
    """Creates a `tf.data.Dataset` from a `ay.contrib.rl.ReplayStream` instance.

    Every element is produced by calling
    `replay_stream.read(limit=max_sequence_length)`, serializing the returned
    proto, parsing it back with `parse_single_sequence_example`, casting each
    parsed feature to the dtype reported by the stream, and finally padding or
    truncating each per-step feature to `max_sequence_length` along axis 0.

    Arguments:
      replay_stream: `ay.contrib.rl.ReplayStream` instance. Must implement
        `replay_stream.read`, which is called as
        `replay_stream.read(limit=max_sequence_length)` each time the dataset
        requests an element. The method should return `None` or raise (e.g.
        `tf.errors.OutOfRangeError`) when the stream is done; otherwise it
        should return a `tf.SequenceExample` proto.
      max_sequence_length: Number of steps every per-step feature is padded
        or truncated to; also the upper bound applied to `sequence_length`.
      name: Optional name scope. Defaults to `'replay_dataset'`.

    Returns:
      A `tf.data.Dataset` whose elements are built with `experience.Replay`,
      or with `experience.ReplayWithValues` when a `'value'` feature is
      present (i.e. when `replay_stream.with_values` is truthy).

    Raises:
      Whatever `assert_utils.assert_true` raises when `replay_stream` is not a
        `ay.contrib.rl.ReplayStream`.
      NOTE(review): end-of-stream is reported through the `Assert` op in the
        parse step, which presumably surfaces as
        `tf.errors.InvalidArgumentError` while the dataset is iterated rather
        than the `tf.errors.OutOfRangeError` historically documented here --
        confirm against callers before relying on either.
    """
    assert_utils.assert_true(
        isinstance(replay_stream, streams.ReplayStream),
        '`replay_stream` must be an instance of `ay.contrib.rl.ReplayStream`')

    with ops.name_scope(name or 'replay_dataset'):
        state_shape = list(replay_stream.state_shape)
        state_dtype = replay_stream.state_dtype
        action_shape = list(replay_stream.action_shape)
        action_dtype = replay_stream.action_dtype
        action_value_shape = list(replay_stream.action_value_shape)
        action_value_dtype = replay_stream.action_value_dtype
        reward_shape = list(replay_stream.reward_shape)
        reward_dtype = replay_stream.reward_dtype

        # Dtypes the parsed features are cast back to after parsing.
        replay_dtypes = {
            'state': state_dtype,
            'next_state': state_dtype,
            'action': action_dtype,
            'action_value': action_value_dtype,
            'reward': reward_dtype,
            'terminal': dtypes.bool,
            'sequence_length': dtypes.int32,
        }

        if replay_stream.with_values:
            replay_dtypes['value'] = reward_dtype

        def convert_to_safe_feature_type(dtype):
            """Map `dtype` to the dtype used for it in the serialized proto."""
            return type_utils.safe_tf_dtype(
                serialize.type_to_feature[dtype][-1])

        def sequence_feature(shape, dtype):
            """Build the parsing spec for one per-step feature."""
            return parsing_ops.FixedLenSequenceFeature(
                shape=shape, dtype=convert_to_safe_feature_type(dtype))

        replay_features = {
            'state': sequence_feature(state_shape, state_dtype),
            'next_state': sequence_feature(state_shape, state_dtype),
            'action': sequence_feature(action_shape, action_dtype),
            'action_value': sequence_feature(action_value_shape,
                                             action_value_dtype),
            'reward': sequence_feature(reward_shape, reward_dtype),
            'terminal': sequence_feature([], dtypes.bool),
            'sequence_length': sequence_feature([], dtypes.int32),
        }

        if replay_stream.with_values:
            replay_features['value'] = sequence_feature(reward_shape,
                                                        reward_dtype)

        def convert_and_fix_dtypes(replay):
            """Cast dtypes back to their original types."""
            return {
                key: math_ops.cast(tensor, dtype=replay_dtypes[key])
                for key, tensor in replay.items()
            }

        def generator():
            """Yield serialized examples, or an empty string once exhausted."""
            while True:
                try:
                    replay_example = replay_stream.read(
                        limit=max_sequence_length)
                except Exception:  # pylint: disable=broad-except
                    # Any read failure is treated as end-of-stream, as before,
                    # but KeyboardInterrupt/SystemExit are no longer swallowed.
                    replay_example = None

                if replay_example is None:
                    # `read` is documented to return `None` when the stream is
                    # done; emit the same sentinel used for a raised error
                    # instead of failing on `None.SerializeToString()`.
                    yield ""
                else:
                    yield replay_example.SerializeToString()

        def serialize_map(replay_example_str):
            """Parse each example string to `tf.Tensor`."""
            try:
                # Use an explicit element-wise comparison: `tensor != ""` is
                # a plain Python comparison on TensorFlow versions that do
                # not overload tensor (in)equality.
                assert_op = control_flow_ops.Assert(
                    math_ops.not_equal(replay_example_str, ""),
                    [replay_example_str])
                with ops.control_dependencies([assert_op]):
                    _, replay = parsing_ops.parse_single_sequence_example(
                        replay_example_str, sequence_features=replay_features)
            except errors_impl.InvalidArgumentError:
                # `OutOfRangeError` requires (node_def, op, message); calling
                # it without arguments would raise a `TypeError` instead.
                raise errors_impl.OutOfRangeError(
                    None, None, 'The replay stream is exhausted.')

            return convert_and_fix_dtypes(replay)

        def pad_or_truncate_map(replay):
            """Truncate or pad replays."""
            with_values = 'value' in replay

            if with_values:
                replay = experience.ReplayWithValues(**replay)
            else:
                replay = experience.Replay(**replay)

            def fit_to_length(tensor, feature_shape, pad_value=0):
                """Pad/truncate axis 0 and pin the resulting static shape."""
                fitted = sequence_utils.pad_or_truncate(tensor,
                                                        max_sequence_length,
                                                        axis=0,
                                                        pad_value=pad_value)
                fitted.set_shape([max_sequence_length] + feature_shape)
                return fitted

            sequence_length = math_ops.minimum(max_sequence_length,
                                               replay.sequence_length)
            sequence_length.set_shape([1])

            state = fit_to_length(replay.state, state_shape)
            next_state = fit_to_length(replay.next_state, state_shape)
            action = fit_to_length(replay.action, action_shape)
            action_value = fit_to_length(replay.action_value,
                                         action_value_shape)
            reward = fit_to_length(replay.reward, reward_shape)
            terminal = fit_to_length(replay.terminal, [],
                                     pad_value=ops.convert_to_tensor(False))

            if with_values:
                value = fit_to_length(replay.value, reward_shape)

                return experience.ReplayWithValues(
                    state=state,
                    next_state=next_state,
                    action=action,
                    action_value=action_value,
                    value=value,
                    reward=reward,
                    terminal=terminal,
                    sequence_length=sequence_length)

            return experience.Replay(state=state,
                                     next_state=next_state,
                                     action=action,
                                     action_value=action_value,
                                     reward=reward,
                                     terminal=terminal,
                                     sequence_length=sequence_length)

        dataset = dataset_ops.Dataset.from_generator(generator, dtypes.string)
        dataset = dataset.map(serialize_map)
        return dataset.map(pad_or_truncate_map)
예제 #19
0
  def testSequenceExampleWithSparseAndDenseFeatureLists(self):
    """Checks a SequenceExample mixing dense and sparse feature lists.

    The serialized proto carries the dense feature list "a" and the ragged
    feature lists "st_a" and "st_b". "st_c" is requested in the parsing spec
    but never written to the proto, so its expected sparse triple is empty.
    """
    int64 = np.int64

    serialized = sequence_example(feature_lists=feature_lists({
        "a": feature_list([int64_feature([3, 4]), int64_feature([1, 0])]),
        "st_a": feature_list([
            float_feature([3.0, 4.0]),
            float_feature([5.0]),
            float_feature([]),
        ]),
        "st_b": feature_list([
            bytes_feature([b"a"]),
            bytes_feature([]),
            bytes_feature([]),
            bytes_feature([b"b", b"c"]),
        ]),
    })).SerializeToString()

    # Every sparse expectation below is an (indices, values, shape) triple.
    sparse_a = (
        np.array([[0, 0], [0, 1], [1, 0]], dtype=int64),
        np.array([3.0, 4.0, 5.0], dtype=np.float32),
        np.array([3, 2], dtype=int64),  # num_time = 3, max_feat = 2
    )
    sparse_b = (
        np.array([[0, 0], [3, 0], [3, 1]], dtype=int64),
        np.array(["a", "b", "c"], dtype="|S"),
        np.array([4, 2], dtype=int64),  # num_time = 4, max_feat = 2
    )
    sparse_c = (
        np.empty((0, 2), dtype=int64),
        np.empty((0,), dtype=int64),
        np.array([0, 0], dtype=int64),  # num_time = 0, max_feat = 0
    )

    expected_by_key = {
        "a": np.array([[3, 4], [1, 0]], dtype=int64),
        "st_a": sparse_a,
        "st_b": sparse_b,
        "st_c": sparse_c,
    }

    parse_kwargs = {
        "example_name": "in1",
        "serialized": ops.convert_to_tensor(serialized),
        "sequence_features": {
            "st_a": parsing_ops.VarLenFeature(dtypes.float32),
            "st_b": parsing_ops.VarLenFeature(dtypes.string),
            "st_c": parsing_ops.VarLenFeature(dtypes.int64),
            "a": parsing_ops.FixedLenSequenceFeature((2,), dtypes.int64),
        },
    }
    self._test(parse_kwargs, expected_feat_list_values=expected_by_key)
예제 #20
0
  def testCreateFeatureSpec(self):
    """Checks the parsing spec derived from a mixed collection of columns.

    One column of each kind built below is handed to
    `fc.create_feature_spec_for_parsing`, first as a set and then as a dict
    with arbitrary string keys; both calls must yield `expected_spec`.
    """
    hashed = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedded = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_keyed = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_keyed = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_keyed = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted = fc.weighted_sparse_column(str_keyed, "str_id_weights_column")
    real1 = fc.real_valued_column("real_valued_column1")
    real2 = fc.real_valued_column("real_valued_column2", 5)
    real3 = fc._real_valued_var_len_column(
        "real_valued_column3", is_sparse=True)
    real4 = fc._real_valued_var_len_column(
        "real_valued_column4",
        dtype=dtypes.int64,
        default_value=0,
        is_sparse=False)
    bucketized1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    cross_a = fc.sparse_column_with_hash_bucket(
        "cross_aaa", hash_bucket_size=100)
    cross_b = fc.sparse_column_with_hash_bucket(
        "cross_bbb", hash_bucket_size=100)
    crossed = fc.crossed_column(
        set([cross_a, cross_b]), hash_bucket_size=10000)
    one_hot = fc.one_hot_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)

    # `str_keyed` is only reachable through `weighted`.
    columns = set([
        hashed, embedded, weighted, int32_keyed, int64_keyed, real1, real2,
        real3, real4, bucketized1, bucketized2, crossed, one_hot, scattered
    ])

    var_len = parsing_ops.VarLenFeature
    fixed_len = parsing_ops.FixedLenFeature
    expected_spec = {
        "sparse_column": var_len(dtypes.string),
        "sparse_column_for_embedding": var_len(dtypes.string),
        "str_id_column": var_len(dtypes.string),
        "int32_id_column": var_len(dtypes.int32),
        "int64_id_column": var_len(dtypes.int64),
        "str_id_weights_column": var_len(dtypes.float32),
        "real_valued_column1": fixed_len([1], dtype=dtypes.float32),
        "real_valued_column2": fixed_len([5], dtype=dtypes.float32),
        "real_valued_column3": var_len(dtype=dtypes.float32),
        "real_valued_column4": parsing_ops.FixedLenSequenceFeature(
            [], dtype=dtypes.int64, allow_missing=True, default_value=0),
        "real_valued_column_for_bucketization1": fixed_len(
            [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2": fixed_len(
            [4], dtype=dtypes.float32),
        "cross_aaa": var_len(dtypes.string),
        "cross_bbb": var_len(dtypes.string),
        "sparse_column_for_one_hot": var_len(dtypes.string),
        "scattered_embedding_column": var_len(dtypes.string),
    }

    spec_from_set = fc.create_feature_spec_for_parsing(columns)
    self.assertDictEqual(expected_spec, spec_from_set)

    # Test that the same config is parsed out if we pass a dictionary.
    columns_by_key = {}
    for position, column in enumerate(columns):
      columns_by_key[str(position)] = column
    spec_from_dict = fc.create_feature_spec_for_parsing(columns_by_key)
    self.assertDictEqual(expected_spec, spec_from_dict)
    def testSerializedContainingVarLenDense(self):
        """Parses single examples whose dense features vary in length.

        Four examples are parsed one at a time with
        `FixedLenSequenceFeature(..., allow_missing=True)` specs for the keys
        "a", "b", "c" and "d" and compared with the per-example expectations.
        The remaining calls check the error reported for unusable specs.
        """
        aname, bname, cname, dname = "a", "b", "c", "d"
        seq_feature = parsing_ops.FixedLenSequenceFeature

        def float_pairs():
            """Spec for `aname`: a sequence of (2, 1) float32 blocks."""
            return seq_feature(
                (2, 1), dtype=dtypes.float32, allow_missing=True)

        def string_seq(shape):
            """Spec for a string sequence with per-step shape `shape`."""
            return seq_feature(shape, dtype=dtypes.string, allow_missing=True)

        def all_features(c_allow_missing=True):
            """Fresh spec dict covering all four keys."""
            return {
                aname: float_pairs(),
                bname: string_seq((1, 1, 1)),
                cname: seq_feature(
                    shape=[],
                    dtype=dtypes.int64,
                    allow_missing=c_allow_missing),
                dname: string_seq([]),
            }

        def expected(a=None, b=None, c=None):
            """Expected output dict; omitted keys get an empty array."""
            if a is None:
                a = np.empty(shape=(0, 2, 1), dtype=np.int64)
            if b is None:
                b = np.empty(shape=(0, 1, 1, 1), dtype=bytes)
            if c is None:
                c = np.empty(shape=(0, ), dtype=np.int64)
            return {
                aname: a,
                bname: b,
                cname: c,
                dname: np.empty(shape=(0, ), dtype=bytes),
            }

        examples = [
            example(features=features({cname: int64_feature([2])})),
            example(features=features({
                aname: float_feature([1, 1]),
                bname: bytes_feature([b"b0_str", b"b1_str"]),
            })),
            example(features=features({
                aname: float_feature([-1, -1, 2, 2]),
                bname: bytes_feature([b"b1"]),
            })),
            example(features=features({
                aname: float_feature([]),
                cname: int64_feature([3]),
            })),
        ]

        wanted = [
            expected(c=np.array([2], dtype=np.int64)),
            expected(
                a=np.array([[[1], [1]]], dtype=np.float32),
                b=np.array(["b0_str", "b1_str"],
                           dtype=bytes).reshape(2, 1, 1, 1)),
            expected(
                a=np.array([[[-1], [-1]], [[2], [2]]], dtype=np.float32),
                b=np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1)),
            expected(c=np.array([3], dtype=np.int64)),
        ]

        def check_every_example():
            """Parse each example alone and compare with its expectation."""
            for proto, want in zip(examples, wanted):
                parse_kwargs = {
                    "serialized":
                    ops.convert_to_tensor(proto.SerializeToString()),
                    "features":
                    all_features(),
                }
                self._test(parse_kwargs, want)

        check_every_example()

        # Test with padding values.
        # NOTE(mrry): Since we parse a single example at a time, the fixed-length
        # sequences will not be padded, and the padding value will be ignored.
        check_every_example()

        # Change number of required values so the inputs are not a
        # multiple of this size.
        third_serialized = ops.convert_to_tensor(
            examples[2].SerializeToString())
        self._test(
            {
                "serialized": third_serialized,
                "features": {
                    aname: float_pairs(),
                    bname: string_seq((2, 1, 1)),
                },
            },
            # TODO(mrry): Consider matching the `tf.parse_example()` error message.
            expected_err=(errors_impl.OpError, "Key: b."))

        # An empty default value cannot be reshaped to the feature shape.
        self._test(
            {
                "serialized": ops.convert_to_tensor(""),
                "features": {
                    aname: seq_feature(
                        (2, 1),
                        dtype=dtypes.float32,
                        allow_missing=True,
                        default_value=[]),
                    bname: string_seq((2, 1, 1)),
                },
            },
            expected_err=(ValueError,
                          "Cannot reshape a tensor with 0 elements to shape"))

        # FixedLenFeature with an unknown leading dimension.
        self._test(
            {
                "serialized": ops.convert_to_tensor(""),
                "features": {
                    aname: parsing_ops.FixedLenFeature(
                        (None, 2, 1), dtype=dtypes.float32),
                    bname: string_seq((2, 1, 1)),
                },
            },
            expected_err=(ValueError,
                          "First dimension of shape for feature a unknown. "
                          "Consider using FixedLenSequenceFeature."))

        # FixedLenFeature with an unknown non-leading dimension.
        self._test(
            {
                "serialized": ops.convert_to_tensor(""),
                "features": {
                    cname: parsing_ops.FixedLenFeature(
                        (1, None), dtype=dtypes.int64, default_value=[[1]]),
                },
            },
            expected_err=(
                ValueError,
                "All dimensions of shape for feature c need to be known "
                r"but received \(1, None\)."))

        # FixedLenSequenceFeature with allow_missing=False for `cname`.
        self._test(
            {
                "serialized": ops.convert_to_tensor(""),
                "features": all_features(c_allow_missing=False),
            },
            expected_err=(ValueError,
                          "Unsupported: FixedLenSequenceFeature requires "
                          "allow_missing to be True."))