Пример #1
0
def read_records(index=0):
  """Build an input pipeline reading one of the TFRecord splits.

  Args:
    index: which split queue to read from (0=train, 1=validation, 2=test).

  Returns:
    Tuple (document_batch, document_weights, query_batch, query_weights,
    answer_batch) for one shuffled batch.
  """
  # One epoch-limited filename queue per split; FLAGS.epochs bounds how many
  # times each file is re-read before OutOfRange is raised.
  train_queue = tf.train.string_input_producer(['training.tfrecords'], num_epochs=FLAGS.epochs)
  validation_queue = tf.train.string_input_producer(['validation.tfrecords'], num_epochs=FLAGS.epochs)
  test_queue = tf.train.string_input_producer(['test.tfrecords'], num_epochs=FLAGS.epochs)

  # Select the active queue at graph-run time via `index`.
  queue = tf.QueueBase.from_list(index, [train_queue, validation_queue, test_queue])
  reader = tf.TFRecordReader()
  _, serialized_example = reader.read(queue)
  features = tf.parse_single_example(
      serialized_example,
      features={
        'document': tf.VarLenFeature(tf.int64),
        'query': tf.VarLenFeature(tf.int64),
        'answer': tf.FixedLenFeature([], tf.int64)
      })
  # Serialize the variable-length sparse features into fixed-shape 1-D string
  # tensors so shuffle_batch can queue them.
  document = sparse_ops.serialize_sparse(features['document'])
  query = sparse_ops.serialize_sparse(features['query'])
  answer = features['answer']

  document_batch_serialized, query_batch_serialized, answer_batch = tf.train.shuffle_batch(
      [document, query, answer], batch_size=FLAGS.batch_size,
      capacity=2000,
      min_after_dequeue=1000)

  sparse_document_batch = sparse_ops.deserialize_many_sparse(document_batch_serialized, dtype=tf.int64)
  sparse_query_batch = sparse_ops.deserialize_many_sparse(query_batch_serialized, dtype=tf.int64)

  document_batch = tf.sparse_tensor_to_dense(sparse_document_batch)
  # BUG FIX: tf.shape(sparse_tensor) does not yield a SparseTensor's dense
  # shape; use the SparseTensor's own .dense_shape (already int64) instead.
  # Weights mark real tokens with 1, zero-padding stays 0.
  document_weights = tf.sparse_to_dense(sparse_document_batch.indices, sparse_document_batch.dense_shape, 1)

  query_batch = tf.sparse_tensor_to_dense(sparse_query_batch)
  query_weights = tf.sparse_to_dense(sparse_query_batch.indices, sparse_query_batch.dense_shape, 1)

  return document_batch, document_weights, query_batch, query_weights, answer_batch
Пример #2
0
def read_records(index=0):
    """Read one shuffled batch from the TFRecord split selected by `index`.

    Args:
        index: 0 for training, 1 for validation, 2 for test.

    Returns:
        (document_batch, document_weights, query_batch, query_weights,
        answer_batch) tensors for a single shuffled batch.
    """
    # One epoch-limited filename queue per data split.
    split_queues = [
        tf.train.string_input_producer([filename], num_epochs=FLAGS.epochs)
        for filename in
        ['training.tfrecords', 'validation.tfrecords', 'test.tfrecords']
    ]

    # Pick the active split at run time.
    selected_queue = tf.QueueBase.from_list(index, split_queues)

    # Pull one serialized record off the queue and decode its features.
    reader = tf.TFRecordReader()
    _, record = reader.read(selected_queue)
    parsed = tf.parse_single_example(record,
                                     features={
                                         'document':
                                         tf.VarLenFeature(tf.int64),
                                         'query':
                                         tf.VarLenFeature(tf.int64),
                                         'answer':
                                         tf.FixedLenFeature([], tf.int64)
                                     })

    # Serialize the ragged (sparse) features into 1-D string tensors
    # (indices/values/shape triple) so the batching queue can handle them.
    doc_serialized = sparse_ops.serialize_sparse(parsed['document'])
    query_serialized = sparse_ops.serialize_sparse(parsed['query'])

    # Shuffle and batch.
    doc_batch_ser, query_batch_ser, answer_batch = tf.train.shuffle_batch(
        [doc_serialized, query_serialized, parsed['answer']],
        batch_size=FLAGS.batch_size,
        capacity=2000,
        min_after_dequeue=1000)

    sparse_documents = sparse_ops.deserialize_many_sparse(
        doc_batch_ser, dtype=tf.int64)
    sparse_queries = sparse_ops.deserialize_many_sparse(
        query_batch_ser, dtype=tf.int64)

    # Densify: shorter sequences are implicitly zero-padded, giving
    # [batch_size, seq_length] tensors.
    document_batch = tf.sparse_tensor_to_dense(sparse_documents)

    # Weight tensors mark real tokens with 1; padding positions stay 0.
    # tf.sparse_to_dense(sparse_indices, output_shape, sparse_values,
    #                    default_value=0) -> [batch_size, seq_length]
    document_weights = tf.sparse_to_dense(sparse_documents.indices,
                                          sparse_documents.dense_shape, 1)

    query_batch = tf.sparse_tensor_to_dense(sparse_queries)
    query_weights = tf.sparse_to_dense(sparse_queries.indices,
                                       sparse_queries.dense_shape, 1)

    return document_batch, document_weights, query_batch, query_weights, answer_batch
    def testFeedSerializeDeserializeMany(self):
        """Feeds two SparseTensors, serializes each, stacks the serialized
        forms, and checks deserialize_many_sparse recombines them with a
        leading minibatch dimension."""
        with self.test_session(use_gpu=False) as sess:
            first_input = self._SparseTensorPlaceholder()
            second_input = self._SparseTensorPlaceholder()
            first_value = self._SparseTensorValue_5x6(np.arange(6))
            second_value = self._SparseTensorValue_3x4(np.arange(6))

            # Serialize each tensor and stack into a rank-1 batch of
            # serialized representations.
            stacked = array_ops.stack([
                sparse_ops.serialize_sparse(first_input),
                sparse_ops.serialize_sparse(second_input),
            ])
            roundtripped = sparse_ops.deserialize_many_sparse(
                stacked, dtype=dtypes.int32)

            indices, values, shape = sess.run(
                roundtripped,
                {first_input: first_value, second_input: second_value})

            # The first six entries come from minibatch 0, the rest from
            # minibatch 1; column 0 of the indices is the batch index.
            self.assertAllEqual(indices[:6, 0], [0] * 6)
            self.assertAllEqual(indices[:6, 1:], first_value[0])
            self.assertAllEqual(indices[6:, 0], [1] * 6)
            self.assertAllEqual(indices[6:, 1:], second_value[0])
            self.assertAllEqual(values[:6], first_value[1])
            self.assertAllEqual(values[6:], second_value[1])
            # Combined dense shape is [batch, max(5, 3), max(6, 4)].
            self.assertAllEqual(shape, [2, 5, 6])
Пример #4
0
def _deserialize_sparse_tensors(serialized_list, is_sparse_list, sparse_dtypes):
  """Deserialize SparseTensors after dequeue in batch, batch_join, etc.

  Args:
    serialized_list: a tensor or sequence of tensors, some entries of which
      hold serialized SparseTensors as flagged by `is_sparse_list`.
    is_sparse_list: bools parallel to `serialized_list`.
    sparse_dtypes: value dtypes for the sparse entries (ignored for dense).

  Returns:
    The deserialized tensors — a list if a sequence was passed in,
    otherwise the single tensor.
  """
  # FIX: collections.Sequence was removed in Python 3.10; the ABC lives in
  # collections.abc. Local import guarantees the submodule is loaded.
  import collections.abc
  received_sequence = isinstance(serialized_list, collections.abc.Sequence)
  if not received_sequence:
    serialized_list = (serialized_list,)
  tensors = [sparse_ops.deserialize_many_sparse(s, sparse_dtype) if is_sparse
             else s
             for (s, is_sparse, sparse_dtype)
             in zip(serialized_list, is_sparse_list, sparse_dtypes)]
  return tensors if received_sequence else tensors[0]
Пример #5
0
def _deserialize_sparse_tensors(serialized_list, sparse_info_list):
    """Deserialize SparseTensors after dequeue in batch, batch_join, etc.

    Args:
        serialized_list: a tensor or sequence of tensors, some entries of
            which hold serialized SparseTensors.
        sparse_info_list: parallel info objects exposing `.sparse`, `.dtype`
            and `.rank` (rank is a Dimension; +1 accounts for the added
            minibatch dimension).

    Returns:
        A list when a sequence was passed in, otherwise the single tensor.
    """
    # FIX: collections.Sequence was removed in Python 3.10; the ABC lives
    # in collections.abc. Local import guarantees the submodule is loaded.
    import collections.abc
    received_sequence = isinstance(serialized_list, collections.abc.Sequence)
    if not received_sequence:
        serialized_list = (serialized_list,)
    tensors = [
        sparse_ops.deserialize_many_sparse(s, info.dtype, (info.rank + 1).value) if info.sparse else s
        for (s, info) in zip(serialized_list, sparse_info_list)
    ]
    return tensors if received_sequence else tensors[0]
Пример #6
0
def _deserialize_sparse_tensors(serialized_list, sparse_info_list):
  """Deserialize SparseTensors after dequeue in batch, batch_join, etc.

  Args:
    serialized_list: a tensor or sequence of tensors, some entries of which
      hold serialized SparseTensors.
    sparse_info_list: parallel info objects exposing `.sparse`, `.dtype`
      and `.rank` (rank is a Dimension; +1 accounts for the minibatch dim).

  Returns:
    A list when a sequence was passed in, otherwise the single tensor.
  """
  # FIX: collections.Sequence was removed in Python 3.10; the ABC lives in
  # collections.abc. Local import guarantees the submodule is loaded.
  import collections.abc
  received_sequence = isinstance(serialized_list, collections.abc.Sequence)
  if not received_sequence:
    serialized_list = (serialized_list,)
  tensors = [
      sparse_ops.deserialize_many_sparse(s, info.dtype, (info.rank + 1).value)
      if info.sparse else s
      for (s, info)
      in zip(serialized_list, sparse_info_list)]
  return tensors if received_sequence else tensors[0]
Пример #7
0
def _deserialize_sparse_tensors(serialized_list, is_sparse_list,
                                sparse_dtypes):
    """Deserialize SparseTensors after dequeue in batch, batch_join, etc.

    Args:
        serialized_list: a tensor or sequence of tensors, some entries of
            which hold serialized SparseTensors as flagged by
            `is_sparse_list`.
        is_sparse_list: bools parallel to `serialized_list`.
        sparse_dtypes: value dtypes for the sparse entries (ignored for
            dense entries).

    Returns:
        A list when a sequence was passed in, otherwise the single tensor.
    """
    # FIX: collections.Sequence was removed in Python 3.10; the ABC lives
    # in collections.abc. Local import guarantees the submodule is loaded.
    import collections.abc
    received_sequence = isinstance(serialized_list, collections.abc.Sequence)
    if not received_sequence:
        serialized_list = (serialized_list, )
    tensors = [
        sparse_ops.deserialize_many_sparse(s, sparse_dtype) if is_sparse else s
        for (s, is_sparse, sparse_dtype
             ) in zip(serialized_list, is_sparse_list, sparse_dtypes)
    ]
    return tensors if received_sequence else tensors[0]
    def testDeserializeFailsInvalidProto(self):
        """deserialize_many_sparse must reject batch entries that are not
        valid serialized SparseTensor protos."""
        with self.test_session(use_gpu=False) as sess:
            valid_input = self._SparseTensorPlaceholder()
            valid_value = self._SparseTensorValue_5x6(np.arange(6))
            # Second batch entry is garbage instead of a serialized proto.
            bogus_entry = ["a", "b", "c"]
            batch = array_ops.stack(
                [sparse_ops.serialize_sparse(valid_input), bogus_entry])

            deserialized = sparse_ops.deserialize_many_sparse(
                batch, dtype=dtypes.int32)

            # Parsing fails at batch entry 1, component 0.
            with self.assertRaisesOpError(
                    r"Could not parse serialized_sparse\[1, 0\]"):
                sess.run(deserialized, {valid_input: valid_value})
Пример #9
0
    def benchmarkVeryLarge2DFloatSparseTensor(self):
        """Benchmarks the tensors-map round trip against string
        serialize/deserialize_many for a large 2-D string SparseTensor."""
        np.random.seed(127)
        num_elements = 10000
        batch_size = 64
        # Random batch row per element; sorted below into the row-major
        # order SparseTensor indices require.
        indices_batch = np.random.randint(batch_size,
                                          size=num_elements,
                                          dtype=np.int64)
        indices_value = np.arange(num_elements, dtype=np.int64)
        indices = np.asarray(sorted(zip(indices_batch, indices_value)),
                             dtype=np.int64)
        values = ["feature_value_for_embedding_lookup"] * num_elements
        shape = np.asarray([batch_size, num_elements], dtype=np.int64)
        with session.Session(config=benchmark.benchmark_config()) as sess:
            with ops.device("/cpu:0"):
                # Variables keep the components resident so the benchmarked
                # ops do not re-feed the data each iteration.
                indices = variables.Variable(indices)
                values = variables.Variable(values)
                shape = variables.Variable(shape)
                st = sparse_tensor_lib.SparseTensor(indices, values, shape)

                # Path 1: handle-based round trip through the tensors map.
                st_handles = add_many_sparse_to_tensors_map(st)
                st_roundtrip = take_many_sparse_from_tensors_map(
                    sparse_map_op=st_handles.op, sparse_handles=st_handles)
                st_roundtrip_op = st_roundtrip.values.op

                # Path 2: round trip through string serialization.
                st_serialized = sparse_ops.serialize_many_sparse(st)
                st_deserialized = sparse_ops.deserialize_many_sparse(
                    st_serialized, dtype=values.dtype)
                st_deserialized_op = st_deserialized.values.op

                variables.global_variables_initializer().run()

                # Sanity check: both paths reconstruct the same tensor.
                st_roundtrip_values = self.evaluate(st_roundtrip)
                st_deserialized_values = self.evaluate(st_deserialized)
                np.testing.assert_equal(st_roundtrip_values.values,
                                        st_deserialized_values.values)
                np.testing.assert_equal(st_roundtrip_values.indices,
                                        st_deserialized_values.indices)
                np.testing.assert_equal(st_roundtrip_values.dense_shape,
                                        st_deserialized_values.dense_shape)

                self.run_op_benchmark(
                    sess,
                    st_roundtrip_op,
                    min_iters=2000,
                    name="benchmark_very_large_2d_float_st_tensor_maps")
                self.run_op_benchmark(
                    sess,
                    st_deserialized_op,
                    min_iters=2000,
                    name="benchmark_very_large_2d_float_st_serialization")
  def testDeserializeFailsInvalidProto(self):
    """deserialize_many_sparse raises when a batch entry is not a valid
    serialized SparseTensor proto."""
    with self.test_session(use_gpu=False) as sess:
      sp_input0 = self._SparseTensorPlaceholder()
      input0_val = self._SparseTensorValue_5x6(np.arange(6))
      serialized0 = sparse_ops.serialize_sparse(sp_input0)
      # Not a serialized proto: arbitrary strings for the three components.
      serialized1 = ["a", "b", "c"]
      serialized_concat = array_ops.stack([serialized0, serialized1])

      sp_deserialized = sparse_ops.deserialize_many_sparse(
          serialized_concat, dtype=dtypes.int32)

      # Parsing fails at batch entry 1, component 0.
      with self.assertRaisesOpError(
          r"Could not parse serialized_sparse\[1, 0\]"):
        sess.run(sp_deserialized, {sp_input0: input0_val})
  def benchmarkVeryLarge2DFloatSparseTensor(self):
    """Benchmarks the tensors-map round trip against string
    serialize/deserialize_many for a large 2-D string SparseTensor."""
    np.random.seed(127)
    num_elements = 10000
    batch_size = 64
    # Random batch row per element; sorted below into the row-major order
    # SparseTensor indices require.
    indices_batch = np.random.randint(
        batch_size, size=num_elements, dtype=np.int64)
    indices_value = np.arange(num_elements, dtype=np.int64)
    indices = np.asarray(
        sorted(zip(indices_batch, indices_value)), dtype=np.int64)
    values = ["feature_value_for_embedding_lookup"] * num_elements
    shape = np.asarray([batch_size, num_elements], dtype=np.int64)
    with session.Session(config=benchmark.benchmark_config()) as sess:
      with ops.device("/cpu:0"):
        # Variables keep the components resident so the benchmarked ops do
        # not re-feed the data each iteration.
        indices = variables.Variable(indices)
        values = variables.Variable(values)
        shape = variables.Variable(shape)
        st = sparse_tensor_lib.SparseTensor(indices, values, shape)

        # Path 1: handle-based round trip through the tensors map.
        st_handles = add_many_sparse_to_tensors_map(st)
        st_roundtrip = take_many_sparse_from_tensors_map(
            sparse_map_op=st_handles.op, sparse_handles=st_handles)
        st_roundtrip_op = st_roundtrip.values.op

        # Path 2: round trip through string serialization.
        st_serialized = sparse_ops.serialize_many_sparse(st)
        st_deserialized = sparse_ops.deserialize_many_sparse(
            st_serialized, dtype=values.dtype)
        st_deserialized_op = st_deserialized.values.op

        variables.global_variables_initializer().run()

        # Sanity check: both paths reconstruct the same tensor.
        st_roundtrip_values = sess.run(st_roundtrip)
        st_deserialized_values = sess.run(st_deserialized)
        np.testing.assert_equal(st_roundtrip_values.values,
                                st_deserialized_values.values)
        np.testing.assert_equal(st_roundtrip_values.indices,
                                st_deserialized_values.indices)
        np.testing.assert_equal(st_roundtrip_values.dense_shape,
                                st_deserialized_values.dense_shape)

        self.run_op_benchmark(
            sess,
            st_roundtrip_op,
            min_iters=2000,
            name="benchmark_very_large_2d_float_st_tensor_maps")
        self.run_op_benchmark(
            sess,
            st_deserialized_op,
            min_iters=2000,
            name="benchmark_very_large_2d_float_st_serialization")
  def testDeserializeFailsInconsistentRank(self):
    """A rank-2 and a rank-3 SparseTensor cannot share one deserialized
    minibatch: deserialize_many_sparse must raise."""
    with self.test_session(use_gpu=False) as sess:
      rank2_input = self._SparseTensorPlaceholder()
      rank3_input = self._SparseTensorPlaceholder()
      rank2_value = self._SparseTensorValue_5x6(np.arange(6))
      rank3_value = self._SparseTensorValue_1x1x1()
      batch = array_ops.stack([
          sparse_ops.serialize_sparse(rank2_input),
          sparse_ops.serialize_sparse(rank3_input),
      ])

      deserialized = sparse_ops.deserialize_many_sparse(
          batch, dtype=dtypes.int32)

      # Ranks reported by the op include the added minibatch dimension.
      with self.assertRaisesOpError(
          r"Inconsistent rank across SparseTensors: rank prior to "
          r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
        sess.run(deserialized,
                 {rank2_input: rank2_value, rank3_input: rank3_value})
  def testDeserializeFailsWrongType(self):
    """Requesting the wrong value dtype from deserialize_many_sparse
    raises an op error."""
    with self.test_session(use_gpu=False) as sess:
      input_a = self._SparseTensorPlaceholder()
      input_b = self._SparseTensorPlaceholder()
      value_a = self._SparseTensorValue_5x6(np.arange(6))
      value_b = self._SparseTensorValue_3x4(np.arange(6))
      batch = array_ops.stack([
          sparse_ops.serialize_sparse(input_a),
          sparse_ops.serialize_sparse(input_b),
      ])

      # int64 deliberately mismatches the fed int32 values.
      mistyped = sparse_ops.deserialize_many_sparse(
          batch, dtype=dtypes.int64)

      with self.assertRaisesOpError(
          r"Requested SparseTensor of type int64 but "
          r"SparseTensor\[0\].values.dtype\(\) == int32"):
        sess.run(mistyped,
                 {input_a: value_a, input_b: value_b})
Пример #14
0
  def testDeserializeFailsInconsistentRank(self):
    """Mixing a rank-2 and a rank-3 SparseTensor in one deserialized
    minibatch must raise an op error."""
    with self.test_session(use_gpu=False) as sess:
      sp_input0 = self._SparseTensorPlaceholder()
      sp_input1 = self._SparseTensorPlaceholder()
      input0_val = self._SparseTensorValue_5x6(np.arange(6))  # rank 2
      input1_val = self._SparseTensorValue_1x1x1()  # rank 3
      serialized0 = sparse_ops.serialize_sparse(sp_input0)
      serialized1 = sparse_ops.serialize_sparse(sp_input1)
      serialized_concat = array_ops.stack([serialized0, serialized1])

      sp_deserialized = sparse_ops.deserialize_many_sparse(
          serialized_concat, dtype=dtypes.int32)

      # Ranks reported by the op include the added minibatch dimension.
      with self.assertRaisesOpError(
          r"Inconsistent rank across SparseTensors: rank prior to "
          r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
        sess.run(sp_deserialized,
                 {sp_input0: input0_val,
                  sp_input1: input1_val})
Пример #15
0
  def testDeserializeFailsWrongType(self):
    """Requesting int64 values when the serialized tensors hold int32
    must raise an op error."""
    with self.test_session(use_gpu=False) as sess:
      sp_input0 = self._SparseTensorPlaceholder()
      sp_input1 = self._SparseTensorPlaceholder()
      input0_val = self._SparseTensorValue_5x6(np.arange(6))
      input1_val = self._SparseTensorValue_3x4(np.arange(6))
      serialized0 = sparse_ops.serialize_sparse(sp_input0)
      serialized1 = sparse_ops.serialize_sparse(sp_input1)
      serialized_concat = array_ops.stack([serialized0, serialized1])

      # dtype=int64 deliberately mismatches the fed int32 values.
      sp_deserialized = sparse_ops.deserialize_many_sparse(
          serialized_concat, dtype=dtypes.int64)

      with self.assertRaisesOpError(
          r"Requested SparseTensor of type int64 but "
          r"SparseTensor\[0\].values.dtype\(\) == int32"):
        sess.run(sp_deserialized,
                 {sp_input0: input0_val,
                  sp_input1: input1_val})
 def testSerializeManyDeserializeManyRoundTrip(self):
     """serialize_many_sparse -> deserialize_many_sparse is the identity
     on a fed 4x5 string SparseTensor."""
     with self.test_session(use_gpu=False) as sess:
         # N == 4 because shape_value == [4, 5]
         indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
         values_value = np.array([b"a", b"b", b"c"])
         shape_value = np.array([4, 5], dtype=np.int64)
         sparse_tensor = self._SparseTensorPlaceholder(dtype=dtypes.string)
         serialized = sparse_ops.serialize_many_sparse(sparse_tensor)
         deserialized = sparse_ops.deserialize_many_sparse(
             serialized, dtype=dtypes.string)
         serialized_value, deserialized_value = sess.run(
             [serialized, deserialized],
             feed_dict={
                 sparse_tensor.indices: indices_value,
                 sparse_tensor.values: values_value,
                 sparse_tensor.dense_shape: shape_value
             })
         # One serialized row (3 string components) per batch entry.
         self.assertEqual(serialized_value.shape, (4, 3))
         # The round trip reproduces every component exactly.
         self.assertAllEqual(deserialized_value.indices, indices_value)
         self.assertAllEqual(deserialized_value.values, values_value)
         self.assertAllEqual(deserialized_value.dense_shape, shape_value)
  def testSerializeDeserializeMany(self):
    """Two concrete SparseTensor values serialized, stacked and
    deserialized as one minibatch keep their indices and values and get a
    shared dense shape."""
    with self.test_session(use_gpu=False) as sess:
      value0 = self._SparseTensorValue_5x6(np.arange(6))
      value1 = self._SparseTensorValue_3x4(np.arange(6))
      stacked = array_ops.stack([
          sparse_ops.serialize_sparse(value0),
          sparse_ops.serialize_sparse(value1),
      ])

      combined = sparse_ops.deserialize_many_sparse(
          stacked, dtype=dtypes.int32)

      indices, values, shape = sess.run(combined)

      # Column 0 of the indices is the minibatch index.
      self.assertAllEqual(indices[:6, 0], [0] * 6)
      self.assertAllEqual(indices[:6, 1:], value0[0])
      self.assertAllEqual(indices[6:, 0], [1] * 6)
      self.assertAllEqual(indices[6:, 1:], value1[0])
      self.assertAllEqual(values[:6], value0[1])
      self.assertAllEqual(values[6:], value1[1])
      # Shared shape: [batch=2, max(5, 3), max(6, 4)].
      self.assertAllEqual(shape, [2, 5, 6])
 def testSerializeManyDeserializeManyRoundTrip(self):
   """serialize_many_sparse followed by deserialize_many_sparse returns
   the original fed 4x5 string SparseTensor unchanged."""
   with self.test_session(use_gpu=False) as sess:
     # N == 4 because shape_value == [4, 5]
     indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
     values_value = np.array([b"a", b"b", b"c"])
     shape_value = np.array([4, 5], dtype=np.int64)
     sparse_tensor = self._SparseTensorPlaceholder(dtype=dtypes.string)
     serialized = sparse_ops.serialize_many_sparse(sparse_tensor)
     deserialized = sparse_ops.deserialize_many_sparse(
         serialized, dtype=dtypes.string)
     serialized_value, deserialized_value = sess.run(
         [serialized, deserialized],
         feed_dict={
             sparse_tensor.indices: indices_value,
             sparse_tensor.values: values_value,
             sparse_tensor.dense_shape: shape_value
         })
     # One serialized row (3 string components) per batch entry.
     self.assertEqual(serialized_value.shape, (4, 3))
     # The round trip reproduces every component exactly.
     self.assertAllEqual(deserialized_value.indices, indices_value)
     self.assertAllEqual(deserialized_value.values, values_value)
     self.assertAllEqual(deserialized_value.dense_shape, shape_value)