def read_records(index=0):
    train_queue = tf.train.string_input_producer(['training.tfrecords'], num_epochs=FLAGS.epochs)
    validation_queue = tf.train.string_input_producer(['validation.tfrecords'], num_epochs=FLAGS.epochs)
    test_queue = tf.train.string_input_producer(['test.tfrecords'], num_epochs=FLAGS.epochs)
    queue = tf.QueueBase.from_list(index, [train_queue, validation_queue, test_queue])

    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(queue)

    features = tf.parse_single_example(
        serialized_example,
        features={
            'document': tf.VarLenFeature(tf.int64),
            'query': tf.VarLenFeature(tf.int64),
            'answer': tf.FixedLenFeature([], tf.int64)
        })

    document = sparse_ops.serialize_sparse(features['document'])
    query = sparse_ops.serialize_sparse(features['query'])
    answer = features['answer']

    document_batch_serialized, query_batch_serialized, answer_batch = tf.train.shuffle_batch(
        [document, query, answer],
        batch_size=FLAGS.batch_size,
        capacity=2000,
        min_after_dequeue=1000)

    sparse_document_batch = sparse_ops.deserialize_many_sparse(document_batch_serialized, dtype=tf.int64)
    sparse_query_batch = sparse_ops.deserialize_many_sparse(query_batch_serialized, dtype=tf.int64)

    document_batch = tf.sparse_tensor_to_dense(sparse_document_batch)
    document_weights = tf.sparse_to_dense(sparse_document_batch.indices,
                                          tf.cast(tf.shape(sparse_document_batch), tf.int64), 1)

    query_batch = tf.sparse_tensor_to_dense(sparse_query_batch)
    query_weights = tf.sparse_to_dense(sparse_query_batch.indices,
                                       tf.cast(tf.shape(sparse_query_batch), tf.int64), 1)

    return document_batch, document_weights, query_batch, query_weights, answer_batch
def read_records(index=0):
    # Build the input queues for reading the data; num_epochs must be specified.
    train_queue = tf.train.string_input_producer(['training.tfrecords'], num_epochs=FLAGS.epochs)
    validation_queue = tf.train.string_input_producer(['validation.tfrecords'], num_epochs=FLAGS.epochs)
    test_queue = tf.train.string_input_producer(['test.tfrecords'], num_epochs=FLAGS.epochs)
    queue = tf.QueueBase.from_list(index, [train_queue, validation_queue, test_queue])

    # Define a TFRecordReader object for reading the data.
    reader = tf.TFRecordReader()
    # Read one record from the selected queue into serialized_example.
    _, serialized_example = reader.read(queue)

    # Call parse_single_example to parse the record.
    features = tf.parse_single_example(
        serialized_example,
        features={
            'document': tf.VarLenFeature(tf.int64),
            'query': tf.VarLenFeature(tf.int64),
            'answer': tf.FixedLenFeature([], tf.int64)
        })

    # serialize_sparse returns the (indices, values, shape) triple as a 1-D tensor.
    document = sparse_ops.serialize_sparse(features['document'])
    query = sparse_ops.serialize_sparse(features['query'])
    answer = features['answer']

    # Batch the serialized examples.
    document_batch_serialized, query_batch_serialized, answer_batch = tf.train.shuffle_batch(
        [document, query, answer],
        batch_size=FLAGS.batch_size,
        capacity=2000,
        min_after_dequeue=1000)

    sparse_document_batch = sparse_ops.deserialize_many_sparse(
        document_batch_serialized, dtype=tf.int64)
    sparse_query_batch = sparse_ops.deserialize_many_sparse(
        query_batch_serialized, dtype=tf.int64)

    # Converting to dense effectively zero-pads sentences of different lengths.
    # tf.sparse_tensor_to_dense returns a [batch_size, seq_length] tensor.
    document_batch = tf.sparse_tensor_to_dense(sparse_document_batch)

    # Build the weights: positions holding a word in each sentence of the batch get 1,
    # all other (padding) positions get 0.
    # tf.sparse_to_dense(sparse_indices, output_shape, sparse_values (specified elements,
    #     defaults to 1), default_value (remaining elements, defaults to 0))
    # returns a [batch_size, seq_length] tensor.
    document_weights = tf.sparse_to_dense(sparse_document_batch.indices,
                                          sparse_document_batch.dense_shape, 1)

    query_batch = tf.sparse_tensor_to_dense(sparse_query_batch)
    query_weights = tf.sparse_to_dense(sparse_query_batch.indices,
                                       sparse_query_batch.dense_shape, 1)

    return document_batch, document_weights, query_batch, query_weights, answer_batch
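# A minimal consumption sketch (an assumption, not part of the original source): with the
# TF 1.x queue-based readers used above, the batch tensors returned by read_records can only
# be evaluated after initializing local variables (num_epochs is stored as a local variable)
# and starting the queue runners. The helper name run_training is hypothetical.
import tensorflow as tf

def run_training():
    document_batch, document_weights, query_batch, query_weights, answer_batch = read_records(index=0)
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            while not coord.should_stop():
                # Each run yields one shuffled, zero-padded batch.
                docs, weights, answers = sess.run([document_batch, document_weights, answer_batch])
        except tf.errors.OutOfRangeError:
            pass  # FLAGS.epochs exhausted
        finally:
            coord.request_stop()
            coord.join(threads)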
def testFeedSerializeDeserializeMany(self):
  with self.test_session(use_gpu=False) as sess:
    sp_input0 = self._SparseTensorPlaceholder()
    sp_input1 = self._SparseTensorPlaceholder()
    input0_val = self._SparseTensorValue_5x6(np.arange(6))
    input1_val = self._SparseTensorValue_3x4(np.arange(6))
    serialized0 = sparse_ops.serialize_sparse(sp_input0)
    serialized1 = sparse_ops.serialize_sparse(sp_input1)
    serialized_concat = array_ops.stack([serialized0, serialized1])

    sp_deserialized = sparse_ops.deserialize_many_sparse(
        serialized_concat, dtype=dtypes.int32)

    combined_indices, combined_values, combined_shape = sess.run(
        sp_deserialized, {sp_input0: input0_val,
                          sp_input1: input1_val})

    self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
    self.assertAllEqual(combined_indices[:6, 1:], input0_val[0])
    self.assertAllEqual(combined_indices[6:, 0], [1] * 6)  # minibatch 1
    self.assertAllEqual(combined_indices[6:, 1:], input1_val[0])
    self.assertAllEqual(combined_values[:6], input0_val[1])
    self.assertAllEqual(combined_values[6:], input1_val[1])
    self.assertAllEqual(combined_shape, [2, 5, 6])
def _deserialize_sparse_tensors(serialized_list, is_sparse_list, sparse_dtypes):
  """Deserialize SparseTensors after dequeue in batch, batch_join, etc."""
  received_sequence = isinstance(serialized_list, collections.Sequence)
  if not received_sequence:
    serialized_list = (serialized_list,)
  tensors = [sparse_ops.deserialize_many_sparse(s, sparse_dtype)
             if is_sparse else s
             for (s, is_sparse, sparse_dtype)
             in zip(serialized_list, is_sparse_list, sparse_dtypes)]
  return tensors if received_sequence else tensors[0]
def _deserialize_sparse_tensors(serialized_list, sparse_info_list):
  """Deserialize SparseTensors after dequeue in batch, batch_join, etc."""
  received_sequence = isinstance(serialized_list, collections.Sequence)
  if not received_sequence:
    serialized_list = (serialized_list,)
  tensors = [
      sparse_ops.deserialize_many_sparse(s, info.dtype, (info.rank + 1).value)
      if info.sparse else s
      for (s, info) in zip(serialized_list, sparse_info_list)
  ]
  return tensors if received_sequence else tensors[0]
def testDeserializeFailsInvalidProto(self):
  with self.test_session(use_gpu=False) as sess:
    sp_input0 = self._SparseTensorPlaceholder()
    input0_val = self._SparseTensorValue_5x6(np.arange(6))
    serialized0 = sparse_ops.serialize_sparse(sp_input0)
    serialized1 = ["a", "b", "c"]
    serialized_concat = array_ops.stack([serialized0, serialized1])

    sp_deserialized = sparse_ops.deserialize_many_sparse(
        serialized_concat, dtype=dtypes.int32)

    with self.assertRaisesOpError(
        r"Could not parse serialized_sparse\[1, 0\]"):
      sess.run(sp_deserialized, {sp_input0: input0_val})
def benchmarkVeryLarge2DFloatSparseTensor(self):
  np.random.seed(127)
  num_elements = 10000
  batch_size = 64
  indices_batch = np.random.randint(
      batch_size, size=num_elements, dtype=np.int64)
  indices_value = np.arange(num_elements, dtype=np.int64)
  indices = np.asarray(
      sorted(zip(indices_batch, indices_value)), dtype=np.int64)
  values = ["feature_value_for_embedding_lookup"] * num_elements
  shape = np.asarray([batch_size, num_elements], dtype=np.int64)

  with session.Session(config=benchmark.benchmark_config()) as sess:
    with ops.device("/cpu:0"):
      indices = variables.Variable(indices)
      values = variables.Variable(values)
      shape = variables.Variable(shape)
      st = sparse_tensor_lib.SparseTensor(indices, values, shape)

      st_handles = add_many_sparse_to_tensors_map(st)
      st_roundtrip = take_many_sparse_from_tensors_map(
          sparse_map_op=st_handles.op, sparse_handles=st_handles)
      st_roundtrip_op = st_roundtrip.values.op

      st_serialized = sparse_ops.serialize_many_sparse(st)
      st_deserialized = sparse_ops.deserialize_many_sparse(
          st_serialized, dtype=values.dtype)
      st_deserialized_op = st_deserialized.values.op

      variables.global_variables_initializer().run()

      st_roundtrip_values = self.evaluate(st_roundtrip)
      st_deserialized_values = self.evaluate(st_deserialized)

      np.testing.assert_equal(st_roundtrip_values.values,
                              st_deserialized_values.values)
      np.testing.assert_equal(st_roundtrip_values.indices,
                              st_deserialized_values.indices)
      np.testing.assert_equal(st_roundtrip_values.dense_shape,
                              st_deserialized_values.dense_shape)

      self.run_op_benchmark(
          sess,
          st_roundtrip_op,
          min_iters=2000,
          name="benchmark_very_large_2d_float_st_tensor_maps")
      self.run_op_benchmark(
          sess,
          st_deserialized_op,
          min_iters=2000,
          name="benchmark_very_large_2d_float_st_serialization")
def testDeserializeFailsInconsistentRank(self):
  with self.test_session(use_gpu=False) as sess:
    sp_input0 = self._SparseTensorPlaceholder()
    sp_input1 = self._SparseTensorPlaceholder()
    input0_val = self._SparseTensorValue_5x6(np.arange(6))
    input1_val = self._SparseTensorValue_1x1x1()
    serialized0 = sparse_ops.serialize_sparse(sp_input0)
    serialized1 = sparse_ops.serialize_sparse(sp_input1)
    serialized_concat = array_ops.stack([serialized0, serialized1])

    sp_deserialized = sparse_ops.deserialize_many_sparse(
        serialized_concat, dtype=dtypes.int32)

    with self.assertRaisesOpError(
        r"Inconsistent rank across SparseTensors: rank prior to "
        r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
      sess.run(sp_deserialized,
               {sp_input0: input0_val,
                sp_input1: input1_val})
def testDeserializeFailsWrongType(self):
  with self.test_session(use_gpu=False) as sess:
    sp_input0 = self._SparseTensorPlaceholder()
    sp_input1 = self._SparseTensorPlaceholder()
    input0_val = self._SparseTensorValue_5x6(np.arange(6))
    input1_val = self._SparseTensorValue_3x4(np.arange(6))
    serialized0 = sparse_ops.serialize_sparse(sp_input0)
    serialized1 = sparse_ops.serialize_sparse(sp_input1)
    serialized_concat = array_ops.stack([serialized0, serialized1])

    sp_deserialized = sparse_ops.deserialize_many_sparse(
        serialized_concat, dtype=dtypes.int64)

    with self.assertRaisesOpError(
        r"Requested SparseTensor of type int64 but "
        r"SparseTensor\[0\].values.dtype\(\) == int32"):
      sess.run(sp_deserialized,
               {sp_input0: input0_val,
                sp_input1: input1_val})
def testSerializeManyDeserializeManyRoundTrip(self):
  with self.test_session(use_gpu=False) as sess:
    # N == 4 because shape_value == [4, 5]
    indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
    values_value = np.array([b"a", b"b", b"c"])
    shape_value = np.array([4, 5], dtype=np.int64)
    sparse_tensor = self._SparseTensorPlaceholder(dtype=dtypes.string)
    serialized = sparse_ops.serialize_many_sparse(sparse_tensor)
    deserialized = sparse_ops.deserialize_many_sparse(
        serialized, dtype=dtypes.string)
    serialized_value, deserialized_value = sess.run(
        [serialized, deserialized],
        feed_dict={
            sparse_tensor.indices: indices_value,
            sparse_tensor.values: values_value,
            sparse_tensor.dense_shape: shape_value
        })
    self.assertEqual(serialized_value.shape, (4, 3))
    self.assertAllEqual(deserialized_value.indices, indices_value)
    self.assertAllEqual(deserialized_value.values, values_value)
    self.assertAllEqual(deserialized_value.dense_shape, shape_value)
def testSerializeDeserializeMany(self):
  with self.test_session(use_gpu=False) as sess:
    sp_input0 = self._SparseTensorValue_5x6(np.arange(6))
    sp_input1 = self._SparseTensorValue_3x4(np.arange(6))
    serialized0 = sparse_ops.serialize_sparse(sp_input0)
    serialized1 = sparse_ops.serialize_sparse(sp_input1)
    serialized_concat = array_ops.stack([serialized0, serialized1])

    sp_deserialized = sparse_ops.deserialize_many_sparse(
        serialized_concat, dtype=dtypes.int32)

    combined_indices, combined_values, combined_shape = sess.run(sp_deserialized)

    self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
    self.assertAllEqual(combined_indices[:6, 1:], sp_input0[0])
    self.assertAllEqual(combined_indices[6:, 0], [1] * 6)  # minibatch 1
    self.assertAllEqual(combined_indices[6:, 1:], sp_input1[0])
    self.assertAllEqual(combined_values[:6], sp_input0[1])
    self.assertAllEqual(combined_values[6:], sp_input1[1])
    self.assertAllEqual(combined_shape, [2, 5, 6])
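# For reference, a minimal sketch of the same round trip against the TF 2.x API surface
# (an assumption: TF >= 2.0, where these ops live under tf.io and can run eagerly).
# As in the tests above, deserialize_many_sparse prepends a batch dimension and pads the
# combined dense_shape to the per-component maximum.
import tensorflow as tf

sp0 = tf.sparse.SparseTensor(indices=[[0, 0], [4, 5]], values=[1, 2], dense_shape=[5, 6])
sp1 = tf.sparse.SparseTensor(indices=[[0, 0], [2, 3]], values=[3, 4], dense_shape=[3, 4])

# Each serialize_sparse yields a 1-D tensor of 3 serialized components; stacking gives [2, 3].
serialized = tf.stack([tf.io.serialize_sparse(sp0), tf.io.serialize_sparse(sp1)])
combined = tf.io.deserialize_many_sparse(serialized, dtype=tf.int32)

print(combined.dense_shape)  # [2, 5, 6]: batch of 2, shapes padded to the maximum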