    def testFeedSerializeDeserializeMany(self):
        with self.test_session(use_gpu=False) as sess:
            sp_input0 = self._SparseTensorPlaceholder()
            sp_input1 = self._SparseTensorPlaceholder()
            input0_val = self._SparseTensorValue_5x6(np.arange(6))
            input1_val = self._SparseTensorValue_3x4(np.arange(6))
            serialized0 = tf.serialize_sparse(sp_input0)
            serialized1 = tf.serialize_sparse(sp_input1)
            serialized_concat = tf.pack([serialized0, serialized1])

            sp_deserialized = tf.deserialize_many_sparse(serialized_concat,
                                                         dtype=tf.int32)

            combined_indices, combined_values, combined_shape = sess.run(
                sp_deserialized, {
                    sp_input0: input0_val,
                    sp_input1: input1_val
                })

            self.assertAllEqual(combined_indices[:6, 0],
                                [0] * 6)  # minibatch 0
            self.assertAllEqual(combined_indices[:6, 1:], input0_val[0])
            self.assertAllEqual(combined_indices[6:, 0],
                                [1] * 6)  # minibatch 1
            self.assertAllEqual(combined_indices[6:, 1:], input1_val[0])
            self.assertAllEqual(combined_values[:6], input0_val[1])
            self.assertAllEqual(combined_values[6:], input1_val[1])
            self.assertAllEqual(combined_shape, [2, 5, 6])
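
For context, a minimal sketch of the round-trip this test exercises, written against the same pre-1.0 API (tf.pack was later renamed tf.stack, and the serialize ops moved under tf.io); the tensors here are illustrative stand-ins, not the test fixtures:

import tensorflow as tf

st0 = tf.SparseTensor([[0, 0], [1, 2]], [1, 2], [5, 6])  # rank 2, shape [5, 6]
st1 = tf.SparseTensor([[0, 1]], [3], [3, 4])             # rank 2, shape [3, 4]
# serialize_sparse yields a [3] string vector per tensor; pack them into an [N, 3] batch.
batched = tf.pack([tf.serialize_sparse(st0), tf.serialize_sparse(st1)])
# All inputs must share a rank; dense shapes are padded to the elementwise max, so the
# result has dense_shape [2, 5, 6], matching the final assertion in the test above.
st = tf.deserialize_many_sparse(batched, dtype=tf.int32)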
Example #2
def threaded_input_pipeline(base_dir, file_patterns,
                            num_threads=4,
                            batch_size=32,
                            batch_device=None,
                            preprocess_device=None,
                            num_epochs=None):

    queue_capacity = num_threads * batch_size * 2
    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = (num_epochs is not None)

    data_queue = _get_data_queue(base_dir, file_patterns, 
                                 capacity=queue_capacity,
                                 num_epochs=num_epochs)

    # each thread has a subgraph with its own reader (sharing filename queue)
    data_tuples = [] # list of subgraph [image, label, width, text] elements
    with tf.device(preprocess_device):
        for _ in range(num_threads):
            image, width, label, length, text, filename = _read_word_record(
                data_queue)
            image = _preprocess_image(image) # move after batch?
            data_tuples.append([image, width, label, length, text, filename])

    with tf.device(batch_device): # Create batch queue
        image, width, label, length, text, filename = tf.train.batch_join(
            data_tuples,
            batch_size=batch_size,
            capacity=queue_capacity,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        label = tf.deserialize_many_sparse(label, tf.int64) # post-batching...
        label = tf.cast(label, tf.int32) # for ctc_loss
    return image, width, label, length, text, filename
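
The label entering tf.train.batch_join above must already be a serialized SparseTensor, since the queue-based batchers only stack dense tensors. A hypothetical sketch of the relevant step inside the unshown _read_word_record helper, assuming the label is stored as a variable-length int64 feature:

def _serialize_label(example_serialized):
    # Hypothetical sketch; the real _read_word_record is not shown here.
    features = tf.parse_single_example(
        example_serialized,
        features={'label': tf.VarLenFeature(tf.int64)})
    # A VarLenFeature parses to a SparseTensor; serialize_sparse flattens it to a
    # rank-1 [3] string tensor that batch_join can stack, which is exactly what
    # tf.deserialize_many_sparse undoes after batching.
    return tf.serialize_sparse(features['label'])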
Example #3
    def make_batch(self, batch_size, single_char=False):
        filenames = self._get_filenames()
        dataset = tf.data.TFRecordDataset(filenames).repeat()
        dataset = dataset.map(
            functools.partial(self._parser,
                              distort=(single_char and
                                       self.subset == 'train')),
            num_parallel_calls=batch_size)

        if self.subset == 'train':
            min_q_exs = 0.4 * self.num_examples
            dataset = dataset.shuffle(
                buffer_size=int(min_q_exs + 3 * batch_size)
            )
        padded_shapes = tuple([self._padding[k] for k in self.feat_keys])
        dataset = dataset.padded_batch(batch_size, padded_shapes=padded_shapes)

        iterator = dataset.make_one_shot_iterator()
        batch = iterator.get_next()

        features = dict(zip(self.feat_keys, batch))
        for key, value in features.items():
            if 'sparse' in key:
                features[key] = tf.deserialize_many_sparse(value,
                                                           dtype=tf.int32)
        labels = features.pop(self._target_id, None)

        return features, labels
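
A hedged usage sketch of make_batch; the class name OCRDataset and its constructor are assumptions, since only the method itself appears above:

dataset = OCRDataset(subset='train')                 # hypothetical constructor
features, labels = dataset.make_batch(batch_size=32)
# Any feature whose key contains 'sparse' is now a live SparseTensor,
# ready for consumers such as tf.nn.ctc_loss.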
Example #4
def threaded_input_pipeline(base_dir, file_patterns,
                            num_threads=4,
                            batch_size=32,
                            batch_device=None,
                            preprocess_device=None,
                            num_epochs=None):

    queue_capacity = num_threads * batch_size * 2
    # Allow a smaller final batch if we are going for a fixed number of epochs
    # (this variant of the function is used for testing)
    final_batch = (num_epochs is not None)

    data_queue = _get_data_queue(base_dir, file_patterns, 
                                 capacity=queue_capacity,
                                 num_epochs=num_epochs)

    # each thread has a subgraph with its own reader (sharing filename queue)
    data_tuples = [] # list of subgraph [image, label, width, text] elements
    with tf.device(preprocess_device):
        for _ in range(num_threads):
            image, width, label, length, text, filename = _read_word_record(
                data_queue)
            image = _preprocess_image(image) # move after batch?
            data_tuples.append([image, width, label, length, text, filename])

    with tf.device(batch_device): # Create batch queue
        image, width, label, length, text, filename = tf.train.batch_join(
            data_tuples,
            batch_size=batch_size,
            capacity=queue_capacity,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        label = tf.deserialize_many_sparse(label, tf.int64) # post-batching...
        label = tf.cast(label, tf.int32) # for ctc_loss
    return image, width, label, length, text, filename
Example #5
def inputs(tfrecords, batch_size, num_epochs, is_sparse_label=True):
    with tf.name_scope('input'):
        # 1. Push the '.tfrecords' files into the filename queue.
        filename_queue = tf.train.string_input_producer([tfrecords],
                                                        num_epochs=num_epochs)

        # Even when reading in multiple threads, share the filename queue.
        image, label = read_and_decode(filename_queue)

        # Shuffle the examples and collect them into batch_size batches.
        # (Internally uses a RandomShuffleQueue.)
        # We run this in two threads to avoid being a bottleneck.
        images_batch, labels_batch_serialized = tf.train.shuffle_batch(
            [image, label],
            batch_size=batch_size,
            num_threads=2,
            capacity=1000 + 3 * batch_size,
            # Ensures a minimum amount of shuffling of examples.
            min_after_dequeue=1000)

        # for variable length labels
        sparse_labels_batch = tf.deserialize_many_sparse(
            labels_batch_serialized, dtype=tf.int64)
        if is_sparse_label:
            labels_batch = sparse_labels_batch
        else:
            labels_batch = tf.sparse_tensor_to_dense(sparse_labels_batch)
    return images_batch, labels_batch
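
Note on the is_sparse_label switch: tf.sparse_tensor_to_dense fills missing entries with its default_value of 0, so each variable-length label row comes back zero-padded to the longest label in the batch, while losses such as CTC consume the sparse form directly.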
Example #6
    def get_inputs(self):
        """Returns tensors for inputs, sequence_lengths and labels."""
        with tf.device("/cpu:0"):
            inputs, sequence_lengths, labels = self.queue.dequeue()
            labels = tf.deserialize_many_sparse(labels, dtype=tf.int32)
        return inputs, sequence_lengths, labels
Example #7
def bucketed_input_pipeline(base_dir,
                            file_patterns,
                            num_threads=4,
                            batch_size=32,
                            boundaries=[32, 64, 96, 128, 160, 192, 224, 256],
                            input_device=None,
                            width_threshold=None,
                            length_threshold=None,
                            num_epochs=None):
    """Get input tensors bucketed by image width
    Returns:
      image : float32 image tensor [batch_size 32 ? 1] padded to batch max width
      width : int32 image widths (for calculating post-CNN sequence length)
      label : Sparse tensor with label sequences for the batch
      length : Length of label sequence (text length)
      text  :  Human readable string for the image
      filename : Source file path
    """
    queue_capacity = num_threads * batch_size * 2
    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = (num_epochs is not None)

    data_queue, number_of_images = _get_data_queue(
        base_dir,
        file_patterns,  # returns a tensor with a list, of length `capacity`, of tfrecord files
        capacity=queue_capacity,
        num_epochs=num_epochs)

    with tf.device(input_device):  # Create bucketing batcher
        image, width, label, length, text, filename = _read_word_record(
            data_queue)  # reads images from the tfrecord files

        image = _preprocess_image(image)  # image normalization

        keep_input = _get_input_filter(
            width,
            width_threshold,  # true or false: keep the image in the sample or not, depending on the image width/length thresholds
            length,
            length_threshold)
        data_tuple = [image, width, label, length, text, filename]
        # bucket_by_sequence_length splits images into batches, clustering them by width.
        # If the batch size is too large for the number of images of a given width in the
        # dataset, images within a batch will be repeated. This follows from the need to
        # pad images to a common width within a batch: when no images of identical width
        # exist in the dataset, a batch is assembled from images whose widths deviate
        # somewhat, and those are then padded.
        # https://blog.altoros.com/the-magic-behind-google-translate-sequence-to-sequence-models-and-tensorflow.html
        # Below, bucket_by_sequence_length has been replaced with tf.train.batch, which
        # does not have these problems.
        data_tuple = tf.train.batch(tensors=data_tuple,
                                    batch_size=batch_size,
                                    num_threads=num_threads,
                                    capacity=queue_capacity,
                                    dynamic_pad=True,
                                    allow_smaller_final_batch=final_batch)
        [image, width, label, length, text, filename] = data_tuple
        label = tf.deserialize_many_sparse(label, tf.int64)  # post-batching...
        label = tf.cast(label, tf.int32)  # for ctc_loss
    return image, width, label, length, text, filename, number_of_images
Example #8
def postbatch_fn(image, width, label, length, text):
    """Post-batching, postprocessing: packs raw tensors into a dictionary for 
       Dataset's iterator output"""

    # Batching is complete, so now we can re-sparsify our labels for ctc_loss
    label = tf.cast(tf.deserialize_many_sparse(label, tf.int64), tf.int32)

    # Format relevant features for estimator ingestion
    features = {"image": image, "width": width, "length": length, "text": text}

    return features, label
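
A hypothetical wiring of postbatch_fn into a tf.data pipeline (the element structure and padded_shapes are assumptions inferred from the signature). The map must come after padded_batch because tf.deserialize_many_sparse expects a rank-2 [N, 3] string input, one row per batch element:

dataset = (dataset
           .padded_batch(batch_size, padded_shapes=padded_shapes)
           .map(postbatch_fn))
features, label = dataset.make_one_shot_iterator().get_next()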
Example #9
def ImageInput(input_pattern, num_threads, shape, using_ctc, reader=None):
  """Creates an input image tensor from the input_pattern filenames.

  TODO(rays) Expand for 2-d labels, 0-d labels, and logistic targets.
  Args:
    input_pattern:  Filenames of the dataset(s) to read.
    num_threads:    Number of preprocessing threads.
    shape:          ImageShape with the desired shape of the input.
    using_ctc:      Take the unpadded_class labels instead of padded.
    reader:         Function that returns an actual reader to read Examples from
      input files. If None, uses tf.TFRecordReader().
  Returns:
    images:   Float Tensor containing the input image scaled to [-1.28, 1.27].
    heights:  Tensor int64 containing the heights of the images.
    widths:   Tensor int64 containing the widths of the images.
    labels:   Serialized SparseTensor containing the int64 labels.
    sparse_labels:   Serialized SparseTensor containing the int64 labels.
    truths:   Tensor string of the utf8 truth texts.
  Raises:
    ValueError: if the optimizer type is unrecognized.
  """
  data_files = tf.gfile.Glob(input_pattern)
  assert data_files, 'no files found for dataset ' + input_pattern
  queue_capacity = shape.batch_size * num_threads * 2
  filename_queue = tf.train.string_input_producer(
      data_files, capacity=queue_capacity)

  # Create a subgraph with its own reader (but sharing the
  # filename_queue) for each preprocessing thread.
  images_and_label_lists = []
  for _ in range(num_threads):
    image, height, width, labels, text = _ReadExamples(filename_queue, shape,
                                                       using_ctc, reader)
    images_and_label_lists.append([image, height, width, labels, text])
  # Create a queue that produces the examples in batches.
  images, heights, widths, labels, truths = tf.train.batch_join(
      images_and_label_lists,
      batch_size=shape.batch_size,
      capacity=16 * shape.batch_size,
      dynamic_pad=True)
  # Deserialize back to sparse, because the batcher doesn't do sparse.
  labels = tf.deserialize_many_sparse(labels, tf.int64)
  sparse_labels = tf.cast(labels, tf.int32)
  labels = tf.sparse_tensor_to_dense(labels)
  labels = tf.reshape(labels, [shape.batch_size, -1], name='Labels')
  # Crush the other shapes to just the batch dimension.
  heights = tf.reshape(heights, [-1], name='Heights')
  widths = tf.reshape(widths, [-1], name='Widths')
  truths = tf.reshape(truths, [-1], name='Truths')
  # Give the images a nice name as well.
  images = tf.identity(images, name='Images')

  tf.image_summary('Images', images)
  return images, heights, widths, labels, sparse_labels, truths
Example #10
def ImageInput(input_pattern, num_threads, shape, using_ctc, reader=None):
    """Creates an input image tensor from the input_pattern filenames.

  TODO(rays) Expand for 2-d labels, 0-d labels, and logistic targets.
  Args:
    input_pattern:  Filenames of the dataset(s) to read.
    num_threads:    Number of preprocessing threads.
    shape:          ImageShape with the desired shape of the input.
    using_ctc:      Take the unpadded_class labels instead of padded.
    reader:         Function that returns an actual reader to read Examples from
      input files. If None, uses tf.TFRecordReader().
  Returns:
    images:   Float Tensor containing the input image scaled to [-1.28, 1.27].
    heights:  Tensor int64 containing the heights of the images.
    widths:   Tensor int64 containing the widths of the images.
    labels:   Serialized SparseTensor containing the int64 labels.
    sparse_labels:   Serialized SparseTensor containing the int64 labels.
    truths:   Tensor string of the utf8 truth texts.
  Raises:
    ValueError: if the optimizer type is unrecognized.
  """
    data_files = tf.gfile.Glob(input_pattern)
    assert data_files, 'no files found for dataset ' + input_pattern
    queue_capacity = shape.batch_size * num_threads * 2
    filename_queue = tf.train.string_input_producer(data_files,
                                                    capacity=queue_capacity)

    # Create a subgraph with its own reader (but sharing the
    # filename_queue) for each preprocessing thread.
    images_and_label_lists = []
    for _ in range(num_threads):
        image, height, width, labels, text = _ReadExamples(
            filename_queue, shape, using_ctc, reader)
        images_and_label_lists.append([image, height, width, labels, text])
    # Create a queue that produces the examples in batches.
    images, heights, widths, labels, truths = tf.train.batch_join(
        images_and_label_lists,
        batch_size=shape.batch_size,
        capacity=16 * shape.batch_size,
        dynamic_pad=True)
    # Deserialize back to sparse, because the batcher doesn't do sparse.
    labels = tf.deserialize_many_sparse(labels, tf.int64)
    sparse_labels = tf.cast(labels, tf.int32)
    labels = tf.sparse_tensor_to_dense(labels)
    labels = tf.reshape(labels, [shape.batch_size, -1], name='Labels')
    # Crush the other shapes to just the batch dimension.
    heights = tf.reshape(heights, [-1], name='Heights')
    widths = tf.reshape(widths, [-1], name='Widths')
    truths = tf.reshape(truths, [-1], name='Truths')
    # Give the images a nice name as well.
    images = tf.identity(images, name='Images')

    tf.summary.image('Images', images)
    return images, heights, widths, labels, sparse_labels, truths
Example #11
    def testDeserializeFailsInvalidProto(self):
        with self.test_session(use_gpu=False) as sess:
            sp_input0 = self._SparseTensorPlaceholder()
            input0_val = self._SparseTensorValue_5x6(np.arange(6))
            serialized0 = tf.serialize_sparse(sp_input0)
            serialized1 = ["a", "b", "c"]
            serialized_concat = tf.pack([serialized0, serialized1])

            sp_deserialized = tf.deserialize_many_sparse(serialized_concat,
                                                         dtype=tf.int32)

            with self.assertRaisesOpError(
                    r"Could not parse serialized_sparse\[1, 0\]"):
                sess.run(sp_deserialized, {sp_input0: input0_val})
Example #12
    def benchmarkVeryLarge2DFloatSparseTensor(self):
        np.random.seed(127)
        num_elements = 10000
        batch_size = 64
        indices_batch = np.random.randint(batch_size,
                                          size=num_elements,
                                          dtype=np.int64)
        indices_value = np.arange(num_elements, dtype=np.int64)
        indices = np.asarray(sorted(zip(indices_batch, indices_value)),
                             dtype=np.int64)
        values = ["feature_value_for_embedding_lookup"] * num_elements
        shape = np.asarray([batch_size, num_elements], dtype=np.int64)
        with tf.Session() as sess:
            with tf.device("/cpu:0"):
                indices = tf.Variable(indices)
                values = tf.Variable(values)
                shape = tf.Variable(shape)
                st = tf.SparseTensor(indices, values, shape)

                st_handles = add_many_sparse_to_tensors_map(st)
                st_roundtrip = take_many_sparse_from_tensors_map(
                    sparse_map_op=st_handles.op, sparse_handles=st_handles)
                st_roundtrip_op = st_roundtrip.values.op

                st_serialized = tf.serialize_many_sparse(st)
                st_deserialized = tf.deserialize_many_sparse(
                    st_serialized, dtype=values.dtype)
                st_deserialized_op = st_deserialized.values.op

                tf.global_variables_initializer().run()

                st_roundtrip_values = sess.run(st_roundtrip)
                st_deserialized_values = sess.run(st_deserialized)
                np.testing.assert_equal(st_roundtrip_values.values,
                                        st_deserialized_values.values)
                np.testing.assert_equal(st_roundtrip_values.indices,
                                        st_deserialized_values.indices)
                np.testing.assert_equal(st_roundtrip_values.shape,
                                        st_deserialized_values.shape)

                self.run_op_benchmark(
                    sess,
                    st_roundtrip_op,
                    min_iters=2000,
                    name="benchmark_very_large_2d_float_st_tensor_maps")
                self.run_op_benchmark(
                    sess,
                    st_deserialized_op,
                    min_iters=2000,
                    name="benchmark_very_large_2d_float_st_serialization")
Example #13
  def testDeserializeFailsInvalidProto(self):
    with self.test_session(use_gpu=False) as sess:
      sp_input0 = self._SparseTensorPlaceholder()
      input0_val = self._SparseTensorValue_5x6(np.arange(6))
      serialized0 = tf.serialize_sparse(sp_input0)
      serialized1 = ["a", "b", "c"]
      serialized_concat = tf.pack([serialized0, serialized1])

      sp_deserialized = tf.deserialize_many_sparse(
          serialized_concat, dtype=tf.int32)

      with self.assertRaisesOpError(
          r"Could not parse serialized_sparse\[1, 0\]"):
        sess.run(sp_deserialized, {sp_input0: input0_val})
Example #14
def bucketed_input_pipeline(base_dir,
                            file_patterns,
                            num_threads=4,
                            batch_size=32,
                            boundaries=[32, 64, 96, 128, 160, 192, 224, 256],
                            input_device=None,
                            width_threshold=None,
                            length_threshold=None,
                            num_epochs=None):
    """Get input tensors bucketed by image width
    Returns:
      image : float32 image tensor [batch_size 32 ? 1] padded to batch max width
      width : int32 image widths (for calculating post-CNN sequence length)
      label : Sparse tensor with label sequences for the batch
      length : Length of label sequence (text length)
      text  :  Human readable string for the image
      filename : Source file path
    """
    queue_capacity = num_threads * batch_size * 2
    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = (num_epochs is not None)

    data_queue = _get_data_queue(base_dir,
                                 file_patterns,
                                 capacity=queue_capacity,
                                 num_epochs=num_epochs)

    with tf.device(input_device):  # Create bucketing batcher
        image, width, label, length, text, filename = _read_word_record(
            data_queue)
        image = _preprocess_image(image)  # move after batch?

        keep_input = _get_input_filter(width, width_threshold, length,
                                       length_threshold)
        data_tuple = [image, label, length, text, filename]
        width, data_tuple = tf.contrib.training.bucket_by_sequence_length(
            input_length=width,
            tensors=data_tuple,
            bucket_boundaries=boundaries,
            batch_size=batch_size,
            capacity=queue_capacity,
            keep_input=keep_input,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        [image, label, length, text, filename] = data_tuple
        label = tf.deserialize_many_sparse(label, tf.int64)  # post-batching...
        label = tf.cast(label, tf.int32)  # for ctc_loss
    return image, width, label, length, text, filename
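
The _get_input_filter helper is not shown; a hypothetical sketch of its contract, producing the scalar boolean keep_input that bucket_by_sequence_length uses to drop over-long examples (a None threshold disables that check):

def _get_input_filter(width, width_threshold, length, length_threshold):
    # Hypothetical sketch of the unshown helper.
    keep = tf.constant(True)
    if width_threshold is not None:
        keep = tf.logical_and(keep, tf.less_equal(width, width_threshold))
    if length_threshold is not None:
        keep = tf.logical_and(keep, tf.less_equal(length, length_threshold))
    return keep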
Example #15
def bucketed_input_pipeline(base_dir, file_patterns,
                            num_threads=4,
                            batch_size=32,
                            boundaries=[32, 64, 96, 128, 160, 192, 224, 256],
                            input_device=None,
                            width_threshold=None,
                            length_threshold=None,
                            num_epochs=None):
    """Get input tensors bucketed by image width
    Returns:
      image : float32 image tensor [batch_size 32 ? 1] padded to batch max width
      width : int32 image widths (for calculating post-CNN sequence length)
      label : Sparse tensor with label sequences for the batch
      length : Length of label sequence (text length)
      text  :  Human readable string for the image
      filename : Source file path
    """
    queue_capacity = num_threads * batch_size * 2
    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = (num_epochs is not None)

    data_queue = _get_data_queue(base_dir, file_patterns, 
                                 capacity=queue_capacity,
                                 num_epochs=num_epochs)

    with tf.device(input_device): # Create bucketing batcher
        image, width, label, length, text, filename = _read_word_record(
            data_queue)
        image = _preprocess_image(image) # move after batch?

        keep_input = _get_input_filter(width, width_threshold,
                                       length, length_threshold)
        data_tuple = [image, label, length, text, filename]
        width, data_tuple = tf.contrib.training.bucket_by_sequence_length(
            input_length=width,
            tensors=data_tuple,
            bucket_boundaries=boundaries,
            batch_size=batch_size,
            capacity=queue_capacity,
            keep_input=keep_input,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        [image, label, length, text, filename] = data_tuple
        label = tf.deserialize_many_sparse(label, tf.int64) # post-batching...
        label = tf.cast(label, tf.int32) # for ctc_loss
    return image, width, label, length, text, filename
Example #16
  def benchmarkVeryLarge2DFloatSparseTensor(self):
    np.random.seed(127)
    num_elements = 10000
    batch_size = 64
    indices_batch = np.random.randint(
        batch_size, size=num_elements, dtype=np.int64)
    indices_value = np.arange(num_elements, dtype=np.int64)
    indices = np.asarray(
        sorted(zip(indices_batch, indices_value)), dtype=np.int64)
    values = ["feature_value_for_embedding_lookup"] * num_elements
    shape = np.asarray([batch_size, num_elements], dtype=np.int64)
    with tf.Session() as sess:
      with tf.device("/cpu:0"):
        indices = tf.Variable(indices)
        values = tf.Variable(values)
        shape = tf.Variable(shape)
        st = tf.SparseTensor(indices, values, shape)

        st_handles = add_many_sparse_to_tensors_map(st)
        st_roundtrip = take_many_sparse_from_tensors_map(
            sparse_map_op=st_handles.op, sparse_handles=st_handles)
        st_roundtrip_op = st_roundtrip.values.op

        st_serialized = tf.serialize_many_sparse(st)
        st_deserialized = tf.deserialize_many_sparse(
            st_serialized, dtype=values.dtype)
        st_deserialized_op = st_deserialized.values.op

        tf.initialize_all_variables().run()

        st_roundtrip_values = sess.run(st_roundtrip)
        st_deserialized_values = sess.run(st_deserialized)
        np.testing.assert_equal(
            st_roundtrip_values.values, st_deserialized_values.values)
        np.testing.assert_equal(
            st_roundtrip_values.indices, st_deserialized_values.indices)
        np.testing.assert_equal(
            st_roundtrip_values.shape, st_deserialized_values.shape)

        self.run_op_benchmark(
            sess, st_roundtrip_op, min_iters=2000,
            name="benchmark_very_large_2d_float_st_tensor_maps")
        self.run_op_benchmark(
            sess, st_deserialized_op, min_iters=2000,
            name="benchmark_very_large_2d_float_st_serialization")
Example #17
    def testDeserializeFailsInconsistentRank(self):
        with self.test_session(use_gpu=False) as sess:
            sp_input0 = self._SparseTensorPlaceholder()
            sp_input1 = self._SparseTensorPlaceholder()
            input0_val = self._SparseTensorValue_5x6(np.arange(6))
            input1_val = self._SparseTensorValue_1x1x1()
            serialized0 = tf.serialize_sparse(sp_input0)
            serialized1 = tf.serialize_sparse(sp_input1)
            serialized_concat = tf.pack([serialized0, serialized1])

            sp_deserialized = tf.deserialize_many_sparse(
                serialized_concat, dtype=tf.int32)

            with self.assertRaisesOpError(
                    r"Inconsistent rank across SparseTensors: rank prior to "
                    r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
                sess.run(
                    sp_deserialized, {sp_input0: input0_val, sp_input1: input1_val})
Example #18
  def testDeserializeFailsInconsistentRank(self):
    with self.test_session(use_gpu=False) as sess:
      sp_input0 = self._SparseTensorPlaceholder()
      sp_input1 = self._SparseTensorPlaceholder()
      input0_val = self._SparseTensorValue_5x6(np.arange(6))
      input1_val = self._SparseTensorValue_1x1x1()
      serialized0 = tf.serialize_sparse(sp_input0)
      serialized1 = tf.serialize_sparse(sp_input1)
      serialized_concat = tf.pack([serialized0, serialized1])

      sp_deserialized = tf.deserialize_many_sparse(
          serialized_concat, dtype=tf.int32)

      with self.assertRaisesOpError(
          r"Inconsistent rank across SparseTensors: rank prior to "
          r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
        sess.run(
            sp_deserialized, {sp_input0: input0_val, sp_input1: input1_val})
Example #19
  def testDeserializeFailsWrongType(self):
    with self.test_session(use_gpu=False) as sess:
      sp_input0 = self._SparseTensorPlaceholder()
      sp_input1 = self._SparseTensorPlaceholder()
      input0_val = self._SparseTensorValue_5x6(np.arange(6))
      input1_val = self._SparseTensorValue_3x4(np.arange(6))
      serialized0 = tf.serialize_sparse(sp_input0)
      serialized1 = tf.serialize_sparse(sp_input1)
      serialized_concat = tf.pack([serialized0, serialized1])

      sp_deserialized = tf.deserialize_many_sparse(
          serialized_concat, dtype=tf.int64)

      with self.assertRaisesOpError(
          r"Requested SparseTensor of type int64 but "
          r"SparseTensor\[0\].values.dtype\(\) == int32"):
        sess.run(
            sp_deserialized, {sp_input0: input0_val, sp_input1: input1_val})
Example #20
  def testSerializeManyDeserializeManyRoundTrip(self):
    with self.test_session(use_gpu=False) as sess:
      # N == 4 because shape_value == [4, 5]
      indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
      values_value = np.array([b"a", b"b", b"c"])
      shape_value = np.array([4, 5], dtype=np.int64)
      sparse_tensor = self._SparseTensorPlaceholder(dtype=tf.string)
      serialized = tf.serialize_many_sparse(sparse_tensor)
      deserialized = tf.deserialize_many_sparse(serialized, dtype=tf.string)
      serialized_value, deserialized_value = sess.run(
          [serialized, deserialized],
          feed_dict={sparse_tensor.indices: indices_value,
                     sparse_tensor.values: values_value,
                     sparse_tensor.shape: shape_value})
      self.assertEqual(serialized_value.shape, (4, 3))
      self.assertAllEqual(deserialized_value.indices, indices_value)
      self.assertAllEqual(deserialized_value.values, values_value)
      self.assertAllEqual(deserialized_value.shape, shape_value)
Example #21
    def testDeserializeFailsWrongType(self):
        with self.test_session(use_gpu=False) as sess:
            sp_input0 = self._SparseTensorPlaceholder()
            sp_input1 = self._SparseTensorPlaceholder()
            input0_val = self._SparseTensorValue_5x6(np.arange(6))
            input1_val = self._SparseTensorValue_3x4(np.arange(6))
            serialized0 = tf.serialize_sparse(sp_input0)
            serialized1 = tf.serialize_sparse(sp_input1)
            serialized_concat = tf.pack([serialized0, serialized1])

            sp_deserialized = tf.deserialize_many_sparse(
                serialized_concat, dtype=tf.int64)

            with self.assertRaisesOpError(
                    r"Requested SparseTensor of type int64 but "
                    r"SparseTensor\[0\].values.dtype\(\) == int32"):
                sess.run(
                    sp_deserialized, {sp_input0: input0_val, sp_input1: input1_val})
Example #22
    def testSerializeManyDeserializeManyRoundTrip(self):
        with self.test_session(use_gpu=False) as sess:
            # N == 4 because shape_value == [4, 5]
            indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
            values_value = np.array([b"a", b"b", b"c"])
            shape_value = np.array([4, 5], dtype=np.int64)
            sparse_tensor = self._SparseTensorPlaceholder(dtype=tf.string)
            serialized = tf.serialize_many_sparse(sparse_tensor)
            deserialized = tf.deserialize_many_sparse(serialized, dtype=tf.string)
            serialized_value, deserialized_value = sess.run(
                [serialized, deserialized],
                feed_dict={sparse_tensor.indices: indices_value,
                           sparse_tensor.values: values_value,
                           sparse_tensor.shape: shape_value})
            self.assertEqual(serialized_value.shape, (4, 3))
            self.assertAllEqual(deserialized_value.indices, indices_value)
            self.assertAllEqual(deserialized_value.values, values_value)
            self.assertAllEqual(deserialized_value.shape, shape_value)
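
For reference, tf.serialize_many_sparse emits one [3] string row per minibatch entry (the serialized indices, values, and shape), so with shape_value == [4, 5], hence N == 4, the serialized matrix has shape (4, 3), exactly as asserted.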
Example #23
  def testSerializeDeserializeMany(self):
    with self.test_session(use_gpu=False) as sess:
      sp_input0 = self._SparseTensorValue_5x6(np.arange(6))
      sp_input1 = self._SparseTensorValue_3x4(np.arange(6))
      serialized0 = tf.serialize_sparse(sp_input0)
      serialized1 = tf.serialize_sparse(sp_input1)
      serialized_concat = tf.pack([serialized0, serialized1])

      sp_deserialized = tf.deserialize_many_sparse(
          serialized_concat, dtype=tf.int32)

      combined_indices, combined_values, combined_shape = sess.run(
          sp_deserialized)

      self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
      self.assertAllEqual(combined_indices[:6, 1:], sp_input0[0])
      self.assertAllEqual(combined_indices[6:, 0], [1] * 6)  # minibatch 1
      self.assertAllEqual(combined_indices[6:, 1:], sp_input1[0])
      self.assertAllEqual(combined_values[:6], sp_input0[1])
      self.assertAllEqual(combined_values[6:], sp_input1[1])
      self.assertAllEqual(combined_shape, [2, 5, 6])
Example #24
def threaded_input_pipeline(base_dir,
                            file_patterns,
                            num_threads=4,
                            batch_size=32,
                            batch_device=None,
                            preprocess_device=None,
                            num_epochs=None):

    queue_capacity = num_threads * batch_size * 2
    # Allow a smaller final batch if we are going for a fixed number of epochs
    final_batch = (num_epochs is not None)

    data_queue, number_of_images = _get_data_queue(base_dir,
                                                   file_patterns,
                                                   capacity=queue_capacity,
                                                   num_epochs=num_epochs)

    # each thread has a subgraph with its own reader (sharing filename queue)
    # could be replaced with tf.train.batch https://stackoverflow.com/questions/35689547/how-to-process-single-training-file-in-parallel
    data_tuples = []  # list of subgraph [image, label, width, text] elements
    with tf.device(preprocess_device):
        for _ in range(num_threads):
            image, width, label, length, text, filename = _read_word_record(
                data_queue)  # reads images from the tfrecord files

            image = _preprocess_image(image)  # image normalization
            data_tuples.append([image, width, label, length, text, filename])

    with tf.device(batch_device):  # Create batch queue
        image, width, label, length, text, filename = tf.train.batch_join(
            data_tuples,
            batch_size=batch_size,
            capacity=queue_capacity,
            allow_smaller_final_batch=final_batch,
            dynamic_pad=True)
        label = tf.deserialize_many_sparse(label, tf.int64)  # post-batching...
        label = tf.cast(label, tf.int32)  # for ctc_loss
    return image, width, label, length, text, filename, number_of_images
Example #25
    def get_batch(self):
        """
        """
        # Get datasets
        datasets = []
        batch_sizes = []

        for i, (ds_name, ds_path, ds_portion) \
                in enumerate(zip(self.dataset_names,
                                 self.dataset_paths,
                                 self.dataset_portions)):

            # Extract
            if self.concat_batch:
                _batch_size = max(int(self.batch_size * ds_portion), 1) \
                        if i < len(self.dataset_names)-1 \
                        else max(self.batch_size - sum(batch_sizes), 1)
                batch_sizes.append(_batch_size)

            else:
                _batch_size = self.batch_size

            _data_files = glob.glob(ds_path, recursive=True)

            _dataset = tf.data.Dataset.list_files(
                _data_files, shuffle=self.shuffle_and_repeat, seed=self.seed)
            _dataset = _dataset.interleave(self.dataset_class,
                                           cycle_length=self.num_cpus,
                                           num_parallel_calls=self.num_cpus)

            if self.worker_index is not None:
                _dataset = _dataset.shard(self.num_gpus, self.worker_index)

            if self.shuffle_and_repeat:
                _dataset = _dataset.apply(
                    tf.contrib.data.shuffle_and_repeat(
                        buffer_size=_batch_size * self.buffer_size,
                        seed=self.seed))

            # Transform
            _dataset = _dataset.map(lambda *e: self.parse_fn(*e, ds_name),
                                    num_parallel_calls=self.num_cpus)

            if self.preprocess_image:
                _dataset = _dataset.map(self.preprocess_fn,
                                        num_parallel_calls=self.num_cpus)

            _dataset = _dataset.filter(self.filter_fn)
            _dataset = _dataset.batch(_batch_size)

            datasets.append(_dataset)

        # Load
        if self.concat_batch:
            batches = []

            for _dataset in datasets:
                _dataset = _dataset.apply(
                    tf.contrib.data.prefetch_to_device(self.input_device, 2))

                _iterator = _dataset.make_initializable_iterator()
                tf.add_to_collection(self.iterator_name, _iterator.initializer)
                _batch = _iterator.get_next()
                batches.append(_batch)

            batch = [tf.concat(elements, axis=0)
                     for elements
                     in zip(*batches)] \
                if len(batches) > 1 \
                else batches[0]

            print('DATASET BATCHES : {} = {}'.format(
                ' + '.join([str(size) for size in batch_sizes]),
                sum(batch_sizes)))

        else:
            concatted = datasets[0]

            for i in range(1, len(datasets)):
                concatted = concatted.concatenate(datasets[i])

            concatted = concatted.apply(
                tf.data.experimental.prefetch_to_device(self.input_device, 2))

            iterator = concatted.make_initializable_iterator()
            tf.add_to_collection(self.iterator_name, iterator.initializer)
            batch = iterator.get_next()

        image, label, length, text, filename, dataset_name = batch
        label = tf.deserialize_many_sparse(label, tf.int64)
        label = tf.cast(label, tf.int32)

        batch = Batch(image, label, length, text, filename, dataset_name)

        return batch
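
A hypothetical usage sketch; make_pipeline stands in for however this class is actually constructed, and the Batch fields are assumed to be namedtuple attributes. The point to note is that get_batch registers iterator initializers in a graph collection rather than returning them, so callers must run that collection before the first fetch:

pipeline = make_pipeline()  # hypothetical factory for the class above
batch = pipeline.get_batch()
with tf.Session() as sess:
    sess.run(tf.get_collection(pipeline.iterator_name))  # iterator initializers
    images, labels = sess.run([batch.image, batch.label])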