Example #1
  def _benchmarkMapAndFilter(self, chain_length, optimize_dataset):
    with ops.Graph().as_default():
      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
      for _ in range(chain_length):
        dataset = dataset.map(lambda x: x + 5).filter(
            lambda x: math_ops.greater_equal(x - 5, 0))
      if optimize_dataset:
        dataset = dataset.apply(
            optimization.optimize(["map_and_filter_fusion"]))

      iterator = dataset.make_one_shot_iterator()
      next_element = iterator.get_next()

      with session.Session() as sess:
        # Warm-up runs before any timing is recorded.
        for _ in range(10):
          sess.run(next_element.op)
        deltas = []
        # Each delta measures 100 consecutive get_next() calls.
        for _ in range(100):
          start = time.time()
          for _ in range(100):
            sess.run(next_element.op)
          end = time.time()
          deltas.append(end - start)

        median_wall_time = np.median(deltas) / 100
        opt_mark = "opt" if optimize_dataset else "no-opt"
        print("Map and filter dataset {} chain length: {} Median wall time: {}".
              format(opt_mark, chain_length, median_wall_time))
        self.report_benchmark(
            iters=1000,
            wall_time=median_wall_time,
            name="benchmark_map_and_filter_dataset_chain_latency_{}_{}".format(
                opt_mark, chain_length))
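A method like this is meant to be driven from a tf.test.Benchmark subclass, which is where report_benchmark() comes from. A minimal sketch of such a driver, with an assumed class name and assumed chain lengths:

import tensorflow as tf


class MapAndFilterFusionBenchmark(tf.test.Benchmark):
  # _benchmarkMapAndFilter is the method shown above, defined on this class.

  def benchmarkMapAndFilterChains(self):
    for chain_length in (5, 10, 20):
      # Time each chain length with and without the fusion rewrite so the
      # reported wall times can be compared directly.
      self._benchmarkMapAndFilter(chain_length, optimize_dataset=False)
      self._benchmarkMapAndFilter(chain_length, optimize_dataset=True)


if __name__ == "__main__":
  tf.test.main()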
Example #3
def build_dataset():
    dataset = dataset_ops.Dataset.range(100)
    dataset = dataset.map(lambda x: x)
    dataset = dataset.batch(5)
    # map_vectorization adds a new vectorized function to the function
    # library.
    dataset = dataset.apply(
        optimization.optimize(["map_vectorization"]))
    return dataset
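To see the rewrite take effect, the dataset returned by build_dataset() has to be iterated. A rough graph-mode driver for that (an assumption, not part of the original test) could look like this:

import tensorflow as tf

# Assumed driver: pulls every batch through a one-shot iterator so the
# rewritten (vectorized) map function is actually executed.
with tf.Graph().as_default():
  dataset = build_dataset()
  next_element = dataset.make_one_shot_iterator().get_next()
  with tf.Session() as sess:
    while True:
      try:
        print(sess.run(next_element))  # batches of five consecutive integers
      except tf.errors.OutOfRangeError:
        break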
Example #4
def build_dataset(num_elements, batch_size):
    return dataset_ops.Dataset.range(num_elements).map(
        lambda x: x * x).batch(batch_size).apply(
            optimization.optimize(["map_and_batch_fusion"]))
Example #5
def build_dataset(num_elements, batch_size):
    return dataset_ops.Dataset.range(num_elements).map(lambda x: x * x).batch(
        batch_size).apply(optimization.optimize(["map_and_batch_fusion"]))
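Either variant can be exercised the same way; a short assumed usage (the element count and batch size below are arbitrary) in which the first batch is the squares of 0 through 9:

import tensorflow as tf

# Assumed usage; the numeric values are illustrative only.
dataset = build_dataset(num_elements=100, batch_size=10)
next_batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
  print(sess.run(next_batch))  # [ 0  1  4  9 16 25 36 49 64 81]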
Example #6
# Imports assumed for this snippet (TF 1.x-era APIs).
import functools
import random

import tensorflow as tf
import horovod.tensorflow as hvd
from tensorflow.python.data.experimental.ops import optimization


def read_tf_records(batch_size,
                    tf_records,
                    num_repeats=1,
                    shuffle_records=True,
                    shuffle_examples=True,
                    shuffle_buffer_size=None,
                    interleave=True,
                    filter_amount=1.0,
                    dist_train=False,
                    seed=0):
    """
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: One)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        interleave: iwhether to interleave examples from multiple tf_records
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    """
    if shuffle_examples and not shuffle_buffer_size:
        raise ValueError("Must set shuffle buffer size if shuffling examples")

    tf_records = list(tf_records)

    random.seed(seed)

    #if shuffle_records:
    #    random.shuffle(tf_records)

    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    if dist_train:
        record_list = record_list.shard(hvd.size(), hvd.rank())

    # compression_type here must agree with write_tf_examples
    map_func = functools.partial(tf.data.TFRecordDataset,
                                 buffer_size=8 * 1024 * 1024,
                                 compression_type='ZLIB')

    if interleave:
        # cycle_length = how many tfrecord files are read in parallel
        # The idea is to shuffle both the order of the files being read,
        # and the examples being read from the files.
        dataset = record_list.apply(
            tf.data.experimental.parallel_interleave(map_func,
                                                     cycle_length=1000,
                                                     sloppy=True))
    else:
        dataset = record_list.flat_map(map_func)

    if filter_amount < 1.0:
        dataset = dataset.filter(
            lambda _: tf.random.uniform([], seed=seed) < filter_amount)
        dataset = dataset.apply(
            optimization.optimize(["filter_with_random_uniform_fusion"]))

    #if dist_train:
    #    dataset = dataset.shard(hvd.size(), hvd.rank())

    dataset = dataset.repeat(num_repeats)

    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset
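A sketch of how this reader might be invoked; the file names, batch size, buffer size, and filter fraction below are placeholders rather than values taken from the original pipeline:

# Placeholder call; all file names and numeric values are assumptions.
files = ["examples-000.tfrecord.zz", "examples-001.tfrecord.zz"]
dataset = read_tf_records(
    batch_size=256,
    tf_records=files,
    num_repeats=1,
    shuffle_examples=True,
    shuffle_buffer_size=10000,
    filter_amount=0.5)
# Each element is a vector of batch_size serialized tf.Example protos.
batched_examples = dataset.make_one_shot_iterator().get_next()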