Exemplo n.º 1
0
def read_tfrecords(path, proto=None, max_records=None, options=None):
    """Yields the parsed records in the tfrecord formatted file path.

    Note that path can be a sharded filespec (path@N), in which case this
    function reads each shard in order.

    Args:
        path: String. A path to a tfrecord formatted file containing protos.
        proto: A proto class. proto.FromString() will be called on each
            serialized record in path to parse it. Defaults to
            example_pb2.Example.
        max_records: int >= 0 or None. Maximum number of records to read from
            path. If None, the default, all records will be read.
        options: A python_io.TFRecordOptions object for the reader. If not
            given, make_tfrecord_options(path) is used.

    Yields:
        proto.FromString() values on each record in path in order.
    """
    if not proto:
        proto = example_pb2.Example

    if not options:
        options = make_tfrecord_options(path)

    if IsShardedFileSpec(path):
        paths = GenerateShardedFilenames(path)
    else:
        paths = [path]

    # Nothing was requested: stop before any shard is opened.
    if max_records is not None and max_records <= 0:
        return

    n_yielded = 0
    for shard_path in paths:
        for buf in python_io.tf_record_iterator(shard_path, options):
            yield proto.FromString(buf)
            n_yielded += 1
            # Test the limit right after yielding so that no record beyond
            # max_records is pulled from the reader only to be discarded.
            if max_records is not None and n_yielded >= max_records:
                return
Exemplo n.º 2
0
def read_tfrecords(path, proto=None, max_records=None, options=None):
  """Yields the parsed records in the tfrecord formatted file path.

  Note that path can be a sharded filespec (path@N), in which case this
  function reads each shard in order.

  Args:
    path: String. A path to a tfrecord formatted file containing protos.
    proto: A proto class. proto.FromString() will be called on each serialized
      record in path to parse it. Defaults to example_pb2.Example.
    max_records: int >= 0 or None. Maximum number of records to read from path.
      If None, the default, all records will be read.
    options: A python_io.TFRecordOptions object for the reader. If not given,
      make_tfrecord_options(path) is used.

  Yields:
    proto.FromString() values on each record in path in order.
  """
  if not proto:
    proto = example_pb2.Example

  if not options:
    options = make_tfrecord_options(path)

  if IsShardedFileSpec(path):
    paths = GenerateShardedFilenames(path)
  else:
    paths = [path]

  # Nothing was requested: stop before any shard is opened.
  if max_records is not None and max_records <= 0:
    return

  n_yielded = 0
  for shard_path in paths:
    for buf in python_io.tf_record_iterator(shard_path, options):
      yield proto.FromString(buf)
      n_yielded += 1
      # Test the limit right after yielding so that no record beyond
      # max_records is pulled from the reader only to be discarded.
      if max_records is not None and n_yielded >= max_records:
        return
def shuffle_records(fname):
    """Shuffle the records of a single TFRecord file in place.

    The file is first renamed to fname + ".unshuffled". All of its records are
    then loaded into memory, shuffled with random.shuffle, and written back
    under the original name. Finally the renamed copy is removed.

    Args:
        fname: Path of the file whose records should be shuffled.
    """
    print("Shuffling records in file %s" % fname)

    # Rename file prior to shuffling
    tmp_fname = fname + ".unshuffled"
    gfile.Rename(fname, tmp_fname)

    reader = python_io.tf_record_iterator(tmp_fname)
    records = []
    for record in reader:
        records.append(record)
        if len(records) % 100000 == 0:
            # Format the count into the message; passing it as a second
            # print() argument would emit the literal "%d".
            print("\tRead: %d" % len(records))

    random.shuffle(records)

    # Write shuffled records to original file name
    with python_io.TFRecordWriter(fname) as w:
        for count, record in enumerate(records):
            w.write(record)
            if count > 0 and count % 100000 == 0:
                print("\tWriting record: %d" % count)

    gfile.Remove(tmp_fname)
Exemplo n.º 4
0
def read_shard_sorted_tfrecords(path,
                                key,
                                proto=None,
                                max_records=None,
                                options=None):
  """Yields the parsed records in a TFRecord file path in sorted order.

  The input TFRecord file must have each shard already in sorted order when
  using the key function for comparison (but elements can be interleaved across
  shards). Under those constraints, the elements will be yielded in a global
  sorted order. Records from different shards whose keys compare equal are
  yielded in shard order; the protos themselves are never compared.

  Args:
    path: String. A path to a TFRecord-formatted file containing protos.
    key: Callable. A function that takes as input a single instance of the proto
      class and returns a value on which the comparison for sorted ordering is
      performed.
    proto: A proto class. proto.FromString() will be called on each serialized
      record in path to parse it. Defaults to example_pb2.Example.
    max_records: int >= 0 or None. Maximum number of records to read from path.
      If None, the default, all records will be read.
    options: A python_io.TFRecordOptions object for the reader. If None,
      make_tfrecord_options(path) is used.

  Yields:
    proto.FromString() values on each record in path in sorted order.
  """
  if proto is None:
    proto = example_pb2.Example

  if options is None:
    options = make_tfrecord_options(path)

  if IsShardedFileSpec(path):
    paths = GenerateShardedFilenames(path)
  else:
    paths = [path]

  # Nothing was requested: stop before any shard is opened.
  if max_records is not None and max_records <= 0:
    return

  def _keyed_records(shard_index, serialized_records):
    """Yields (sort key, shard index, proto) for each record of one shard."""
    for buf in serialized_records:
      elem = proto.FromString(buf)
      # The shard index breaks ties between equal keys, so heapq.merge never
      # falls through to comparing the proto messages themselves.
      yield key(elem), shard_index, elem

  # shard_index is passed as an argument (not closed over) so each generator
  # keeps its own value.
  keyed_iterables = [
      _keyed_records(shard_index,
                     python_io.tf_record_iterator(shard_path, options))
      for shard_index, shard_path in enumerate(paths)
  ]

  n_yielded = 0
  for _, _, value in heapq.merge(*keyed_iterables):
    yield value
    n_yielded += 1
    # Test the limit right after yielding so that no record beyond
    # max_records is pulled from the merge only to be discarded.
    if max_records is not None and n_yielded >= max_records:
      return
Exemplo n.º 5
0
def read_shard_sorted_tfrecords(path,
                                key,
                                proto=None,
                                max_records=None,
                                options=None):
  """Yields the parsed records in TFRecord-formatted file path in sorted order.

  The input TFRecord file must have each shard already in sorted order when
  using the key function for comparison (but elements can be interleaved across
  shards). Under those constraints, the elements will be yielded in a global
  sorted order. Records from different shards whose keys compare equal are
  yielded in shard order; the protos themselves are never compared.

  Args:
    path: String. A path to a TFRecord-formatted file containing protos.
    key: Callable. A function that takes as input a single instance of the proto
      class and returns a value on which the comparison for sorted ordering is
      performed.
    proto: A proto class. proto.FromString() will be called on each serialized
      record in path to parse it. Defaults to example_pb2.Example.
    max_records: int >= 0 or None. Maximum number of records to read from path.
      If None, the default, all records will be read.
    options: A python_io.TFRecordOptions object for the reader. If None,
      make_tfrecord_options(path) is used.

  Yields:
    proto.FromString() values on each record in path in sorted order.
  """
  if proto is None:
    proto = example_pb2.Example

  if options is None:
    options = make_tfrecord_options(path)

  if IsShardedFileSpec(path):
    paths = GenerateShardedFilenames(path)
  else:
    paths = [path]

  # Nothing was requested: stop before any shard is opened.
  if max_records is not None and max_records <= 0:
    return

  def _keyed_records(shard_index, serialized_records):
    """Yields (sort key, shard index, proto) for each record of one shard."""
    for buf in serialized_records:
      elem = proto.FromString(buf)
      # The shard index breaks ties between equal keys, so heapq.merge never
      # falls through to comparing the proto messages themselves.
      yield key(elem), shard_index, elem

  # shard_index is passed as an argument (not closed over) so each generator
  # keeps its own value.
  keyed_iterables = [
      _keyed_records(shard_index,
                     python_io.tf_record_iterator(shard_path, options))
      for shard_index, shard_path in enumerate(paths)
  ]

  n_yielded = 0
  for _, _, value in heapq.merge(*keyed_iterables):
    yield value
    n_yielded += 1
    # Test the limit right after yielding so that no record beyond
    # max_records is pulled from the merge only to be discarded.
    if max_records is not None and n_yielded >= max_records:
      return
Exemplo n.º 6
0
 def iterate(self):
   """Yields every record of self.input_path, parsed via self.proto."""
   # redacted
   serialized_records = python_io.tf_record_iterator(
       self.input_path, self.tf_options)
   for serialized in serialized_records:
     yield self.proto.FromString(serialized)
Exemplo n.º 7
0
 def iterate(self):
     """Yields each record of self.input_path, parsed via self.proto."""
     # redacted
     serialized_records = python_io.tf_record_iterator(
         self.input_path, self.tf_options)
     for serialized in serialized_records:
         yield self.proto.FromString(serialized)
Exemplo n.º 8
0
 def iterate(self):
     """Yields every record of self.input_path, parsed via self.proto."""
     # redacted
     serialized_records = python_io.tf_record_iterator(
         self.input_path, self.tf_options)
     for serialized in serialized_records:
         yield self.proto.FromString(serialized)
Exemplo n.º 9
0
 def iterate(self):
     """Yields every record of self.input_path, parsed via self.proto."""
     # TODO(thomaswc): Support max_records?
     serialized_records = python_io.tf_record_iterator(
         self.input_path, self.tf_options)
     for serialized in serialized_records:
         yield self.proto.FromString(serialized)
Exemplo n.º 10
0
 def iterate(self):
     """Yields each record of self.input_path, parsed via self.proto."""
     # TODO(thomaswc): Support max_records?
     serialized_records = python_io.tf_record_iterator(
         self.input_path, self.tf_options)
     for serialized in serialized_records:
         yield self.proto.FromString(serialized)