Example #1
0
  def from_examples(cls,
                    filepatterns,
                    batch_size,
                    features,
                    file_format=FileFormat.TFRECORD,
                    shuffle=True,
                    num_threads=1,
                    queue_capacity=None,
                    min_after_dequeue=None,
                    seed=None):
    """Build a `DataFrame` whose columns are parsed from `tensorflow.Example`s.

    Args:
      filepatterns: a list of file patterns containing `tensorflow.Example`s.
      batch_size: desired batch size.
      features: a dict mapping feature names to `VarLenFeature` or
        `FixedLenFeature`.
      file_format: a `FileFormat` indicating the format of the files in
        `filepatterns`.
      shuffle: whether records should be shuffled. Defaults to true.
      num_threads: the number of readers that will work in parallel.
      queue_capacity: capacity of the queue that will store parsed `Example`s.
      min_after_dequeue: minimum number of elements that can be left by a
        dequeue operation. Only used if `shuffle` is true.
      seed: passed to random shuffle operations. Only used if `shuffle` is true.

    Returns:
      A `DataFrame` that has columns corresponding to `features` and is filled
      with `Example`s from `filepatterns`.

    Raises:
      ValueError: no files match `filepatterns`.
      ValueError: `features` contains the reserved name 'index'.
    """
    # Expand the glob patterns up front so we can fail fast on a bad path.
    matched_files = _expand_file_names(filepatterns)
    if not matched_files:
      raise ValueError("No matching file names.")

    # "index" is reserved for the row-identity column added below.
    if "index" in features:
      raise ValueError(
          "'index' is reserved and can not be used for a feature name.")

    # Wire up a reader source for the matched files; calling it yields the
    # (index, serialized record) column pair.
    source = reader_source.ReaderSource(
        FILE_FORMAT_TO_READER_CLS[file_format],
        matched_files,
        batch_size=batch_size,
        queue_capacity=queue_capacity,
        shuffle=shuffle,
        min_after_dequeue=min_after_dequeue,
        num_threads=num_threads,
        seed=seed)
    index_column, record_column = source()

    # Parse the serialized records into one column per requested feature.
    parsed_columns = example_parser.ExampleParser(features)(record_column)
    columns = dict(parsed_columns._asdict(), index=index_column)

    result = cls()
    result.assign(**columns)
    return result
Example #2
0
  def testParseWithTupleDefinition(self):
    """Parse the fixture example column and verify both output features."""
    columns = example_parser.ExampleParser(self.features)(self.example_column)
    self.assertEqual(2, len(columns))

    # Build all output tensors against one shared cache so common
    # sub-expressions are reused across columns.
    shared_cache = {}
    tensors = [column.build(shared_cache) for column in columns]
    self.assertEqual(2, len(tensors))

    with self.test_session() as sess:
      string_feature, int_feature = sess.run(tensors)
      # The string feature is sparse: check shape, values, and indices.
      np.testing.assert_array_equal(string_feature.shape, np.array([2, 2]))
      np.testing.assert_array_equal(self.expected_string_values,
                                    string_feature.values)
      np.testing.assert_array_equal(self.expected_string_indices,
                                    string_feature.indices)
      # The int feature is dense: check shape and contents.
      np.testing.assert_array_equal(int_feature.shape, np.array([2, 3]))
      np.testing.assert_array_equal(self.expected_int_feature,
                                    int_feature)
Example #3
0
  def from_examples(cls,
                    filepatterns,
                    features,
                    reader_cls=io_ops.TFRecordReader,
                    num_epochs=None,
                    num_threads=1,
                    enqueue_size=None,
                    batch_size=32,
                    queue_capacity=None,
                    min_after_dequeue=None,
                    shuffle=True,
                    seed=None):
    """Build a `DataFrame` whose columns are parsed from `tensorflow.Example`s.

    Args:
      filepatterns: a list of file patterns containing `tensorflow.Example`s.
      features: a dict mapping feature names to `VarLenFeature` or
        `FixedLenFeature`.
      reader_cls: a subclass of `tensorflow.ReaderBase` that will be used to
        read the `Example`s.
      num_epochs: the number of times that the reader should loop through all
        the file names. If set to `None`, then the reader will continue
        indefinitely.
      num_threads: the number of readers that will work in parallel.
      enqueue_size: block size for each read operation.
      batch_size: desired batch size.
      queue_capacity: capacity of the queue that will store parsed `Example`s.
      min_after_dequeue: minimum number of elements that can be left by a
        dequeue operation. Only used if `shuffle` is true.
      shuffle: whether records should be shuffled. Defaults to true.
      seed: passed to random shuffle operations. Only used if `shuffle` is true.

    Returns:
      A `DataFrame` that has columns corresponding to `features` and is filled
      with `Example`s from `filepatterns`.

    Raises:
      ValueError: no files match `filepatterns`.
      ValueError: `features` contains the reserved name 'index'.
    """
    # Expand the glob patterns up front so we can fail fast on a bad path.
    matched_files = _expand_file_names(filepatterns)
    if not matched_files:
      raise ValueError("No matching file names.")

    # "index" is reserved for the row-identity column added below.
    if "index" in features:
      raise ValueError(
          "'index' is reserved and can not be used for a feature name.")

    # Wire up a reader source for the matched files; calling it yields the
    # (index, serialized record) column pair.
    source = reader_source.ReaderSource(
        reader_cls,
        matched_files,
        enqueue_size=enqueue_size,
        batch_size=batch_size,
        num_epochs=num_epochs,
        queue_capacity=queue_capacity,
        shuffle=shuffle,
        min_after_dequeue=min_after_dequeue,
        num_threads=num_threads,
        seed=seed)
    index_column, record_column = source()

    # Parse the serialized records into one column per requested feature.
    parsed_columns = example_parser.ExampleParser(features)(record_column)
    columns = dict(parsed_columns._asdict(), index=index_column)

    result = cls()
    result.assign(**columns)
    return result