def testLargeBufferSize(self):
  with ops.Graph().as_default() as g:
    ds = dataset_ops.Dataset.range(20).apply(
        shuffle_ops.shuffle_and_repeat(buffer_size=21))
    get_next_op = ds.make_one_shot_iterator().get_next()
    with self.test_session(graph=g) as sess:
      sess.run(get_next_op)
def testLargeBufferSize(self):
    with ops.Graph().as_default() as g:
        ds = dataset_ops.Dataset.range(20).apply(
            shuffle_ops.shuffle_and_repeat(buffer_size=21))
        get_next_op = ds.make_one_shot_iterator().get_next()
        with self.session(graph=g) as sess:
            sess.run(get_next_op)
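For context, a minimal standalone sketch of the same behavior against the public TF 1.x API (assuming the `tf.contrib.data.shuffle_and_repeat` export is available): because `buffer_size` (21) exceeds the dataset size (20), every element fits in the shuffle buffer and each epoch is a full random permutation.

```python
from collections import Counter

import tensorflow as tf  # TF 1.x with tf.contrib assumed available

# buffer_size=21 > 20 elements, so the whole dataset is buffered and each of
# the two epochs is a full uniform shuffle of range(20).
ds = tf.data.Dataset.range(20).apply(
    tf.contrib.data.shuffle_and_repeat(buffer_size=21, count=2))
get_next = ds.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    values = [sess.run(get_next) for _ in range(40)]

counts = Counter(int(v) for v in values)
assert all(counts[i] == 2 for i in range(20))  # each element appears once per epoch
```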
Example #3
def _maybe_shuffle_and_repeat(
    dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed):
  """Optionally shuffle and repeat dataset, as requested."""
  if num_epochs != 1 and shuffle:
    # Use shuffle_and_repeat for perf
    return dataset.apply(
        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
                                       shuffle_seed))
  elif shuffle:
    return dataset.shuffle(shuffle_buffer_size, shuffle_seed)
  elif num_epochs != 1:
    return dataset.repeat(num_epochs)
  return dataset
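As a usage sketch (assuming the public TF 1.x export `tf.contrib.data.shuffle_and_repeat`), the fused branch taken by this helper is documented to yield the same elements as chaining `shuffle(..., reshuffle_each_iteration=True).repeat(...)`, just without the extra iterator stage:

```python
import tensorflow as tf  # TF 1.x with tf.contrib assumed available

base = tf.data.Dataset.range(10)

# Fused form, as chosen by the helper above when both shuffling and
# multi-epoch repetition are requested.
fused = base.apply(
    tf.contrib.data.shuffle_and_repeat(buffer_size=10, count=3, seed=7))

# Chained form it replaces; same output, one extra iterator layer.
chained = base.shuffle(10, seed=7, reshuffle_each_iteration=True).repeat(3)
```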
Example #4
def _maybe_shuffle_and_repeat(dataset, num_epochs, shuffle,
                              shuffle_buffer_size, shuffle_seed):
    """Optionally shuffle and repeat dataset, as requested."""
    if num_epochs != 1 and shuffle:
        # Use shuffle_and_repeat for perf
        return dataset.apply(
            shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
                                           shuffle_seed))
    elif shuffle:
        return dataset.shuffle(shuffle_buffer_size, shuffle_seed)
    elif num_epochs != 1:
        return dataset.repeat(num_epochs)
    return dataset
Example #5
def make_batched_features_dataset(file_pattern,
                                  batch_size,
                                  features,
                                  reader=core_readers.TFRecordDataset,
                                  reader_args=None,
                                  num_epochs=None,
                                  shuffle=True,
                                  shuffle_buffer_size=10000,
                                  shuffle_seed=None,
                                  prefetch_buffer_size=1,
                                  reader_num_threads=1,
                                  parser_num_threads=2,
                                  sloppy_ordering=False,
                                  drop_final_batch=False):
  """Returns a `Dataset` of feature dictionaries from `Example` protos.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]],
      values=["code", "art", "sports"],
      dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of consecutive elements of this
      dataset to combine in a single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.parse_example`.
    reader: A function or class that can be
      called with a `filenames` tensor and (optional) `reader_args` and returns
      a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to `None`.
    shuffle: A boolean that indicates whether the input should be shuffled. Defaults
      to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but would increase memory usage and startup time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step (default is 1).
    reader_num_threads: Number of threads used to read `Example` records. If >1,
      the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements. Each `dict` maps feature keys to
    `Tensor` or `SparseTensor` objects.
  """
  # Create dataset of all matching filenames
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Read `Example` records from files as tensor objects.
  if reader_args is None:
    reader_args = []

  # Read files sequentially (if reader_num_threads=1) or in parallel
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          lambda filename: reader(filename, *reader_args),
          cycle_length=reader_num_threads,
          sloppy=sloppy_ordering))

  # Extract values if the `Example` tensors are stored as key-value tuples.
  if dataset.output_types == (dtypes.string, dtypes.string):
    dataset = dataset.map(lambda _, v: v)

  # Apply dataset repeat and shuffle transformations.
  repeat_dataset = (num_epochs != 1)
  if repeat_dataset and shuffle:
    # Use the fused shuffle_and_repeat operation for better performance.
    dataset = dataset.apply(
        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
                                       shuffle_seed))
  elif repeat_dataset:
    dataset = dataset.repeat(num_epochs)
  elif shuffle:
    dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)

  if drop_final_batch:
    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
  else:
    dataset = dataset.batch(batch_size)

  # Parse `Example` tensors to a dictionary of `Feature` tensors.
  dataset = dataset.map(
      lambda x: parsing_ops.parse_example(x, features),
      num_parallel_calls=parser_num_threads)

  # TODO(rachelim): Add an optional label_name argument for extracting the label
  # from the features dictionary, to comply with the type expected by the
  # input_fn to a `tf.Estimator.train` or `tf.Estimator.evaluate` function.
  dataset = dataset.prefetch(prefetch_buffer_size)
  return dataset
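A hedged end-to-end usage sketch of this function through its public TF 1.x export, `tf.contrib.data.make_batched_features_dataset`; the file pattern is hypothetical and the feature spec mirrors the docstring example above.

```python
import tensorflow as tf  # TF 1.x with tf.contrib assumed available

features = {
    "age": tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": tf.FixedLenFeature([], dtype=tf.string),
    "kws": tf.VarLenFeature(dtype=tf.string),
}

dataset = tf.contrib.data.make_batched_features_dataset(
    file_pattern="/tmp/examples-*.tfrecord",  # hypothetical path
    batch_size=32,
    features=features,
    num_epochs=1,
    shuffle=True,
    shuffle_buffer_size=1000)

batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    try:
        while True:
            feature_dict = sess.run(batch)  # dict of dense batches and SparseTensorValues
    except tf.errors.OutOfRangeError:
        pass
```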
Example #6
def make_csv_dataset(
    file_pattern,
    batch_size,
    column_names=None,
    column_defaults=None,
    label_name=None,
    field_delim=",",
    use_quote_delim=True,
    na_value="",
    header=True,
    comment=None,
    num_epochs=None,
    shuffle=True,
    shuffle_buffer_size=10000,
    shuffle_seed=None,
    prefetch_buffer_size=1,
    num_parallel_reads=1,
    num_parallel_parser_calls=2,
    sloppy=False,
    default_float_type=dtypes.float32,
    num_rows_for_inference=100,
):
  """Reads CSV files into a dataset.

  Reads CSV files into a dataset, where each element is a (features, labels)
  tuple that corresponds to a batch of CSV rows. The features dictionary
  maps feature column names to `Tensor`s containing the corresponding
  feature data, and labels is a `Tensor` containing the batch's label data.

  Args:
    file_pattern: List of files or patterns of file paths containing CSV
      records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of consecutive elements of this
      dataset to combine in a single batch.
    column_names: An optional list of strings that corresponds to the CSV
      columns, in order. One per column of the input record. If this is not
      provided, infers the column names from the first row of the records.
      These names will be the keys of the features dict of each dataset element.
    column_defaults: An optional list of default values for the CSV fields. One
      item per column of the input record. Each item in the list is either a
      valid CSV dtype (float32, float64, int32, int64, or string), or a
      `Tensor` with one of the aforementioned types. The tensor can either be
      a scalar default value (if the column is optional), or an empty tensor (if
      the column is required). If a dtype is provided instead of a tensor, the
      column is also treated as required. If this list is not provided, tries
      to infer types based on reading the first num_rows_for_inference rows of
      files specified, and assumes all columns are optional, defaulting to `0`
      for numeric values and `""` for string values.
    label_name: An optional string corresponding to the label column. If
      provided, the data for this column is returned as a separate `Tensor` from
      the features dictionary, so that the dataset complies with the format
      expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
      function.
    field_delim: An optional `string`. Defaults to `","`. Char delimiter to
      separate fields in a record.
    use_quote_delim: An optional bool. Defaults to `True`. If false, treats
      double quotation marks as regular characters inside of the string fields.
    na_value: Additional string to recognize as NA/NaN.
    header: A bool that indicates whether the first rows of provided CSV files
      correspond to header lines with column names, and should not be included
      in the data.
    comment: An optional character string that marks lines that should not be
      parsed as csv records. If this is provided, all lines that start with
      this character will not be parsed.
    num_epochs: An int specifying the number of times this dataset is repeated.
      If None, cycles through the dataset forever.
    shuffle: A bool that indicates whether the input should be shuffled.
    shuffle_buffer_size: Buffer size to use for shuffling. A large buffer size
      ensures better shuffling, but would increase memory usage and startup
      time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: An int specifying the number of feature batches to
      prefetch for performance improvement. Recommended value is the number of
      batches consumed per training step.
    num_parallel_reads: Number of threads used to read CSV records from files.
      If >1, the results will be interleaved.
    num_parallel_parser_calls: Number of parallel invocations of the CSV parsing
      function on CSV records.
    sloppy: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    default_float_type: Either `tf.float32` or `tf.float64`. If defaults are
      not provided, float-like strings are interpreted to be this type.
    num_rows_for_inference: Number of rows of a file to use for type inference
      if record_defaults is not provided. If None, reads all the rows of all
      the files. Defaults to 100.

  Returns:
    A dataset, where each element is a (features, labels) tuple that corresponds
    to a batch of `batch_size` CSV rows. The features dictionary maps feature
    column names to `Tensor`s containing the corresponding column data, and
    labels is a `Tensor` containing the column data for the label column
    specified by `label_name`.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
  # Create dataset of all matching filenames
  filenames = _get_file_names(file_pattern, False)
  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
  if shuffle:
    dataset = dataset.shuffle(len(filenames), shuffle_seed)

  # Clean arguments; figure out column names and defaults
  if comment is not None and len(comment) != 1:
    raise ValueError("`comment` arg must be a single-character string or None")

  if column_names is None:
    if not header:
      raise ValueError("Cannot infer column names without a header line.")
    # If column names are not provided, infer from the header lines
    column_names = _infer_column_names(filenames, field_delim, use_quote_delim)
  if len(column_names) != len(set(column_names)):
    raise ValueError("Cannot have duplicate column names.")

  if column_defaults is not None:
    column_defaults = [
        constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
        for x in column_defaults
    ]
  else:
    # If column defaults are not provided, infer from records at graph
    # construction time
    column_defaults = _infer_column_defaults(
        filenames, len(column_names), field_delim, use_quote_delim, na_value,
        header, comment, default_float_type, num_rows_for_inference)

  if label_name is not None and label_name not in column_names:
    raise ValueError("`label_name` provided must be one of the columns.")

  # Define map and filter functions
  def filter_fn(line):
    return math_ops.not_equal(string_ops.substr(line, 0, 1), comment)

  def filename_to_dataset(filename):
    ds = core_readers.TextLineDataset(filename)
    if header:
      ds = ds.skip(1)
    if comment is not None:
      ds = ds.filter(filter_fn)
    return ds

  def decode_csv(line):
    """Decodes CSV line into features.

    Args:
      line: String tensor corresponding to one csv record.
    Returns:
      A dictionary of feature names to values for that particular record. If
      label_name is provided, extracts the label feature to be returned as the
      second element of the tuple.
    """
    columns = parsing_ops.decode_csv(
        line,
        column_defaults,
        field_delim=field_delim,
        use_quote_delim=use_quote_delim,
        na_value=na_value,
    )
    features = dict(zip(column_names, columns))
    if label_name is not None:
      label = features.pop(label_name)
      return features, label
    return features

  # Read files sequentially or in parallel
  dataset = dataset.apply(
      interleave_ops.parallel_interleave(
          filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy))

  if num_epochs != 1 and shuffle:
    # Use shuffle_and_repeat for perf
    dataset = dataset.apply(
        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
                                       shuffle_seed))
  elif shuffle:
    dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
  elif num_epochs != 1:
    dataset = dataset.repeat(num_epochs)

  # Use map_and_batch for perf
  # TODO(b/76425672): use num_parallel_calls for better performance tuning when
  # that is added
  dataset = dataset.apply(
      batching.map_and_batch(
          map_func=decode_csv,
          batch_size=batch_size,
          num_parallel_batches=int(
              ceil(num_parallel_parser_calls / batch_size))))

  dataset = dataset.prefetch(prefetch_buffer_size)
  return dataset
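A short usage sketch via the public TF 1.x export `tf.contrib.data.make_csv_dataset`; the file pattern and label column name are hypothetical.

```python
import tensorflow as tf  # TF 1.x with tf.contrib assumed available

dataset = tf.contrib.data.make_csv_dataset(
    file_pattern="/tmp/train-*.csv",  # hypothetical path
    batch_size=64,
    label_name="label",               # hypothetical label column
    num_epochs=1,
    shuffle=True,
    shuffle_buffer_size=10000)

# Each element is a (features, labels) tuple of batched tensors.
features, labels = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    batch_features, batch_labels = sess.run([features, labels])
```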
Example #7
def _build_ds(self, seed):
    return dataset_ops.Dataset.range(20).apply(
        shuffle_ops.shuffle_and_repeat(buffer_size=5, count=5, seed=seed))
Example #8
def _build_ds(self, seed, count=5, num_elements=20):
    return dataset_ops.Dataset.range(num_elements).apply(
        shuffle_ops.shuffle_and_repeat(buffer_size=5,
                                       count=count,
                                       seed=seed))
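These `_build_ds` helpers come from determinism/checkpointing tests; the point of the explicit `seed` is that the shuffled order becomes reproducible. A minimal sketch of that property against the public TF 1.x API (assumed available):

```python
import tensorflow as tf  # TF 1.x with tf.contrib assumed available

def build_ds(seed, count=5, num_elements=20):
    return tf.data.Dataset.range(num_elements).apply(
        tf.contrib.data.shuffle_and_repeat(buffer_size=5, count=count, seed=seed))

def collect(seed):
    with tf.Graph().as_default():
        nxt = build_ds(seed).make_one_shot_iterator().get_next()
        with tf.Session() as sess:
            return [int(sess.run(nxt)) for _ in range(100)]  # 5 epochs * 20 elements

# With the op-level seed fixed, two fresh graphs should yield the same order.
assert collect(seed=10) == collect(seed=10)
```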
Example #9
def _build_ds(self, seed):
  return dataset_ops.Dataset.range(20).apply(
      shuffle_ops.shuffle_and_repeat(buffer_size=5, count=5, seed=seed))
Example #10
def make_batched_features_dataset(file_pattern,
                                  batch_size,
                                  features,
                                  reader=core_readers.TFRecordDataset,
                                  reader_args=None,
                                  num_epochs=None,
                                  shuffle=True,
                                  shuffle_buffer_size=10000,
                                  shuffle_seed=None,
                                  prefetch_buffer_size=1,
                                  reader_num_threads=1,
                                  parser_num_threads=2,
                                  sloppy_ordering=False,
                                  drop_final_batch=False):
    """Returns a `Dataset` of feature dictionaries from `Example` protos.

  Example:

  ```
  serialized_examples = [
    features {
      feature { key: "age" value { int64_list { value: [ 0 ] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
    },
    features {
      feature { key: "age" value { int64_list { value: [] } } }
      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
    }
  ]
  ```

  We can use arguments:

  ```
  features: {
    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
    "gender": FixedLenFeature([], dtype=tf.string),
    "kws": VarLenFeature(dtype=tf.string),
  }
  ```

  And the expected output is:

  ```python
  {
    "age": [[0], [-1]],
    "gender": [["f"], ["f"]],
    "kws": SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]],
      values=["code", "art", "sports"],
      dense_shape=[2, 2]),
  }
  ```

  Args:
    file_pattern: List of files or patterns of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of consecutive elements of this
      dataset to combine in a single batch.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. See `tf.parse_example`.
    reader: A function or class that can be
      called with a `filenames` tensor and (optional) `reader_args` and returns
      a `Dataset` of `Example` tensors. Defaults to `tf.data.TFRecordDataset`.
    reader_args: Additional arguments to pass to the reader class.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. Defaults to `None`.
    shuffle: A boolean that indicates whether the input should be shuffled. Defaults
      to `True`.
    shuffle_buffer_size: Buffer size of the ShuffleDataset. A large capacity
      ensures better shuffling but would increase memory usage and startup time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: Number of feature batches to prefetch in order to
      improve performance. Recommended value is the number of batches consumed
      per training step (default is 1).
    reader_num_threads: Number of threads used to read `Example` records. If >1,
      the results will be interleaved.
    parser_num_threads: Number of threads to use for parsing `Example` tensors
      into a dictionary of `Feature` tensors.
    sloppy_ordering: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    drop_final_batch: If `True`, and the batch size does not evenly divide the
      input dataset size, the final smaller batch will be dropped. Defaults to
      `False`.

  Returns:
    A dataset of `dict` elements. Each `dict` maps feature keys to
    `Tensor` or `SparseTensor` objects.
  """
    # Create dataset of all matching filenames
    filenames = _get_file_names(file_pattern, False)
    dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
    if shuffle:
        dataset = dataset.shuffle(len(filenames), shuffle_seed)

    # Read `Example` records from files as tensor objects.
    if reader_args is None:
        reader_args = []

    # Read files sequentially (if reader_num_threads=1) or in parallel
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(
            lambda filename: reader(filename, *reader_args),
            cycle_length=reader_num_threads,
            sloppy=sloppy_ordering))

    # Extract values if the `Example` tensors are stored as key-value tuples.
    if dataset.output_types == (dtypes.string, dtypes.string):
        dataset = dataset.map(lambda _, v: v)

    # Apply dataset repeat and shuffle transformations.
    repeat_dataset = (num_epochs != 1)
    if repeat_dataset and shuffle:
        # Use the fused shuffle_and_repeat operation for better performance.
        dataset = dataset.apply(
            shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
                                           shuffle_seed))
    elif repeat_dataset:
        dataset = dataset.repeat(num_epochs)
    elif shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)

    if drop_final_batch:
        dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
    else:
        dataset = dataset.batch(batch_size)

    # Parse `Example` tensors to a dictionary of `Feature` tensors.
    dataset = dataset.map(lambda x: parsing_ops.parse_example(x, features),
                          num_parallel_calls=parser_num_threads)

    # TODO(rachelim): Add an optional label_name argument for extracting the label
    # from the features dictionary, to comply with the type expected by the
    # input_fn to a `tf.Estimator.train` or `tf.Estimator.evaluate` function.
    dataset = dataset.prefetch(prefetch_buffer_size)
    return dataset
Example #11
def make_csv_dataset(
    file_pattern,
    batch_size,
    column_names=None,
    column_defaults=None,
    label_name=None,
    select_columns=None,
    field_delim=",",
    use_quote_delim=True,
    na_value="",
    header=True,
    comment=None,
    num_epochs=None,
    shuffle=True,
    shuffle_buffer_size=10000,
    shuffle_seed=None,
    prefetch_buffer_size=1,
    num_parallel_reads=1,
    num_parallel_parser_calls=2,
    sloppy=False,
    default_float_type=dtypes.float32,
    num_rows_for_inference=100,
):
    """Reads CSV files into a dataset.

  Reads CSV files into a dataset, where each element is a (features, labels)
  tuple that corresponds to a batch of CSV rows. The features dictionary
  maps feature column names to `Tensor`s containing the corresponding
  feature data, and labels is a `Tensor` containing the batch's label data.

  Args:
    file_pattern: List of files or patterns of file paths containing CSV
      records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int representing the number of consecutive elements of this
      dataset to combine in a single batch.
    column_names: An optional list of strings that corresponds to the CSV
      columns, in order. One per column of the input record. If this is not
      provided, infers the column names from the first row of the records.
      These names will be the keys of the features dict of each dataset element.
    column_defaults: An optional list of default values for the CSV fields. One
      item per selected column of the input record. Each item in the list is
      either a valid CSV dtype (float32, float64, int32, int64, or string), or a
      `Tensor` with one of the aforementioned types. The tensor can either be
      a scalar default value (if the column is optional), or an empty tensor (if
      the column is required). If a dtype is provided instead of a tensor, the
      column is also treated as required. If this list is not provided, tries
      to infer types based on reading the first num_rows_for_inference rows of
      files specified, and assumes all columns are optional, defaulting to `0`
      for numeric values and `""` for string values. If both this and
      `select_columns` are specified, these must have the same lengths, and
      `column_defaults` is assumed to be sorted in order of increasing column
      index.
    label_name: An optional string corresponding to the label column. If
      provided, the data for this column is returned as a separate `Tensor` from
      the features dictionary, so that the dataset complies with the format
      expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
      function.
    select_columns: An optional list of integer indices or string column
      names, that specifies a subset of columns of CSV data to select. If
      column names are provided, these must correspond to names provided in
      `column_names` or inferred from the file header lines. When this argument
      is specified, only a subset of CSV columns will be parsed and returned,
      corresponding to the columns specified. Using this results in faster
      parsing and lower memory usage. If both this and `column_defaults` are
      specified, these must have the same lengths, and `column_defaults` is
      assumed to be sorted in order of increasing column index.
    field_delim: An optional `string`. Defaults to `","`. Char delimiter to
      separate fields in a record.
    use_quote_delim: An optional bool. Defaults to `True`. If false, treats
      double quotation marks as regular characters inside of the string fields.
    na_value: Additional string to recognize as NA/NaN.
    header: A bool that indicates whether the first rows of provided CSV files
      correspond to header lines with column names, and should not be included
      in the data.
    comment: An optional character string that marks lines that should not be
      parsed as csv records. If this is provided, all lines that start with
      this character will not be parsed.
    num_epochs: An int specifying the number of times this dataset is repeated.
      If None, cycles through the dataset forever.
    shuffle: A bool that indicates whether the input should be shuffled.
    shuffle_buffer_size: Buffer size to use for shuffling. A large buffer size
      ensures better shuffling, but would increase memory usage and startup
      time.
    shuffle_seed: Randomization seed to use for shuffling.
    prefetch_buffer_size: An int specifying the number of feature batches to
      prefetch for performance improvement. Recommended value is the number of
      batches consumed per training step.
    num_parallel_reads: Number of threads used to read CSV records from files.
      If >1, the results will be interleaved.
    num_parallel_parser_calls: Number of parallel invocations of the CSV parsing
      function on CSV records.
    sloppy: If `True`, reading performance will be improved at
      the cost of non-deterministic ordering. If `False`, the order of elements
      produced is deterministic prior to shuffling (elements are still
      randomized if `shuffle=True`. Note that if the seed is set, then order
      of elements after shuffling is deterministic). Defaults to `False`.
    default_float_type: Either `tf.float32` or `tf.float64`. If defaults are
      not provided, float-like strings are interpreted to be this type.
    num_rows_for_inference: Number of rows of a file to use for type inference
      if record_defaults is not provided. If None, reads all the rows of all
      the files. Defaults to 100.

  Returns:
    A dataset, where each element is a (features, labels) tuple that corresponds
    to a batch of `batch_size` CSV rows. The features dictionary maps feature
    column names to `Tensor`s containing the corresponding column data, and
    labels is a `Tensor` containing the column data for the label column
    specified by `label_name`.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
    # Create dataset of all matching filenames
    filenames = _get_file_names(file_pattern, False)
    dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
    if shuffle:
        dataset = dataset.shuffle(len(filenames), shuffle_seed)

    # Clean arguments; figure out column names and defaults
    if comment is not None and len(comment) != 1:
        raise ValueError(
            "`comment` arg must be a single-character string or None")

    if column_names is None:
        if not header:
            raise ValueError(
                "Cannot infer column names without a header line.")
        # If column names are not provided, infer from the header lines
        column_names = _infer_column_names(filenames, field_delim,
                                           use_quote_delim)
    if len(column_names) != len(set(column_names)):
        raise ValueError("Cannot have duplicate column names.")

    if select_columns is not None:
        select_columns = _get_sorted_col_indices(select_columns, column_names)

    if column_defaults is not None:
        column_defaults = [
            constant_op.constant([], dtype=x)
            if x in _ACCEPTABLE_CSV_TYPES else x for x in column_defaults
        ]
    else:
        # If column defaults are not provided, infer from records at graph
        # construction time
        column_defaults = _infer_column_defaults(filenames, len(column_names),
                                                 field_delim, use_quote_delim,
                                                 na_value, header, comment,
                                                 default_float_type,
                                                 num_rows_for_inference,
                                                 select_columns)

    if select_columns is not None and len(column_defaults) != len(
            select_columns):
        raise ValueError(
            "If specified, column_defaults and select_columns must have same "
            "length.")
    if select_columns is not None and len(column_names) > len(select_columns):
        # Pick the relevant subset of column names
        column_names = [column_names[i] for i in select_columns]

    if label_name is not None and label_name not in column_names:
        raise ValueError("`label_name` provided must be one of the columns.")

    # Define map and filter functions
    def filter_fn(line):
        return math_ops.not_equal(string_ops.substr(line, 0, 1), comment)

    def filename_to_dataset(filename):
        ds = core_readers.TextLineDataset(filename)
        if header:
            ds = ds.skip(1)
        if comment is not None:
            ds = ds.filter(filter_fn)
        return ds

    def decode_csv(line):
        """Decodes CSV line into features.

    Args:
      line: String tensor corresponding to one csv record.
    Returns:
      A dictionary of feature names to values for that particular record. If
      label_name is provided, extracts the label feature to be returned as the
      second element of the tuple.
    """
        columns = parsing_ops.decode_csv(
            line,
            column_defaults,
            field_delim=field_delim,
            use_quote_delim=use_quote_delim,
            na_value=na_value,
            select_cols=select_columns,
        )
        features = dict(zip(column_names, columns))
        if label_name is not None:
            label = features.pop(label_name)
            return features, label
        return features

    # Read files sequentially or in parallel
    dataset = dataset.apply(
        interleave_ops.parallel_interleave(filename_to_dataset,
                                           cycle_length=num_parallel_reads,
                                           sloppy=sloppy))

    if num_epochs != 1 and shuffle:
        # Use shuffle_and_repeat for perf
        dataset = dataset.apply(
            shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
                                           shuffle_seed))
    elif shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
    elif num_epochs != 1:
        dataset = dataset.repeat(num_epochs)

    # Use map_and_batch for perf
    # TODO(b/76425672): use num_parallel_calls for better performance tuning when
    # that is added
    dataset = dataset.apply(
        batching.map_and_batch(map_func=decode_csv,
                               batch_size=batch_size,
                               num_parallel_batches=int(
                                   ceil(num_parallel_parser_calls /
                                        batch_size))))

    dataset = dataset.prefetch(prefetch_buffer_size)
    return dataset
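The distinguishing addition in this variant is `select_columns`, which restricts parsing to a subset of CSV columns. A hedged usage sketch via the public TF 1.x export; the file pattern and column names are hypothetical.

```python
import tensorflow as tf  # TF 1.x with tf.contrib assumed available

# Only the selected columns are parsed; unselected CSV fields never become
# tensors, which lowers parsing time and memory use.
dataset = tf.contrib.data.make_csv_dataset(
    file_pattern="/tmp/train-*.csv",            # hypothetical path
    batch_size=64,
    select_columns=["age", "income", "label"],  # hypothetical column names
    label_name="label",
    num_epochs=1)
```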
Example #12
def _build_ds(self, seed, count=5, num_elements=20):
  return dataset_ops.Dataset.range(num_elements).apply(
      shuffle_ops.shuffle_and_repeat(buffer_size=5, count=count, seed=seed))
Example #13
    def build_model(self):
        """ Graph Input """
        # images
        if self.custom_dataset:
            Image_Data_Class = ImageData(self.img_size, self.c_dim)
            inputs = tf.data.Dataset.from_tensor_slices(self.data)

            gpu_device = '/gpu:0'
            inputs = inputs.apply(shuffle_and_repeat(self.dataset_num)).apply(
                map_and_batch(Image_Data_Class.image_processing,
                              self.batch_size,
                              num_parallel_batches=16,
                              drop_remainder=True)
            )  #.apply(prefetch_to_device(gpu_device, self.batch_size))

            inputs_iterator = inputs.make_one_shot_iterator()

            self.inputs = inputs_iterator.get_next()

        else:
            self.inputs = tf.placeholder(
                tf.float32,
                [self.batch_size, self.img_size, self.img_size, self.c_dim],
                name='real_images')

        # noises
        self.z = tf.placeholder(tf.float32,
                                [self.batch_size, 1, 1, self.z_dim],
                                name='z')
        """Loss Function"""
        # output of D for real images
        real_logits = self.discriminator(self.inputs)
        print('discriminator input shape:', self.inputs.shape.as_list())

        # output of D for fake images
        fake_images = self.generator(self.z)
        fake_logits = self.discriminator(fake_images, reuse=True)

        if 'wgan' in self.gan_type or self.gan_type == 'dragan':
            GP = self.gradient_penalty(real=self.inputs, fake=fake_images)
        else:
            GP = 0

        # get loss for discriminator
        self.d_loss = discriminator_loss(
            self.gan_type, real=real_logits, fake=fake_logits) + GP

        # get loss for generator
        self.g_loss = generator_loss(self.gan_type, fake=fake_logits)
        """ Training """
        # divide trainable variables into a group for D and a group for G
        t_vars = tf.trainable_variables()
        d_vars = [var for var in t_vars if 'discriminator' in var.name]
        g_vars = [var for var in t_vars if 'generator' in var.name]

        # optimizers
        self.d_optim = tf.train.AdamOptimizer(self.d_learning_rate,
                                              beta1=self.beta1,
                                              beta2=self.beta2).minimize(
                                                  self.d_loss, var_list=d_vars)

        self.g_optim = tf.train.AdamOptimizer(self.g_learning_rate,
                                              beta1=self.beta1,
                                              beta2=self.beta2).minimize(
                                                  self.g_loss, var_list=g_vars)
        """" Testing """
        # for test
        self.fake_images = self.generator(self.z,
                                          is_training=False,
                                          reuse=True)
        """ Summary """
        self.d_sum = tf.summary.scalar("d_loss", self.d_loss)
        self.g_sum = tf.summary.scalar("g_loss", self.g_loss)
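For completeness, a hypothetical sketch of running one training step against the ops built above, assuming `model` is an instance of this class constructed with `custom_dataset=True` (real images then come from the internal input pipeline, so only the noise placeholder needs feeding):

```python
import numpy as np
import tensorflow as tf

# `model` is assumed to be an instance of the class above with build_model() already run.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    z = np.random.normal(size=(model.batch_size, 1, 1, model.z_dim))

    # Update the discriminator, then the generator, on one batch of noise.
    _, d_loss = sess.run([model.d_optim, model.d_loss], feed_dict={model.z: z})
    _, g_loss = sess.run([model.g_optim, model.g_loss], feed_dict={model.z: z})
```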