def _make_test_datasets(self, inputs, **kwargs):
  # Test by comparing CsvDataset's output to what we would get from the
  # equivalent TextLineDataset + map(decode_csv) pipeline.
  filenames = self.setup_files(inputs)
  dataset_expected = core_readers.TextLineDataset(filenames)
  dataset_expected = dataset_expected.map(
      lambda l: gen_parsing_ops.decode_csv(l, **kwargs))
  dataset_actual = readers.CsvDataset(filenames, **kwargs)
  return (dataset_actual, dataset_expected)
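A minimal sketch (not part of the harness above) of the equivalence being tested, with concrete arguments; the file path and column types are made up for illustration:

filenames = ['/tmp/data.csv']  # hypothetical file with two float columns
kwargs = {'record_defaults': [[0.0], [0.0]]}
expected = core_readers.TextLineDataset(filenames).map(
    lambda l: gen_parsing_ops.decode_csv(l, **kwargs))
actual = readers.CsvDataset(filenames, **kwargs)
# Both pipelines should now yield identical tuples of float32 scalars,
# one tuple per CSV row.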
Example #2
def decode_csv_v2(records,
                  record_defaults,
                  field_delim=",",
                  use_quote_delim=True,
                  na_value="",
                  select_cols=None,
                  name=None):
  """Convert CSV records to tensors. Each column maps to one tensor.

  RFC 4180 format is expected for the CSV records.
  (https://tools.ietf.org/html/rfc4180)
  Note that we allow leading and trailing spaces in int or float fields.

  Args:
    records: A `Tensor` of type `string`.
      Each string is a record/row in the csv and all records should have
      the same format.
    record_defaults: A list of `Tensor` objects with specific types.
      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
      One tensor per column of the input record, with either a
      scalar default value for that column or an empty vector if the column is
      required.
    field_delim: An optional `string`. Defaults to `","`.
      Char delimiter to separate fields in a record.
    use_quote_delim: An optional `bool`. Defaults to `True`.
      If false, treats double quotation marks as regular
      characters inside of the string fields (ignoring RFC 4180, Section 2,
      Bullet 5).
    na_value: Additional string to recognize as NA/NaN.
    select_cols: Optional sorted list of column indices to select. If specified,
      only this subset of columns will be parsed and returned.
    name: A name for the operation (optional).

  Returns:
    A list of `Tensor` objects. Has the same type as `record_defaults`.
    Each tensor will have the same shape as `records`.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
  if select_cols is not None and any(select_cols[i] >= select_cols[i + 1]
                                     for i in range(len(select_cols) - 1)):
    raise ValueError("select_cols is not strictly increasing.")
  if select_cols is not None and select_cols[0] < 0:
    raise ValueError("select_cols contains negative values.")
  if select_cols is not None and len(select_cols) != len(record_defaults):
    raise ValueError("Length of select_cols and record_defaults do not match.")
  return gen_parsing_ops.decode_csv(
      records=records,
      record_defaults=record_defaults,
      field_delim=field_delim,
      use_quote_delim=use_quote_delim,
      na_value=na_value,
      name=name,
      select_cols=select_cols,
  )
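As a usage illustration (not from the surrounding source), here is the public wrapper tf.io.decode_csv, which this function backs in TF 2.x, applied to a small batch of records; the sample data is made up:

import tensorflow as tf

records = tf.constant(["1,2.5,hello", "4,,world"])
# One default per column. The empty second field of the second record
# falls back to its default (5.0); a default of [] would instead make
# that column required.
record_defaults = [[0], [5.0], [""]]
ints, floats, strings = tf.io.decode_csv(records, record_defaults)
# ints    -> [1, 4]               (int32, same shape as `records`)
# floats  -> [2.5, 5.0]           (float32; empty field replaced by default)
# strings -> [b"hello", b"world"]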
Example #3
def benchmarkMapWithStrings(self):
  self._setUp(self.STR_VAL)
  for i in range(len(self._filenames)):
    num_cols = self._num_cols[i]
    kwargs = {'record_defaults': [['']] * num_cols}
    dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
    dataset = dataset.map(lambda l: gen_parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
    self._runBenchmark(dataset, num_cols, 'csv_strings_map_decode_csv')
  self._tearDown()
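For comparison, a hedged sketch of the fused counterpart this map-based benchmark is typically measured against; the benchmark tag is an assumption, and the names are reused from the loop body above:

# Hypothetical fused variant: CsvDataset parses each line natively,
# replacing the TextLineDataset + map(decode_csv) combination.
dataset = readers.CsvDataset(
    self._filenames[i], record_defaults=[['']] * num_cols).repeat()
self._runBenchmark(dataset, num_cols, 'csv_strings_fused_dataset')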
Example #4
def benchmarkMapThenBatch(self):
  self._setUp()
  for i in range(len(self._filenames)):
    num_cols = self._num_cols[i]
    kwargs = {'record_defaults': [[0.0]] * num_cols}
    dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
    dataset = dataset.map(
        lambda l: gen_parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
    dataset = dataset.batch(self._batch_size)
    self._runBenchmark(dataset, num_cols, 'csv_map_then_batch')
  self._tearDown()
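Since decode_csv is vectorized over its records input, the opposite ordering is also possible: batch the raw lines first and decode each batch in a single call, amortizing per-element op dispatch. A sketch of that variant under the same loop-body names (the benchmark tag is an assumption):

# Hypothetical batch-then-map variant: decode_csv accepts a vector of
# records, so one call decodes the whole batch at once.
dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
dataset = dataset.batch(self._batch_size)
dataset = dataset.map(
    lambda lines: gen_parsing_ops.decode_csv(lines, **kwargs))  # pylint: disable=cell-var-from-loop
self._runBenchmark(dataset, num_cols, 'csv_batch_then_map')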