Пример #1
0
 def _new_writer(self):
     """Open the next output file and install its writer on ``self._writer``.

     The file name is formatted from ``options.partitioner_rank_id`` and the
     current ``_process_index``; its full path is appended to
     ``self._output_fpaths``. Must only be called while no writer is open.
     """
     assert self._writer is None
     opts = self._options
     target = os.path.join(
         self._output_dir,
         "{:04}-{:08}.rd".format(opts.partitioner_rank_id,
                                 self._process_index)
     )
     self._output_fpaths.append(target)
     if opts.output_builder != 'TF_RECORD':
         # Only two builders are handled here; anything else trips the assert.
         assert opts.output_builder == 'CSV_DICT'
         self._writer = CsvDictWriter(target)
     else:
         self._writer = tf.io.TFRecordWriter(target)
     # A freshly opened file has no items dumped into it yet.
     self._dumped_item = 0
Пример #2
0
class CSVDictBuilder(OutputWriter):
    """Output writer that stores each item's ``csv_record`` via CsvDictWriter."""

    @classmethod
    def name(cls):
        """Return the builder name this class is known by."""
        return 'CSV_DICT'

    def __init__(self, options, fpath):
        """Initialise the base writer, then open a CsvDictWriter on ``fpath``."""
        super(CSVDictBuilder, self).__init__(options, fpath)
        self._writer = CsvDictWriter(fpath)

    def write_item(self, item):
        """Write the ``csv_record`` of ``item`` to the underlying writer."""
        record = item.csv_record
        self._writer.write(record)

    def close(self):
        """Close the underlying writer."""
        self._writer.close()
Пример #3
0
 def _new_writer(self):
     """Create the writer for ``self._tmp_fpath`` according to the builder.

     'TF_RECORD' selects ``tf.io.TFRecordWriter``; the only other accepted
     builder is 'CSV_DICT', which selects ``CsvDictWriter``. Must only be
     called while no writer is open.
     """
     assert self._writer is None
     builder = self._options.output_builder
     if builder != 'TF_RECORD':
         assert builder == 'CSV_DICT'
         self._writer = CsvDictWriter(self._tmp_fpath)
         return
     self._writer = tf.io.TFRecordWriter(self._tmp_fpath)
Пример #4
0
    class OutputFileWriter(object):
        """Write the items of one partition into a rolling sequence of files.

        Files are created in the sub-directory of ``options.output_dir`` named
        by ``common.partition_repr(partition_id)``. A new file is started once
        ``options.output_item_threshold`` items have been written to the
        current one.
        """

        def __init__(self, options, partition_id):
            """Set up state and make sure the partition directory exists.

            Args:
                options: settings object; this class reads ``output_dir``,
                    ``output_builder`` ('TF_RECORD' or 'CSV_DICT'),
                    ``output_item_threshold`` and ``partitioner_rank_id``.
                partition_id: id of the partition; passed to
                    ``common.partition_repr`` to name the sub-directory.
            """
            self._options = options
            self._partition_id = partition_id
            # Index of the file currently being (or next to be) written.
            self._process_index = 0
            self._writer = None
            # Number of items written to the currently open file.
            self._dumped_item = 0
            self._output_fpaths = []
            self._output_dir = os.path.join(
                    self._options.output_dir,
                    common.partition_repr(self._partition_id)
                )
            if not gfile.Exists(self._output_dir):
                gfile.MakeDirs(self._output_dir)
            assert gfile.IsDirectory(self._output_dir)

        def append_item(self, index, item):
            """Write ``item`` to the current file, rolling over at the threshold.

            Args:
                index: index of the item; only used in the progress log line.
                item: object exposing ``tf_record`` or ``csv_record``,
                    whichever ``options.output_builder`` selects.
            """
            writer = self._get_output_writer()
            if self._options.output_builder == 'TF_RECORD':
                writer.write(item.tf_record)
            else:
                assert self._options.output_builder == 'CSV_DICT'
                writer.write(item.csv_record)
            self._dumped_item += 1
            if self._dumped_item >= self._options.output_item_threshold:
                self._finish_writer()
                # Log progress once every 16 finished files.
                if self._process_index % 16 == 0:
                    logging.info("Output partition %d dump %d files, "
                                 "last index %d", self._partition_id,
                                 self._process_index, index)

        def finish(self):
            """Close the file that is currently open, if there is one."""
            self._finish_writer()

        def get_output_files(self):
            """Return the list of paths of all files opened so far."""
            return self._output_fpaths

        def _get_output_writer(self):
            """Return the open writer, opening the next file when none is open."""
            if self._writer is None:
                self._new_writer()
            return self._writer

        def _new_writer(self):
            """Open the file for the current index and record its path.

            The file is named ``<partitioner_rank_id:04>-<process_index:08>.rd``
            inside the partition directory.
            """
            assert self._writer is None
            fname = "{:04}-{:08}.rd".format(
                    self._options.partitioner_rank_id,
                    self._process_index
                )
            fpath = os.path.join(self._output_dir, fname)
            if self._options.output_builder == 'TF_RECORD':
                writer = tf.io.TFRecordWriter(fpath)
            else:
                assert self._options.output_builder == 'CSV_DICT'
                writer = CsvDictWriter(fpath)
            # Record the path only once the writer exists, so a failed open
            # does not leave a phantom entry in the output file list.
            self._writer = writer
            self._output_fpaths.append(fpath)
            self._dumped_item = 0

        def _finish_writer(self):
            """Close the open file, if any, and move on to the next file index."""
            if self._writer is not None:
                self._writer.close()
                self._writer = None
                # Advance only when a file was really closed: finishing with
                # nothing open must not consume (and thereby skip) an index.
                self._process_index += 1
            self._dumped_item = 0
Пример #5
0
 def _make_data_block_writer(self, fpath):
     """Return a new ``CsvDictWriter`` constructed for ``fpath``."""
     block_writer = CsvDictWriter(fpath)
     return block_writer
Пример #6
0
 def __init__(self, options, fpath):
     """Initialise the base class, then open a ``CsvDictWriter`` on ``fpath``."""
     base = super(CSVDictBuilder, self)
     base.__init__(options, fpath)
     self._writer = CsvDictWriter(fpath)