def _new_writer(self):
    """Open the writer for the next output file and record its path.

    The file name is built from ``options.partitioner_rank_id`` and the
    current ``_process_index`` with the format ``"{:04}-{:08}.rd"`` and is
    placed inside ``self._output_dir``.

    Raises:
        ValueError: if ``options.output_builder`` is neither ``'TF_RECORD'``
            nor ``'CSV_DICT'``.
    """
    # Internal invariant: the previous writer must have been finished.
    assert self._writer is None
    builder = self._options.output_builder
    # Validate explicitly instead of with ``assert``: asserts are stripped
    # under ``python -O``, which would silently treat any unknown builder
    # as CSV.
    if builder not in ('TF_RECORD', 'CSV_DICT'):
        raise ValueError(
            "unsupported output_builder: {!r}".format(builder)
        )
    fname = "{:04}-{:08}.rd".format(
        self._options.partitioner_rank_id, self._process_index
    )
    fpath = os.path.join(self._output_dir, fname)
    if builder == 'TF_RECORD':
        writer = tf.io.TFRecordWriter(fpath)
    else:
        writer = CsvDictWriter(fpath)
    # Register the path only once the writer exists, so a failed open does
    # not leave a never-written file in the reported output list.
    self._writer = writer
    self._output_fpaths.append(fpath)
    self._dumped_item = 0
class CSVDictBuilder(OutputWriter):
    """Output writer that forwards each item's ``csv_record`` to a
    ``CsvDictWriter`` created for the given file path."""

    @classmethod
    def name(cls):
        """Return the identifier of this builder: ``'CSV_DICT'``."""
        return 'CSV_DICT'

    def __init__(self, options, fpath):
        """Initialise the base writer, then create the CSV writer for
        ``fpath``."""
        super(CSVDictBuilder, self).__init__(options, fpath)
        csv_writer = CsvDictWriter(fpath)
        self._writer = csv_writer

    def write_item(self, item):
        """Write ``item.csv_record`` through the underlying writer."""
        record = item.csv_record
        self._writer.write(record)

    def close(self):
        """Close the underlying writer."""
        self._writer.close()
def _new_writer(self):
    """Create the writer for ``self._tmp_fpath`` according to the builder.

    ``'TF_RECORD'`` selects ``tf.io.TFRecordWriter``; ``'CSV_DICT'`` selects
    ``CsvDictWriter``.

    Raises:
        ValueError: if ``options.output_builder`` is neither ``'TF_RECORD'``
            nor ``'CSV_DICT'``.
    """
    # Internal invariant: no writer may be open yet.
    assert self._writer is None
    builder = self._options.output_builder
    if builder == 'TF_RECORD':
        self._writer = tf.io.TFRecordWriter(self._tmp_fpath)
    elif builder == 'CSV_DICT':
        self._writer = CsvDictWriter(self._tmp_fpath)
    else:
        # An ``assert`` here would be stripped under ``python -O`` and any
        # unknown builder would silently be written as CSV; fail loudly.
        raise ValueError(
            "unsupported output_builder: {!r}".format(builder)
        )
class OutputFileWriter(object):
    """Write the items of one partition into a sequence of output files.

    Files are created lazily inside
    ``os.path.join(options.output_dir, common.partition_repr(partition_id))``
    and named with the format ``"{:04}-{:08}.rd"`` from
    ``options.partitioner_rank_id`` and a per-writer file counter.  After
    ``options.output_item_threshold`` items have been written to the current
    file it is closed, and the next appended item starts a new file.
    """

    # Values of ``options.output_builder`` this class knows how to write.
    _SUPPORTED_BUILDERS = ('TF_RECORD', 'CSV_DICT')

    def __init__(self, options, partition_id):
        """Store the options and make sure the partition directory exists."""
        self._options = options
        self._partition_id = partition_id
        self._process_index = 0   # number of files finished so far
        self._writer = None       # currently open writer, if any
        self._dumped_item = 0     # items written to the current file
        self._output_fpaths = []  # paths of every file opened so far
        self._output_dir = os.path.join(
            self._options.output_dir,
            common.partition_repr(self._partition_id)
        )
        if not gfile.Exists(self._output_dir):
            gfile.MakeDirs(self._output_dir)
        assert gfile.IsDirectory(self._output_dir)

    def append_item(self, index, item):
        """Write ``item`` to the current file, rolling over at the threshold.

        Args:
            index: index of the item; only used in the progress log message.
            item: object exposing ``tf_record`` or ``csv_record``, depending
                on ``options.output_builder``.

        Raises:
            ValueError: if ``options.output_builder`` is not supported.
        """
        writer = self._get_output_writer()
        if self._options.output_builder == 'TF_RECORD':
            writer.write(item.tf_record)
        else:
            self._check_output_builder()
            writer.write(item.csv_record)
        self._dumped_item += 1
        if self._dumped_item >= self._options.output_item_threshold:
            self._finish_writer()
            # Log progress once every 16 finished files.
            if self._process_index % 16 == 0:
                logging.info("Output partition %d dump %d files, "
                             "last index %d", self._partition_id,
                             self._process_index, index)

    def finish(self):
        """Close the currently open file, if there is one."""
        self._finish_writer()

    def get_output_files(self):
        """Return the list of paths of the files opened by this writer."""
        return self._output_fpaths

    def _check_output_builder(self):
        """Raise ``ValueError`` unless the configured builder is supported.

        An explicit check is used instead of ``assert`` because asserts are
        stripped under ``python -O``, which would silently treat any unknown
        builder as CSV.
        """
        builder = self._options.output_builder
        if builder not in self._SUPPORTED_BUILDERS:
            raise ValueError(
                "unsupported output_builder: {!r}".format(builder)
            )

    def _get_output_writer(self):
        """Return the open writer, creating a new one when none is open."""
        if self._writer is None:
            self._new_writer()
        return self._writer

    def _new_writer(self):
        """Open the writer for the next output file and record its path."""
        # Internal invariant: the previous writer must have been finished.
        assert self._writer is None
        self._check_output_builder()
        fname = "{:04}-{:08}.rd".format(
            self._options.partitioner_rank_id, self._process_index
        )
        fpath = os.path.join(self._output_dir, fname)
        if self._options.output_builder == 'TF_RECORD':
            writer = tf.io.TFRecordWriter(fpath)
        else:
            writer = CsvDictWriter(fpath)
        # Register the path only once the writer exists, so a failed open
        # does not leave a never-written file in get_output_files().
        self._writer = writer
        self._output_fpaths.append(fpath)
        self._dumped_item = 0

    def _finish_writer(self):
        """Close the open writer (if any) and advance the file counter."""
        if self._writer is not None:
            self._writer.close()
            self._writer = None
            self._dumped_item = 0
            self._process_index += 1
def _make_data_block_writer(self, fpath):
    """Return a new ``CsvDictWriter`` constructed for ``fpath``."""
    csv_writer = CsvDictWriter(fpath)
    return csv_writer
def __init__(self, options, fpath):
    """Initialise the base class with ``options`` and ``fpath``, then
    create the ``CsvDictWriter`` used for this file."""
    base = super(CSVDictBuilder, self)
    base.__init__(options, fpath)
    csv_writer = CsvDictWriter(fpath)
    self._writer = csv_writer