예제 #1
0
    def write_table(self,
                    pa_table: pa.Table,
                    writer_batch_size: Optional[int] = None):
        """Write a pyarrow Table to the writer as a series of record batches.

        If ``self.pa_writer`` is still ``None``, ``_build_writer`` is called
        first with the table's schema as the inferred schema. The byte and
        example counters are updated before any batch is written.

        Args:
            pa_table: the table whose rows are written.
            writer_batch_size: value passed as ``max_chunksize`` when the
                table is split into record batches. Falls back to
                ``self.writer_batch_size`` when ``None``.
        """
        max_rows = (self.writer_batch_size
                    if writer_batch_size is None else writer_batch_size)

        # No writer yet: build one using this table's schema.
        if self.pa_writer is None:
            self._build_writer(inferred_schema=pa_table.schema)

        record_batches: List[pa.RecordBatch] = pa_table.to_batches(
            max_chunksize=max_rows)

        # Do the bookkeeping up front, before the batches are handed over.
        byte_total = 0
        for record_batch in record_batches:
            byte_total += record_batch.nbytes
        self._num_bytes += byte_total
        self._num_examples += pa_table.num_rows

        for record_batch in record_batches:
            self.pa_writer.write_batch(record_batch)
예제 #2
0
    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
        """Write a pyarrow Table to the writer, aligned to the writer schema.

        If ``self.pa_writer`` is still ``None``, ``_build_writer`` is called
        first with the table's schema as the inferred schema. The table is
        then rebuilt with its columns in the order of ``self._schema`` before
        being split into record batches and written.

        Args:
            pa_table: the table whose rows are written.
            writer_batch_size: value passed as ``max_chunksize`` when the
                table is split into record batches. Falls back to
                ``self.writer_batch_size`` when ``None``.
        """
        max_rows = self.writer_batch_size if writer_batch_size is None else writer_batch_size

        # No writer yet: build one using this table's schema.
        if self.pa_writer is None:
            self._build_writer(inferred_schema=pa_table.schema)

        # Pick the columns in the order the target schema lists them, then
        # rebuild the table with that schema attached. A plain .cast is not
        # used because the column order may have to change.
        ordered_columns = [pa_table[column_name] for column_name in self._schema.names]
        pa_table = pa.Table.from_arrays(ordered_columns, schema=self._schema)

        record_batches: List[pa.RecordBatch] = pa_table.to_batches(max_chunksize=max_rows)

        # Do the bookkeeping up front, before the batches are handed over.
        byte_total = 0
        for record_batch in record_batches:
            byte_total += record_batch.nbytes
        self._num_bytes += byte_total
        self._num_examples += pa_table.num_rows

        for record_batch in record_batches:
            self.pa_writer.write_batch(record_batch)
예제 #3
0
 def _TableToRecordBatch(
         self,
         table: pa.Table,
         batch_size: Optional[int] = None) -> List[pa.RecordBatch]:
     """Split ``table`` into a list of record batches.

     Args:
         table: the table to split.
         batch_size: forwarded as ``max_chunksize`` to ``to_batches``;
             ``None`` is passed through unchanged.

     Returns:
         The record batches produced by ``table.to_batches``.
     """
     record_batches = table.to_batches(max_chunksize=batch_size)
     return record_batches
예제 #4
0
 def __init__(self, table: pa.Table):
     """Store the table's schema and record batches, and index batch offsets.

     Args:
         table: the table to split into record batches.
     """
     self._schema = table.schema
     self._batches = table.to_batches()
     # Running total of batch lengths with a leading 0, so entry i is the
     # combined length of all batches before batch i and the last entry is
     # the overall total. The dtype is pinned to int64 because the default
     # platform integer can be 32-bit (e.g. Windows with NumPy < 2), which
     # would make the offsets platform-dependent and able to overflow.
     self._offsets = np.cumsum([0] + [len(b) for b in self._batches],
                               dtype=np.int64)
예제 #5
0
 def __init__(self, table: pa.Table):
     """Store the table's schema and record batches, and index batch offsets.

     Args:
         table: the table to split into record batches.
     """
     self._schema = table.schema
     self._batches = table.to_batches()
     # Leading 0 followed by each batch's length; the cumulative sum then
     # gives, for batch i, the combined length of all batches before it,
     # with the overall total as the final entry.
     batch_lengths = [0]
     batch_lengths.extend(len(record_batch) for record_batch in self._batches)
     self._offsets: np.ndarray = np.cumsum(batch_lengths, dtype=np.int64)
예제 #6
0
 def write_table(self, table: pa.Table):
     """Write every record batch of ``table`` through ``self.write``.

     Args:
         table: the table to write; it is split with ``to_batches()`` and
             each resulting batch is passed to ``self.write`` in order.
     """
     record_batches = table.to_batches()
     for record_batch in record_batches:
         self.write(record_batch)