def add_input(
    self, accumulator: _CombinerStatsGeneratorsCombineFnAcc,
    input_record_batch: pa.RecordBatch
) -> _CombinerStatsGeneratorsCombineFnAcc:
  """Buffers the input RecordBatch and flushes once size thresholds are hit."""
  accumulator.input_record_batches.append(input_record_batch)
  num_rows = input_record_batch.num_rows
  accumulator.curr_batch_size += num_rows
  accumulator.curr_byte_size += table_util.TotalByteSize(input_record_batch)
  # Merge and process the buffered batches if enough rows or bytes have
  # accumulated.
  self._maybe_do_batch(accumulator)
  self._num_instances.inc(num_rows)
  return accumulator
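# A minimal, self-contained sketch of the batching pattern add_input relies
# on. The names, thresholds, and flush logic below are hypothetical (the real
# _maybe_do_batch lives in the combiner), and RecordBatch.nbytes stands in
# for the internal table_util.TotalByteSize.
import pyarrow as pa

_ASSUMED_MAX_ROWS = 1000
_ASSUMED_MAX_BYTES = 1 << 20


class _Acc:

  def __init__(self):
    self.input_record_batches = []
    self.curr_batch_size = 0  # rows buffered so far
    self.curr_byte_size = 0   # bytes buffered so far


def _maybe_do_batch_sketch(acc):
  # Once enough rows or bytes are buffered, merge the buffered batches into
  # one Table (so downstream work runs over fewer, larger chunks) and reset
  # the counters.
  if (acc.curr_batch_size >= _ASSUMED_MAX_ROWS or
      acc.curr_byte_size >= _ASSUMED_MAX_BYTES):
    merged = pa.Table.from_batches(acc.input_record_batches)
    del merged  # a real implementation would feed this to the generators
    acc.input_record_batches = []
    acc.curr_batch_size = 0
    acc.curr_byte_size = 0


acc = _Acc()
rb = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ["f"])
acc.input_record_batches.append(rb)
acc.curr_batch_size += rb.num_rows
acc.curr_byte_size += rb.nbytes
_maybe_do_batch_sketch(acc)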
def process(self, record_batch: pa.RecordBatch) -> Iterable[pa.RecordBatch]:
  num_rows = record_batch.num_rows
  self._num_rows.inc(num_rows)
  self._UpdateNumCellsCounters(record_batch)
  total_byte_size = table_util.TotalByteSize(
      record_batch, ignore_unsupported=True)
  self._byte_size_dist.update(total_byte_size)
  # These distributions are per-row, and therefore expensive to update because
  # dist.update() needs to be called num_rows * k times. Only update them on a
  # sampled fraction of batches.
  if np.random.rand() < self._dist_update_prob:
    self._UpdateNumColumnsDist(record_batch)
    self._UpdateNumValuesDist(record_batch)
  yield record_batch
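# A self-contained illustration of the sampling trick used above: cheap
# counters are updated on every batch, while the expensive per-row
# distributions are only updated on a random fraction of batches. All names
# below are hypothetical stand-ins for the DoFn's internals.
import numpy as np
import pyarrow as pa

_ASSUMED_DIST_UPDATE_PROB = 0.05


def _telemetry_sketch(record_batch, counters, num_columns_samples):
  counters["num_rows"] += record_batch.num_rows  # cheap: O(1) per batch
  counters["byte_size"] += record_batch.nbytes   # cheap: O(1) per batch
  if np.random.rand() < _ASSUMED_DIST_UPDATE_PROB:
    # Expensive: one distribution update per row. Sampling keeps the expected
    # cost at num_rows * _ASSUMED_DIST_UPDATE_PROB updates per batch.
    for _ in range(record_batch.num_rows):
      num_columns_samples.append(record_batch.num_columns)


counters, samples = {"num_rows": 0, "byte_size": 0}, []
rb = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ["f"])
_telemetry_sketch(rb, counters, samples)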
def test_simple(self, factory):
  # 3 int64 values (24 bytes).
  # 4 int32 offsets (16 bytes).
  # 1 null bitmap byte for the outer ListArray.
  # 1 null bitmap byte for the inner Int64Array.
  # 42 bytes in total.
  list_array = pa.array([[1, 2], [3], None], type=pa.list_(pa.int64()))

  # 1 null bitmap byte for the outer StructArray.
  # 1 null bitmap byte for the inner Int64Array.
  # 3 int64 values (24 bytes).
  # 26 bytes in total.
  struct_array = pa.array([{"a": 1}, {"a": 2}, {"a": 3}],
                          type=pa.struct([pa.field("a", pa.int64())]))

  entity = factory([list_array, struct_array], ["a1", "a2"])
  self.assertEqual(42 + 26, table_util.TotalByteSize(entity))
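# The expected sizes in the test can be reproduced with plain arithmetic,
# assuming the accounting TotalByteSize performs (values + offsets + null
# bitmaps, each bitmap rounded up to a whole byte):
list_bytes = (
    3 * 8     # 3 int64 values
    + 4 * 4   # 4 int32 offsets (num_elements + 1 for a ListArray)
    + 1       # outer ListArray null bitmap: 3 bits -> 1 byte
    + 1)      # inner Int64Array null bitmap: 3 bits -> 1 byte
assert list_bytes == 42

struct_bytes = (
    1         # outer StructArray null bitmap
    + 1       # inner Int64Array null bitmap
    + 3 * 8)  # 3 int64 values
assert struct_bytes == 26
# `factory` is presumably parameterized over pa.Table.from_arrays and
# pa.RecordBatch.from_arrays, so the same accounting holds for both shapes.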