Пример #1
0
class SimpleBlockBuilder(BlockBuilder[T]):
    """Block builder that accumulates items in a plain Python list.

    Every item that is added is also handed to a ``SizeEstimator`` so the
    builder can report an estimated memory footprint.
    """

    def __init__(self):
        self._size_estimator = SizeEstimator()
        self._items = []

    def add(self, item: T) -> None:
        """Append a single item and record it with the size estimator."""
        self._items.append(item)
        self._size_estimator.add(item)

    def add_block(self, block: List[T]) -> None:
        """Append all items of ``block`` to this builder.

        Args:
            block: A list of items to append.

        Raises:
            TypeError: If ``block`` is not a ``list``.
        """
        if isinstance(block, list):
            self._items.extend(block)
            record = self._size_estimator.add
            for entry in block:
                record(entry)
            return

        message = (
            f"Got a block of type {type(block)}, expected list. "
            "If you are mapping a function, ensure it returns an "
            "object with the expected type. Block:\n"
            f"{block}"
        )
        raise TypeError(message)

    def num_rows(self) -> int:
        """Return the number of items added so far."""
        count = len(self._items)
        return count

    def build(self) -> Block:
        """Return the accumulated items as a new list (shallow copy)."""
        return self._items.copy()

    def get_estimated_memory_usage(self) -> int:
        """Return the size estimator's byte estimate for the added items."""
        estimated_bytes = self._size_estimator.size_bytes()
        return estimated_bytes
Пример #2
0
class SimpleBlockBuilder(BlockBuilder[T]):
    """Block builder that accumulates items in a plain Python list.

    Every item that is added is also handed to a ``SizeEstimator`` so the
    builder can report an estimated memory footprint.
    """

    def __init__(self):
        self._items = []
        self._size_estimator = SizeEstimator()

    def add(self, item: T) -> None:
        """Append a single item and record it with the size estimator."""
        self._items.append(item)
        self._size_estimator.add(item)

    def add_block(self, block: List[T]) -> None:
        """Append all items of ``block`` to this builder.

        Args:
            block: A list of items to append.

        Raises:
            TypeError: If ``block`` is not a ``list``.
        """
        # Validate with an explicit raise rather than ``assert``: asserts are
        # stripped under ``python -O``, which would let a non-list block be
        # silently extended into the item list.
        if not isinstance(block, list):
            raise TypeError(
                f"Got a block of type {type(block)}, expected list. "
                "If you are mapping a function, ensure it returns an "
                "object with the expected type. Block:\n"
                f"{block}")
        self._items.extend(block)
        for item in block:
            self._size_estimator.add(item)

    def build(self) -> Block:
        """Return the accumulated items as a new list (shallow copy)."""
        return list(self._items)

    def get_estimated_memory_usage(self) -> int:
        """Return the size estimator's byte estimate for the added items."""
        return self._size_estimator.size_bytes()
Пример #3
0
class TableBlockBuilder(BlockBuilder[T]):
    """Base builder for table-like blocks.

    Rows added one at a time are buffered column-wise as Python values and
    periodically compacted into a table once their estimated size reaches
    ``MAX_UNCOMPACTED_SIZE_BYTES``. Subclasses supply the concrete table
    operations via ``_table_from_pydict``, ``_concat_tables`` and
    ``_empty_table``.
    """

    def __init__(self, block_type):
        # The set of uncompacted Python values buffered.
        self._columns = collections.defaultdict(list)
        # The set of compacted tables we have built so far.
        self._tables: List[Any] = []
        self._tables_size_bytes = 0
        # Size estimator for un-compacted table values.
        self._uncompacted_size = SizeEstimator()
        self._num_rows = 0
        self._num_compactions = 0
        # Type that blocks passed to add_block() must be an instance of.
        self._block_type = block_type

    def add(self, item: Union[dict, TableRow]) -> None:
        """Buffer a single row.

        Args:
            item: The row, either a ``dict`` mapping column name to value or
                a ``TableRow`` (converted with ``as_pydict()``).

        Raises:
            ValueError: If ``item`` is neither a ``dict`` nor a ``TableRow``.
        """
        if isinstance(item, TableRow):
            item = item.as_pydict()
        if not isinstance(item, dict):
            raise ValueError(
                "Returned elements of an TableBlock must be of type `dict`, "
                "got {} (type {}).".format(item, type(item))
            )
        for key, value in item.items():
            self._columns[key].append(value)
        self._num_rows += 1
        # Record the row's size *before* the compaction check. If it were
        # recorded afterwards, a compaction would reset the estimator and
        # then charge it for a row that already lives in the compacted
        # table, counting that row's size twice.
        self._uncompacted_size.add(item)
        self._compact_if_needed()

    def add_block(self, block: Any) -> None:
        """Add an already-built table to this builder.

        Args:
            block: A table that is an instance of the ``block_type`` given
                to the constructor.

        Raises:
            TypeError: If ``block`` is not an instance of ``block_type``.
        """
        # Explicit raise instead of ``assert`` so the check survives
        # ``python -O``.
        if not isinstance(block, self._block_type):
            raise TypeError(
                f"Got a block of type {type(block)}, expected "
                f"{self._block_type}. "
                "If you are mapping a function, ensure it returns an "
                "object with the expected type. Block:\n"
                f"{block}")
        accessor = self._append_table(block)
        self._num_rows += accessor.num_rows()

    def _append_table(self, block: Any) -> Any:
        """Store ``block`` and account for its size; return its accessor.

        Does not touch the row count, so it is safe to use for tables whose
        rows have already been counted.
        """
        accessor = BlockAccessor.for_block(block)
        self._tables.append(block)
        self._tables_size_bytes += accessor.size_bytes()
        return accessor

    def _table_from_pydict(self, columns: Dict[str, List[Any]]) -> Block:
        """Build a table from a column-name -> values mapping (subclass hook)."""
        raise NotImplementedError

    def _concat_tables(self, tables: List[Block]) -> Block:
        """Concatenate several tables into one (subclass hook)."""
        raise NotImplementedError

    @staticmethod
    def _empty_table() -> Any:
        """Return an empty table (subclass hook)."""
        raise NotImplementedError

    def build(self) -> Block:
        """Return a single table containing everything added so far."""
        # NOTE(review): buffered rows are placed ahead of previously
        # compacted/added tables, so after a compaction the output is not in
        # insertion order -- confirm this is intended.
        if self._columns:
            tables = [self._table_from_pydict(self._columns)]
        else:
            tables = []
        tables.extend(self._tables)
        if len(tables) > 1:
            return self._concat_tables(tables)
        elif len(tables) > 0:
            return tables[0]
        else:
            return self._empty_table()

    def num_rows(self) -> int:
        """Return the number of rows added via add() and add_block()."""
        return self._num_rows

    def get_estimated_memory_usage(self) -> int:
        """Return compacted table bytes plus the estimate for buffered rows."""
        if self._num_rows == 0:
            return 0
        return self._tables_size_bytes + self._uncompacted_size.size_bytes()

    def _compact_if_needed(self) -> None:
        """Turn the buffered rows into a table once they grow large enough."""
        assert self._columns
        if self._uncompacted_size.size_bytes() < MAX_UNCOMPACTED_SIZE_BYTES:
            return
        block = self._table_from_pydict(self._columns)
        # These rows were already counted one by one in add(); going through
        # add_block() here would add them to _num_rows a second time.
        self._append_table(block)
        self._uncompacted_size = SizeEstimator()
        self._columns.clear()
        self._num_compactions += 1
Пример #4
0
class ArrowBlockBuilder(BlockBuilder[T]):
    """Block builder that produces ``pyarrow.Table`` blocks.

    Rows added one at a time are buffered column-wise as Python values and
    periodically compacted into an Arrow table once their estimated size
    reaches ``MAX_UNCOMPACTED_SIZE_BYTES``.
    """

    def __init__(self):
        """Create an empty builder.

        Raises:
            ImportError: If ``pyarrow`` is not available.
        """
        if pyarrow is None:
            raise ImportError("Run `pip install pyarrow` for Arrow support")
        # The set of uncompacted Python values buffered.
        self._columns = collections.defaultdict(list)
        # The set of compacted tables we have built so far.
        self._tables: List["pyarrow.Table"] = []
        self._tables_nbytes = 0
        # Size estimator for un-compacted table values.
        self._uncompacted_size = SizeEstimator()
        self._num_rows = 0
        self._num_compactions = 0

    def add(self, item: Union[dict, ArrowRow]) -> None:
        """Buffer a single row.

        Args:
            item: The row, either a ``dict`` mapping column name to value or
                an ``ArrowRow`` (converted with ``as_pydict()``).

        Raises:
            ValueError: If ``item`` is neither a ``dict`` nor an ``ArrowRow``.
        """
        if isinstance(item, ArrowRow):
            item = item.as_pydict()
        if not isinstance(item, dict):
            raise ValueError(
                "Returned elements of an ArrowBlock must be of type `dict`, "
                "got {} (type {}).".format(item, type(item)))
        for key, value in item.items():
            self._columns[key].append(value)
        self._num_rows += 1
        # Record the row's size *before* the compaction check. If it were
        # recorded afterwards, a compaction would reset the estimator and
        # then charge it for a row that already lives in the compacted
        # table, counting that row's size twice.
        self._uncompacted_size.add(item)
        self._compact_if_needed()

    def add_block(self, block: "pyarrow.Table") -> None:
        """Add an already-built Arrow table to this builder.

        Args:
            block: The table to add.

        Raises:
            TypeError: If ``block`` is not a ``pyarrow.Table``.
        """
        # Explicit raise instead of ``assert`` so the check survives
        # ``python -O``.
        if not isinstance(block, pyarrow.Table):
            raise TypeError(
                f"Got a block of type {type(block)}, expected pyarrow.Table. "
                "If you are mapping a function, ensure it returns an "
                "object with the expected type. Block:\n"
                f"{block}")
        self._tables.append(block)
        self._tables_nbytes += block.nbytes
        self._num_rows += block.num_rows

    def build(self) -> Block:
        """Return a single Arrow table containing everything added so far."""
        # NOTE(review): buffered rows are placed ahead of previously
        # compacted/added tables, so after a compaction the output is not in
        # insertion order -- confirm this is intended.
        if self._columns:
            tables = [pyarrow.Table.from_pydict(self._columns)]
        else:
            tables = []
        tables.extend(self._tables)
        if len(tables) > 1:
            return pyarrow.concat_tables(tables, promote=True)
        elif len(tables) > 0:
            return tables[0]
        else:
            return pyarrow.Table.from_pydict({})

    def num_rows(self) -> int:
        """Return the number of rows added via add() and add_block()."""
        return self._num_rows

    def get_estimated_memory_usage(self) -> int:
        """Return compacted table bytes plus the estimate for buffered rows."""
        if self._num_rows == 0:
            return 0
        return self._tables_nbytes + self._uncompacted_size.size_bytes()

    def _compact_if_needed(self) -> None:
        """Turn the buffered rows into a table once they grow large enough."""
        assert self._columns
        if self._uncompacted_size.size_bytes() < MAX_UNCOMPACTED_SIZE_BYTES:
            return
        block = pyarrow.Table.from_pydict(self._columns)
        # Rows were already counted in add(), so only the table list and
        # byte total are updated here.
        self._tables.append(block)
        self._tables_nbytes += block.nbytes
        self._uncompacted_size = SizeEstimator()
        self._columns.clear()
        self._num_compactions += 1