Пример #1
0
    def __init__(
        self,
        values: Iterator[int],
        *,
        num_values: int,
        max_value: int,
        num_lower_bits: Optional[int] = None
    ) -> None:
        """
        Compressed representation of a monotonically-increasing sequence of
        nonnegative integers.

        Every value is split in two. Its ``num_lower_bits`` low-order bits are
        appended verbatim to a lower bit vector. Its remaining high-order part
        is written to an upper bit vector as a run of zeros (the gap to the
        previous value's high-order part) followed by a single one. A
        terminating zero is appended before the upper vector is indexed with
        ``Poppy``.

        :param values: The values to encode, in non-decreasing order.
        :param num_values: The number of values. Stored as the size and used
            to derive the default for ``num_lower_bits``.
        :param max_value: Inclusive upper bound on every value.
        :param num_lower_bits: Number of low-order bits stored per value.
            Defaults to ``max_value // num_values``, or ``0`` when
            ``num_values`` is ``0``.
        :raises ValueError: If a value exceeds ``max_value`` or is smaller
            than its predecessor (the first value is compared against ``0``,
            so negative values are rejected as well).
        """

        self._size = num_values

        # Number of bits needed to store the largest value.
        # NOTE(review): ceil(log2(x)) is one short when max_value is an exact
        # power of two (e.g. 8 needs 4 bits, this yields 3). Left unchanged
        # because readers of `_num_upper_bits` are not visible here -- confirm.
        w = math.ceil(math.log2(max(1, max_value)))

        # Number of lower-order bits of each value to store in the lower
        # bit vector.
        if num_lower_bits is None:
            # Floor division stays exact for arbitrarily large bounds (no
            # float round-trip), and an empty sequence needs no lower bits at
            # all instead of failing with ZeroDivisionError.
            # NOTE(review): the usual Elias-Fano choice is
            # floor(log2(max_value / num_values)); confirm this default is
            # intentional.
            num_lower_bits = max_value // num_values if num_values else 0
        self._num_lower_bits = num_lower_bits
        self._lower_bits = bitarray() if num_lower_bits != 0 else None

        # Number of higher-order bits of each value to store in the upper bit
        # vector.
        self._num_upper_bits = w - num_lower_bits
        self._upper_bits = bitarray()

        previous_value = 0
        for value in values:
            if value > max_value:
                raise ValueError(
                    f"The value '{value}' is larger than the max_value '{max_value}'"
                )
            if value < previous_value:
                raise ValueError(
                    "Values must be non-decreasing. "
                    f"(Found '{previous_value}' followed by '{value}')"
                )

            if self._lower_bits is not None:
                # Keep the trailing `num_lower_bits` binary digits, left-padded
                # with zeros so every value occupies a fixed-width slot.
                binary_str = f"{value:b}"
                lower_bits_str = binary_str[-num_lower_bits:].rjust(num_lower_bits, '0')
                self._lower_bits.extend(lower_bits_str)

            # Unary-code the gap between consecutive high-order parts: one
            # zero per unit of gap, then a one marking this value.
            upper_bits = value >> num_lower_bits
            previous_upper_bits = previous_value >> num_lower_bits
            self._upper_bits.extend([False] * max(0, upper_bits - previous_upper_bits))
            self._upper_bits.append(True)

            previous_value = value
        self._upper_bits.append(False)
        self._upper_poppy = Poppy(self._upper_bits)
Пример #2
0
def test_rank_zero(bb: bytes) -> None:
    """
    Check ``Poppy.rank_zero`` against a brute-force count at every position.

    ``rank_zero(i)`` is expected to equal the number of cleared bits in
    positions ``0..i`` inclusive. Inputs whose byte length is not a multiple
    of eight are discarded via ``assume``.
    """
    assume(len(bb) % 8 == 0)

    bits = bitarray()
    bits.frombytes(bb)
    poppy = Poppy(bits)

    # Running count of zeros seen so far, including the current position.
    zeros_so_far = 0
    for position, bit in enumerate(bits):
        zeros_so_far += 1 - int(bit)
        assert poppy.rank_zero(position) == zeros_so_far
Пример #3
0
def test_select_binary_search_bug_is_not_present_2020_09_06() -> None:
    """
    Regression test: ``select`` on a 3586-bit vector whose only set bits are
    the first and the last one must return exactly those two positions.
    """
    length = 3586
    last_position = length - 1

    bits = bitarray(length)
    bits.setall(False)
    bits[0] = True
    bits[last_position] = True
    poppy = Poppy(bits)

    # The k'th set bit (0-based) must be found at the k'th expected position.
    first_found = poppy.select(0)
    assert first_found == 0
    second_found = poppy.select(1)
    assert second_found == last_position
Пример #4
0
def test_select_structure(byte_value: int, num_bytes: int) -> None:
    """
    Validate the sampled select answers stored in ``Poppy._select_structure``.

    For the ``i``'th sample of a level-0 block, the rank at the sampled
    position (relative to the ones counted before that block) must be
    ``i * 8192 + 1``.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)
    poppy = Poppy(bits)

    for level_0_idx, sampling_answers in enumerate(poppy._select_structure):
        # Samples are stored relative to the start of their level-0 block;
        # each such block spans 2**32 bits.
        block_base = (1 << 32) * level_0_idx
        for sample_no, sampling_answer in enumerate(sampling_answers):
            ones_before_block = poppy._level_0[level_0_idx]
            rank_in_block = poppy.rank(sampling_answer + block_base) - ones_before_block
            expected_rank = sample_no * 8192 + 1
            assert rank_in_block == expected_rank
Пример #5
0
def test_select_poppy_big(byte_value: int, num_bytes: int,
                          step_size: int) -> None:
    """
    Spot-check ``Poppy.select`` on a vector made of one repeated byte.

    Every set bit is counted, but ``select`` is only queried for set bits
    whose position is a multiple of ``step_size``.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)
    poppy = Poppy(bits)

    ones_seen = 0
    for position, bit in enumerate(bits):
        if not bit:
            continue
        if position % step_size == 0:
            assert poppy.select(ones_seen) == position
        ones_seen += 1
Пример #6
0
def test_rank_big(byte_value: int, num_bytes: int, step_size: int) -> None:
    """
    Spot-check ``Poppy.rank`` every ``step_size`` positions on a vector made
    of one repeated byte.

    The expected (inclusive) rank is maintained incrementally by summing only
    the bits between two consecutive probe positions.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)
    poppy = Poppy(bits)

    total_bits = len(bits)
    position = 0
    # Inclusive rank at position 0 is just the first bit (0 for empty input).
    expected_rank = sum(bits[:1])
    while position < total_bits:
        assert poppy.rank(position) == expected_rank

        # Fold in the bits in (position, position + step_size], clipped to
        # the end of the vector, to get the rank at the next probe.
        window_start = position + 1
        position += step_size
        window_end = min(position + 1, total_bits)
        expected_rank += sum(bits[window_start:window_end])
Пример #7
0
def test_select_zero_poppy(bb: bytes) -> None:
    """
    Check ``Poppy.select_zero`` against a brute-force list of the positions
    of all cleared bits.

    Inputs whose byte length is not a multiple of eight are discarded via
    ``assume``.
    """
    assume(len(bb) % 8 == 0)

    bits = bitarray()
    bits.frombytes(bb)
    poppy = Poppy(bits)

    # Position of every zero bit, in increasing order.
    zero_positions: List[int] = [
        position for position, bit in enumerate(bits) if not bit
    ]

    for zero_rank, expected_position in enumerate(zero_positions):
        assert poppy.select_zero(zero_rank) == expected_position
Пример #8
0
def test_l1_layer(byte_value: int, num_bytes: int) -> None:
    """
    Recompute the level-1 cumulative popcounts by hand and compare them with
    the even-indexed entries of ``Poppy._level_1``.

    One level-1 entry covers 2048 bits (256 bytes). Entries that start a new
    2**29-byte span are reset to zero before the running sum is taken.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)

    # Manually compute the popcount sums here.
    expected: List[int] = [0] * math.ceil(len(bits) / 2048)

    raw = memoryview(bits)
    for word_start in range(0, len(raw), 8):
        # A block's popcount is credited to the *next* slot, so that after
        # the running sum each slot holds the ones that precede it.
        slot = word_start // 256 + 1
        if slot >= len(expected):
            continue
        expected[slot] += popcount(raw[word_start:word_start + 8])

    for span_start in range(0, num_bytes, 1 << 29):
        expected[span_start // 256] = 0

    for slot in range(1, len(expected)):
        expected[slot] += expected[slot - 1]

    poppy = Poppy(bits)
    # Compare element by element rather than with one list equality, which
    # the original author found unworkable at this size.
    for i, expected_count in enumerate(expected):
        assert poppy._level_1[2 * i] == expected_count, f"Failed at {i}"
Пример #9
0
def test_getitem(bb: bytes) -> None:
    """
    Check that indexing a ``Poppy`` returns the same bit as the underlying
    ``bitarray`` at every position.

    Inputs whose byte length is not a multiple of eight are discarded via
    ``assume``.
    """
    assume(len(bb) % 8 == 0)

    bits = bitarray()
    bits.frombytes(bb)
    poppy = Poppy(bits)

    for position, expected_bit in enumerate(bits):
        assert poppy[position] == expected_bit
Пример #10
0
    def __init__(
        self,
        *,
        root: A,
        get_left_child: Callable[[A], Optional[A]],
        get_right_child: Callable[[A], Optional[A]]
    ) -> None:
        """
        Encode the shape of a binary tree as a level-order bit vector.

        The tree is walked breadth-first from ``root``. For every visited
        node two bits are emitted, left child first: ``True`` if that child
        exists, ``False`` otherwise. The resulting vector is then indexed
        with ``Poppy``.

        :param root: The root node of the tree to encode.
        :param get_left_child: Returns a node's left child, or ``None``.
        :param get_right_child: Returns a node's right child, or ``None``.
        """
        bits = self._bits = bitarray()

        pending = deque([root])
        while pending:
            node = pending.popleft()
            left = get_left_child(node)
            right = get_right_child(node)
            for child in (left, right):
                exists = child is not None
                bits.append(exists)
                if exists:
                    # Existing children are visited later, in level order.
                    pending.append(child)

        self._poppy = Poppy(bits)
Пример #11
0
    def _extract_runs(self, values: IndexedIntSequence) -> List[HuffmanTreeNode]:
        """
        Split ``values`` into maximal non-decreasing runs.

        A new run starts at index 0 and at every index whose value is smaller
        than its predecessor. The start positions are also recorded as set
        bits in ``self._run_starts`` (a ``Poppy`` over ``len(values)`` bits).

        :param values: The indexable sequence to split.
        :returns: One ``Run`` per run, in order of appearance, each covering
            the half-open index range ``[from_, until)``.
        """
        total = len(values)

        run_starts: List[int] = [0]
        run_starts.extend(
            i for i in range(1, total) if values[i] < values[i - 1]
        )

        start_marks = bitarray(total)
        start_marks.setall(False)
        for start in run_starts:
            start_marks[start] = True
        self._run_starts = Poppy(start_marks)

        # Each run ends where the next one begins; the last ends at `total`.
        run_ends = run_starts[1:] + [total]
        runs: List[HuffmanTreeNode] = [
            Run(from_=begin, until=end)
            for begin, end in zip(run_starts, run_ends)
        ]
        return runs
Пример #12
0
class LoudsBinaryTree:
    """
    Succinct, read-only encoding of a binary tree's shape.

    Nodes are identified by their breadth-first index (the root is ``0``).
    Node ``i`` owns the two bits at positions ``2 * i`` and ``2 * i + 1`` of
    the bit vector, which say whether its left and right child exist.
    """

    def __init__(
        self,
        *,
        root: A,
        get_left_child: Callable[[A], Optional[A]],
        get_right_child: Callable[[A], Optional[A]]
    ) -> None:
        """
        Build the encoding by walking the tree breadth-first from ``root``.

        :param root: The root node of the tree to encode.
        :param get_left_child: Returns a node's left child, or ``None``.
        :param get_right_child: Returns a node's right child, or ``None``.
        """
        queue = deque([root])

        self._bits = bitarray()
        while queue:
            tree_node = queue.popleft()
            for child in (get_left_child(tree_node), get_right_child(tree_node)):
                if child is not None:
                    self._bits.append(True)
                    # Existing children are visited later, in level order,
                    # which is what makes their index equal to their rank.
                    queue.append(child)
                else:
                    self._bits.append(False)
        self._poppy = Poppy(self._bits)

    def get_root(self) -> int:
        """Return the index of the root node, which is always ``0``."""
        return 0

    def get_parent(self, i: int) -> Optional[int]:
        """
        Return the index of node ``i``'s parent, or ``None`` for the root.

        Node ``i`` (for ``i > 0``) is announced by the ``(i - 1)``'th set bit
        (0-based); the node owning that bit position is the parent.
        """
        if i == 0:
            return None
        # Integer floor division: exact for any position, whereas going
        # through float division loses precision above 2**53.
        return self._poppy.select(i - 1) // 2

    def get_left_child(self, i: int) -> Optional[int]:
        """Return the index of node ``i``'s left child, or ``None``."""
        if not self._bits[2 * i]:
            return None
        # The number of set bits up to and including this one is the
        # child's breadth-first index (the root takes index 0).
        return self._poppy.rank(2 * i)

    def get_right_child(self, i: int) -> Optional[int]:
        """Return the index of node ``i``'s right child, or ``None``."""
        if not self._bits[2 * i + 1]:
            return None
        return self._poppy.rank(2 * i + 1)

    def is_leaf(self, i: int) -> bool:
        """Return ``True`` if node ``i`` has neither a left nor a right child."""
        return not (self._bits[2 * i] or self._bits[2 * i + 1])
Пример #13
0
def test_relative_count(
    initial_value_block_0: int,
    initial_value_block_1: int,
    initial_value_block_2: int,
    add_0: int,
    add_1: int,
    add_2: int,
) -> None:
    """
    Round-trip three per-block counts through Poppy's packed representation.

    Each block first receives its initial value, which must read back
    unchanged. It then receives an extra amount, after which the sum must
    read back. Cases where a block's total would exceed 512 are discarded via
    ``assume``.
    """
    initial_values = [
        initial_value_block_0, initial_value_block_1, initial_value_block_2
    ]
    adds = [add_0, add_1, add_2]

    for start, extra in zip(initial_values, adds):
        assume(start + extra <= 512)

    def add_to(block: int, packed: int, amount: int) -> int:
        return Poppy._add_relative_count(
            basic_block_index=block,
            packed_relative_counts=packed,
            pop_count=amount)

    def read_from(block: int, packed: int) -> int:
        return Poppy._get_relative_count(
            basic_block_index=block,
            packed_relative_counts=packed)

    # Phase 1: store the initial values and read them back.
    packed_count = 0
    for block, start in enumerate(initial_values):
        packed_count = add_to(block, packed_count, start)

    for block, start in enumerate(initial_values):
        assert read_from(block, packed_count) == start

    # Phase 2: add on top of the stored values and read back the totals.
    for block, extra in enumerate(adds):
        packed_count = add_to(block, packed_count, extra)

    for block, (start, extra) in enumerate(zip(initial_values, adds)):
        assert read_from(block, packed_count) == start + extra
Пример #14
0
def test_l2_layer(bb: bytes) -> None:
    """
    Compare the per-basic-block popcounts packed into the odd-indexed entries
    of ``Poppy._level_1`` with a brute-force bit count.

    A basic block is 64 bytes (512 bits); four of them make up one 256-byte
    group. The fourth block of each group (index 3) is not checked.
    Inputs whose byte length is not a multiple of eight are discarded via
    ``assume``.
    """
    assume(len(bb) % 8 == 0)
    bits = bitarray()
    bits.frombytes(bb)
    poppy = Poppy(bits)

    total_bits = len(bits)
    for block_start in range(0, len(bb), 64):
        group_idx, offset_in_group = divmod(block_start, 256)
        basic_block_idx = offset_in_group // 64
        if basic_block_idx == 3:
            # NOTE(review): presumably the last block's count is not stored
            # in the packed word -- confirm against Poppy.
            continue

        # Calculate the sum of the bits in the 64-byte block.
        first_bit = 8 * block_start
        last_bit = min(total_bits, first_bit + 512)
        expected_pop_count = sum(bits[first_bit:last_bit])

        packed_relative_counts = poppy._level_1[2 * group_idx + 1]
        actual_pop_count = poppy._get_relative_count(
            basic_block_index=basic_block_idx,
            packed_relative_counts=packed_relative_counts)

        assert expected_pop_count == actual_pop_count
Пример #15
0
def test_l0_layer(byte_value: int, num_bytes: int) -> None:
    """
    Recompute the level-0 cumulative popcounts by hand and compare them with
    ``Poppy._level_0``.

    One level-0 entry covers 2**32 bits (2**29 bytes) and holds the number of
    set bits that precede its span.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)

    # Manually compute the popcount sums here.
    expected: List[int] = [0] * math.ceil(len(bits) / (2**32))

    bytes_per_entry = 2**29
    raw = memoryview(bits)
    for word_start in range(0, len(raw), 8):
        # Credit each 8-byte word to the *next* entry so the running sum
        # below yields "ones before this span".
        slot = word_start // bytes_per_entry + 1
        if slot >= len(expected):
            continue
        expected[slot] += popcount(raw[word_start:word_start + 8])

    for slot in range(1, len(expected)):
        expected[slot] += expected[slot - 1]

    poppy = Poppy(bits)
    assert list(poppy._level_0) == expected
Пример #16
0
class Permutation:
    """
    Compressed representation of an integer sequence built from its runs.

    The input is cut into maximal non-decreasing runs. The runs are then
    merged pairwise, always taking the two smallest nodes off a heap. For
    every merge a stretch of bits is appended to one shared bitmap: ``False``
    when the next merged element came from the left node, ``True`` when it
    came from the right. The merge tree's shape is stored as a
    ``LoudsBinaryTree``; ``_node_data`` holds, per tree node in breadth-first
    order, either the node's offset into the merge bitmap (inner nodes) or
    the start index of its run in the input (leaves).
    """

    def __init__(self, values: IndexedIntSequence) -> None:
        """
        Build the structure from ``values``.

        :param values: The indexable sequence to compress.
        """
        runs = self._extract_runs(values)
        self._build_huffman_tree(values, runs)

    def _extract_runs(self, values: IndexedIntSequence) -> List[HuffmanTreeNode]:
        """
        Split ``values`` into maximal non-decreasing runs.

        A new run starts at index 0 and at every index whose value is smaller
        than its predecessor. The start positions are also recorded as set
        bits in ``self._run_starts``.

        :returns: One ``Run`` per run, in order of appearance, each covering
            the half-open index range ``[from_, until)``.
        """
        run_starts: List[int] = [0]

        for i in range(1, len(values)):
            if values[i] < values[i - 1]:
                run_starts.append(i)

        run_starts_bitarray = bitarray(len(values))
        run_starts_bitarray.setall(False)
        for start in run_starts:
            run_starts_bitarray[start] = True
        self._run_starts = Poppy(run_starts_bitarray)

        runs: List[HuffmanTreeNode] = []
        for i, from_index in enumerate(run_starts):
            # A run ends where the next one starts; the last one ends with
            # the input.
            until_index = run_starts[i + 1] if i + 1 < len(run_starts) else len(values)
            runs.append(Run(from_=from_index, until=until_index))

        return runs

    def _build_huffman_tree(self, values: IndexedIntSequence, tree_nodes: List[HuffmanTreeNode]) -> None:
        """
        Merge the runs into a single tree and build all lookup structures.

        Sets ``_louds``, ``_node_data``, ``_merge_sort_poppy`` and
        ``_run_rank_to_louds_id``. Requires ``_run_starts`` to be set already
        (done by ``_extract_runs``).

        :param values: The sequence the runs refer to.
        :param tree_nodes: The runs to merge; heapified and consumed in place.
        :raises TypeError: If the tree contains a node that is neither a
            ``HuffmanInnerNode`` nor a ``Run``.
        """
        # Determine the tree topology.
        heapq.heapify(tree_nodes)
        merge_sort_bitarray = bitarray()
        merge_sort_offset = 0
        while len(tree_nodes) > 1:
            x = heapq.heappop(tree_nodes)
            y = heapq.heappop(tree_nodes)
            merged = HuffmanInnerNode(
                size=len(x) + len(y),
                left_child=x,
                right_child=y,
                merge_sort_offset=merge_sort_offset
            )

            # Populate the merge sort bitarray's values: tag each element
            # with the side it came from and record the tags in merged order.
            it_left = ((value, False) for value in x.iterator(values, merge_sort_bitarray))
            it_right = ((value, True) for value in y.iterator(values, merge_sort_bitarray))

            for _, b in heapq.merge(it_left, it_right):
                merge_sort_bitarray.append(b)
                merge_sort_offset += 1

            heapq.heappush(tree_nodes, merged)

        # Build a LOUDS representation of the tree topology
        louds = LoudsBinaryTree(
            root=tree_nodes[0],
            get_left_child=lambda n: n.left_child if isinstance(n, HuffmanInnerNode) else None,
            get_right_child=lambda n: n.right_child if isinstance(n, HuffmanInnerNode) else None
        )

        # The data stored at each node of the tree is:
        # - The offset into a bitarray (if a node is an inner node)
        # - The offset into the original permutation (if a node is a leaf node)
        # Nodes are visited breadth-first, left before right, which is the
        # same order in which LoudsBinaryTree numbers them.
        node_data: List[int] = []
        queue = deque(tree_nodes)
        while queue:
            tree_node = queue.popleft()
            if isinstance(tree_node, HuffmanInnerNode):
                node_data.append(tree_node.merge_sort_offset)
                queue.append(tree_node.left_child)
                queue.append(tree_node.right_child)
            elif isinstance(tree_node, Run):
                node_data.append(tree_node.from_)
            else:
                raise TypeError(
                    f"Unexpected tree node type: {type(tree_node).__name__}"
                )

        self._louds = louds
        self._node_data = node_data
        self._merge_sort_poppy = Poppy(merge_sort_bitarray)

        # Map "k'th run in input order" to the tree node that holds it.
        number_of_runs = self._run_starts.rank(len(self._run_starts) - 1)
        self._run_rank_to_louds_id = [0] * number_of_runs
        for louds_id, run_offset in enumerate(self._node_data):
            if self._louds.is_leaf(louds_id):
                run_rank = self._run_starts.rank(run_offset)
                self._run_rank_to_louds_id[run_rank - 1] = louds_id

    def _ones_before(self, offset: int) -> int:
        """Count the set bits of the merge bitmap strictly before ``offset``."""
        if offset == 0:
            return 0
        return self._merge_sort_poppy.rank(offset - 1)

    def _zeros_before(self, offset: int) -> int:
        """Count the cleared bits of the merge bitmap strictly before ``offset``."""
        if offset == 0:
            return 0
        return self._merge_sort_poppy.rank_zero(offset - 1)

    def __getitem__(self, key: int) -> int:
        """
        Retrieve the i'th element of this permutation.
        """
        # Start at the leaves. Find the index of the current key `k` in the
        # current node. Now, consider the parent node:
        #   - If the current node is the left child, get the index of the
        #     `k`'th zero in the parent node via "select_zero".
        #   - If the current node is the right child, get the index of the
        #     `k`'th one in the parent node via "select".
        # Repeat until the current node is the root, and return `k`.
        run_start = self._run_starts.rank(key) - 1
        current_node = self._run_rank_to_louds_id[run_start]
        key = key - self._node_data[current_node]

        while True:
            parent = self._louds.get_parent(current_node)
            if parent is None:
                return key

            parent_offset = self._node_data[parent]
            if self._louds.get_left_child(parent) == current_node:
                # Get the index of the `k`'th zero in the parent node.
                absolute = self._merge_sort_poppy.select_zero(
                    key + self._zeros_before(parent_offset))
            else:
                # Get the index of the `k`'th one in the parent node.
                absolute = self._merge_sort_poppy.select(
                    key + self._ones_before(parent_offset))
            # select works on the whole bitmap; make it relative to the
            # parent's own stretch again.
            key = absolute - parent_offset
            current_node = parent

    def index_of(self, value: int) -> int:
        """
        Retrieve the index of the given value within this permutation. (This is
        the inverse of the permutation.)
        """
        # Start at the root. Look up the bit at position `value`. Calculate
        # either the rank_1 or rank_0 of that position, depending on whether
        # it's 1 or 0. Descend to either the left or right child, depending
        # on that value.
        current_node = self._louds.get_root()
        while not self._louds.is_leaf(current_node):
            offset = self._node_data[current_node]
            position = offset + value

            if self._merge_sort_poppy[position]:
                # rank is inclusive, so subtract one to get a 0-based index
                # within the right child.
                value = (
                    self._merge_sort_poppy.rank(position) -
                    self._ones_before(offset)
                ) - 1
                child = self._louds.get_right_child(current_node)
            else:
                value = (
                    self._merge_sort_poppy.rank_zero(position) -
                    self._zeros_before(offset)
                ) - 1
                child = self._louds.get_left_child(current_node)
            assert child is not None
            current_node = child

        return self._node_data[current_node] + value
Пример #17
0
class EliasFano:
    """
    Elias-Fano style encoding of a non-decreasing sequence of nonnegative
    integers, with random access through ``__getitem__``.
    """

    def __init__(self, values: Iterator[int], *, num_values: int,
                 max_value: int) -> None:
        """
        Compressed representation of a monotonically-increasing sequence of
        nonnegative integers.

        Every value is split in two. Its low-order bits are appended verbatim
        to a lower bit vector. Its remaining high-order part is written to an
        upper bit vector as a run of zeros (the gap to the previous value's
        high-order part) followed by a single one. A terminating zero is
        appended before the upper vector is indexed with ``Poppy``.

        :param values: The values to encode, in non-decreasing order.
        :param num_values: The number of values; used to derive how many
            low-order bits are stored per value.
        :param max_value: Inclusive upper bound on every value.
        :raises ValueError: If a value exceeds ``max_value`` or is smaller
            than its predecessor (the first value is compared against ``0``,
            so negative values are rejected as well).
        """

        # Number of bits needed to store the largest value.
        # NOTE(review): ceil(log2(x)) is one short when max_value is an exact
        # power of two (e.g. 8 needs 4 bits, this yields 3). Left unchanged
        # because readers of `_num_upper_bits` are not visible here -- confirm.
        w = math.ceil(math.log2(max(1, max_value)))

        # Number of lower-order bits of each value to store in the lower
        # bit vector. Floor division stays exact for arbitrarily large bounds
        # (no float round-trip), and an empty sequence needs no lower bits at
        # all instead of failing with ZeroDivisionError.
        # NOTE(review): the usual Elias-Fano choice is
        # floor(log2(max_value / num_values)); confirm this is intentional.
        num_lower_bits = max_value // num_values if num_values else 0
        self._num_lower_bits = num_lower_bits
        self._lower_bits = bitarray() if num_lower_bits != 0 else None

        # Number of higher-order bits of each value to store in the upper bit
        # vector.
        self._num_upper_bits = w - num_lower_bits
        self._upper_bits = bitarray()

        previous_value = 0
        for value in values:
            if value > max_value:
                raise ValueError(
                    f"The value '{value}' is larger than the max_value '{max_value}'"
                )
            if value < previous_value:
                raise ValueError(
                    "Values must be non-decreasing. "
                    f"(Found '{previous_value}' followed by '{value}')")

            if self._lower_bits is not None:
                # Keep the trailing `num_lower_bits` binary digits, left-padded
                # with zeros so every value occupies a fixed-width slot.
                binary_str = f"{value:b}"
                lower_bits_str = binary_str[-num_lower_bits:].rjust(
                    num_lower_bits, '0')
                self._lower_bits.extend(lower_bits_str)

            # Unary-code the gap between consecutive high-order parts: one
            # zero per unit of gap, then a one marking this value.
            upper_bits = value >> num_lower_bits
            previous_upper_bits = previous_value >> num_lower_bits
            self._upper_bits.extend(
                [False] * max(0, upper_bits - previous_upper_bits))
            self._upper_bits.append(True)

            previous_value = value
        self._upper_bits.append(False)
        self._upper_poppy = Poppy(self._upper_bits)

    def __getitem__(self, key: int) -> int:
        """
        Return the ``key``'th value (0-based) of the encoded sequence.

        The low-order part is read from its fixed-width slot in the lower bit
        vector. The high-order part is the number of zeros preceding the
        ``key``'th one in the upper bit vector.
        """
        if self._lower_bits is not None:
            lower_offset = key * self._num_lower_bits
            lower = int(
                self._lower_bits[lower_offset:lower_offset +
                                 self._num_lower_bits].to01(), 2)
        else:
            lower = 0

        # `key` ones precede the key'th one, so whatever remains of its
        # position is the count of zeros, i.e. the accumulated gaps.
        upper = self._upper_poppy.select(key) - key
        return (upper << self._num_lower_bits) | lower
Пример #18
0
def test_rank_boundary() -> None:
    """
    Check ``Poppy.rank`` exactly at bit position 2**32 of an all-ones vector
    that is 32 bytes longer than 2**29 bytes.

    With every bit set, the inclusive rank at position ``p`` must be
    ``p + 1``.
    """
    num_bytes = (1 << 29) + 32
    all_ones = bytes([255]) * num_bytes

    bits = bitarray()
    bits.frombytes(all_ones)
    poppy = Poppy(bits)

    boundary = 1 << 32
    expected_rank = boundary + 1
    assert poppy.rank(boundary) == expected_rank
Пример #19
0
    def _build_huffman_tree(self, values: IndexedIntSequence, tree_nodes: List[HuffmanTreeNode]) -> None:
        """
        Merge the given nodes into a single tree and build all lookup
        structures.

        The two smallest nodes are repeatedly popped off a heap and merged.
        For every merge a stretch of bits is appended to one shared bitmap:
        ``False`` when the next merged element came from the left node,
        ``True`` when it came from the right.

        Sets ``_louds``, ``_node_data``, ``_merge_sort_poppy`` and
        ``_run_rank_to_louds_id``. Requires ``_run_starts`` to be set already.

        :param values: The sequence the runs refer to.
        :param tree_nodes: The runs to merge; heapified and consumed in place.
        :raises TypeError: If the tree contains a node that is neither a
            ``HuffmanInnerNode`` nor a ``Run``.
        """
        # Determine the tree topology.
        heapq.heapify(tree_nodes)
        merge_sort_bitarray = bitarray()
        merge_sort_offset = 0
        while len(tree_nodes) > 1:
            x = heapq.heappop(tree_nodes)
            y = heapq.heappop(tree_nodes)
            merged = HuffmanInnerNode(
                size=len(x) + len(y),
                left_child=x,
                right_child=y,
                merge_sort_offset=merge_sort_offset
            )

            # Populate the merge sort bitarray's values: tag each element
            # with the side it came from and record the tags in merged order.
            it_left = ((value, False) for value in x.iterator(values, merge_sort_bitarray))
            it_right = ((value, True) for value in y.iterator(values, merge_sort_bitarray))

            for _, b in heapq.merge(it_left, it_right):
                merge_sort_bitarray.append(b)
                merge_sort_offset += 1

            heapq.heappush(tree_nodes, merged)

        # Build a LOUDS representation of the tree topology
        louds = LoudsBinaryTree(
            root=tree_nodes[0],
            get_left_child=lambda n: n.left_child if isinstance(n, HuffmanInnerNode) else None,
            get_right_child=lambda n: n.right_child if isinstance(n, HuffmanInnerNode) else None
        )

        # The data stored at each node of the tree is:
        # - The offset into a bitarray (if a node is an inner node)
        # - The offset into the original permutation (if a node is a leaf node)
        # Nodes are visited breadth-first, left child before right child.
        node_data: List[int] = []
        queue = deque(tree_nodes)
        while queue:
            tree_node = queue.popleft()
            if isinstance(tree_node, HuffmanInnerNode):
                node_data.append(tree_node.merge_sort_offset)
                queue.append(tree_node.left_child)
                queue.append(tree_node.right_child)
            elif isinstance(tree_node, Run):
                node_data.append(tree_node.from_)
            else:
                raise TypeError(
                    f"Unexpected tree node type: {type(tree_node).__name__}"
                )

        self._louds = louds
        self._node_data = node_data
        self._merge_sort_poppy = Poppy(merge_sort_bitarray)

        # Map "k'th run in input order" to the tree node that holds it.
        number_of_runs = self._run_starts.rank(len(self._run_starts) - 1)
        self._run_rank_to_louds_id = [0] * number_of_runs
        for louds_id, run_offset in enumerate(self._node_data):
            if self._louds.is_leaf(louds_id):
                run_rank = self._run_starts.rank(run_offset)
                self._run_rank_to_louds_id[run_rank - 1] = louds_id