def test_compressed_runs_bit_array_rank_zero(bb: bytes) -> None:
    """rank_zero(i) must equal the number of zero bits in bits[0..i], inclusive."""
    assume(len(bb) % 8 == 0)
    reference = bitarray()
    reference.frombytes(bb)
    crba = CompressedRunsBitArray(reference)

    # Keep a running tally of zeros rather than re-summing a prefix slice
    # for every index.
    zeros_so_far = 0
    for index, bit in enumerate(reference):
        zeros_so_far += 1 - int(bit)
        assert crba.rank_zero(index) == zeros_so_far
def test_compressed_runs_bit_array_select_zero(bb: bytes) -> None:
    """select_zero(k) must return the index of the k-th (0-based) zero bit."""
    assume(len(bb) % 8 == 0)
    reference = bitarray()
    reference.frombytes(bb)
    crba = CompressedRunsBitArray(reference)

    # Positions of every zero bit, in ascending order.
    zero_positions = [
        index for index, bit in enumerate(reference) if not bit
    ]
    for k, expected_position in enumerate(zero_positions):
        assert crba.select_zero(k) == expected_position
def test_compressed_runs_bit_array_getitem(bb: bytes) -> None:
    """Indexing the compressed array must agree with the source bitarray."""
    assume(len(bb) % 8 == 0)
    reference = bitarray()
    reference.frombytes(bb)
    crba = CompressedRunsBitArray(reference)

    for index, expected_bit in enumerate(reference):
        assert crba[index] == expected_bit
def test_compressed_runs_bit_array_select_example_1b() -> None:
    """Check select_zero against hand-computed answers for a fixed bit string."""
    crba = CompressedRunsBitArray(bitarray('11110000000011010000'))

    # expected_positions[k] is the index of the k-th (0-based) zero bit.
    expected_positions = [4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19]
    for k, position in enumerate(expected_positions):
        assert crba.select_zero(k) == position
def test_compressed_runs_bit_array_select_example_2a() -> None:
    """Check select against hand-computed answers for a fixed bit string."""
    crba = CompressedRunsBitArray(bitarray('100001111111100101111'))

    # expected_positions[k] is the index of the k-th (0-based) one bit.
    expected_positions = [0, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20]
    for k, position in enumerate(expected_positions):
        assert crba.select(k) == position
def _build_huffman_tree(self, values: IndexedIntSequence,
                        tree_nodes: List[HuffmanTreeNode]) -> None:
    """Merge the given runs into a single tree and store it on ``self``.

    The two least nodes (in heap order; presumably smallest-first by size,
    the node ordering is defined elsewhere) are merged repeatedly until one
    root remains.  For each merge, one bit per merged element is appended
    to a shared bit array: ``False`` if the element came from the left
    child, ``True`` if it came from the right child.

    Sets ``self._louds``, ``self._node_data``, ``self._merge_sort_poppy``
    and ``self._run_rank_to_louds_id``.  Reads ``self._run_starts``.

    Args:
        values: The sequence the runs refer to.
        tree_nodes: The initial leaf nodes (runs).  Heapified and consumed
            in place; afterwards it contains only the root.

    Raises:
        TypeError: If a node is neither a ``HuffmanInnerNode`` nor a ``Run``.
    """
    # Determine the tree topology.
    heapq.heapify(tree_nodes)
    merge_sort_bitarray = bitarray()
    merge_sort_offset = 0
    while len(tree_nodes) > 1:
        x = heapq.heappop(tree_nodes)
        y = heapq.heappop(tree_nodes)
        merged = HuffmanInnerNode(size=len(x) + len(y),
                                  left_child=x,
                                  right_child=y,
                                  merge_sort_offset=merge_sort_offset)

        # Populate the merge sort bitarray's values
        it_left = ((value, False)
                   for value in x.iterator(values, merge_sort_bitarray))
        it_right = ((value, True)
                    for value in y.iterator(values, merge_sort_bitarray))
        # Append bit by bit: the child iterators are lazy and may still be
        # reading earlier parts of the same bit array.
        for _, b in heapq.merge(it_left, it_right):
            merge_sort_bitarray.append(b)
            merge_sort_offset += 1

        heapq.heappush(tree_nodes, merged)

    root = tree_nodes[0]

    # Build a LOUDS representation of the tree topology
    louds = LoudsBinaryTree(
        root=root,
        get_left_child=lambda n: n.left_child
        if isinstance(n, HuffmanInnerNode) else None,
        get_right_child=lambda n: n.right_child
        if isinstance(n, HuffmanInnerNode) else None)

    # The data stored at each node of the tree is:
    # - The offset into a bitarray (if a node is an inner node)
    # - The offset into the original permutation (if a node is a leaf node)
    #
    # Nodes are visited breadth-first, left child before right child.
    # NOTE(review): node_data is later indexed by LOUDS id, so this assumes
    # LoudsBinaryTree numbers nodes in the same order -- confirm.
    node_data: List[int] = []
    queue = deque([root])
    while queue:
        tree_node = queue.popleft()
        if isinstance(tree_node, HuffmanInnerNode):
            node_data.append(tree_node.merge_sort_offset)
            queue.append(tree_node.left_child)
            queue.append(tree_node.right_child)
        elif isinstance(tree_node, Run):
            node_data.append(tree_node.from_)
        else:
            raise TypeError

    self._louds = louds
    self._node_data = node_data
    # TODO: The choice of "4" lower-order bits here is somewhat arbitrary.
    # Consider passing it in as a constructor parameter.
    self._merge_sort_poppy = CompressedRunsBitArray(merge_sort_bitarray,
                                                    num_lower_bits=4)

    # Map each run's rank (its index in self._run_starts) to the LOUDS id
    # of the leaf that represents it.
    self._run_rank_to_louds_id = [0] * len(self._run_starts)
    for louds_id, run_offset in enumerate(node_data):
        if louds.is_leaf(louds_id):
            run_rank = self._get_run_start_position(run_offset)
            self._run_rank_to_louds_id[run_rank] = louds_id
class Permutation:
    """Compact sequence supporting element lookup and inverse lookup.

    The input is cut into ascending runs, which are merged pairwise into a
    binary tree.  Each inner node records, with one bit per merged element,
    whether that element came from its left child (0) or right child (1).
    All of those bits live in one shared rank/select bit array, and the tree
    shape is kept in a LOUDS structure.

    NOTE(review): the lookups appear to assume ``values`` is a permutation
    of ``0 .. len(values) - 1`` -- confirm against callers.
    """

    def __init__(self, values: IndexedIntSequence) -> None:
        """Build the structure from ``values``."""
        self._size = len(values)
        runs = self._extract_runs(values)
        self._build_huffman_tree(values, runs)

    def _extract_runs(self,
                      values: IndexedIntSequence) -> List[HuffmanTreeNode]:
        """Split ``values`` into ascending runs.

        A new run starts at every index ``i`` with
        ``values[i] < values[i - 1]``.  The start indices are stored in
        ``self._run_starts``.

        Returns:
            One ``Run(from_, until)`` per run, in order of position; the
            ``until`` bound is the next run's start (or ``len(values)``).
        """
        run_starts: List[int] = [0]
        for i in range(1, len(values)):
            if values[i] < values[i - 1]:
                run_starts.append(i)
        self._run_starts = run_starts

        # Each run ends where the next one starts; the last ends at the end
        # of the input.
        run_ends = run_starts[1:] + [len(values)]
        runs: List[HuffmanTreeNode] = [
            Run(from_=start, until=end)
            for start, end in zip(run_starts, run_ends)
        ]
        return runs

    def _build_huffman_tree(self, values: IndexedIntSequence,
                            tree_nodes: List[HuffmanTreeNode]) -> None:
        """Merge the given runs into a single tree and store it on ``self``.

        The two least nodes (in heap order; presumably smallest-first by
        size, the node ordering is defined elsewhere) are merged repeatedly
        until one root remains.  For each merge, one bit per merged element
        is appended to a shared bit array: ``False`` if the element came
        from the left child, ``True`` if it came from the right child.

        Sets ``self._louds``, ``self._node_data``,
        ``self._merge_sort_poppy`` and ``self._run_rank_to_louds_id``.

        Args:
            values: The sequence the runs refer to.
            tree_nodes: The initial leaf nodes (runs).  Heapified and
                consumed in place; afterwards it contains only the root.

        Raises:
            TypeError: If a node is neither a ``HuffmanInnerNode`` nor a
                ``Run``.
        """
        # Determine the tree topology.
        heapq.heapify(tree_nodes)
        merge_sort_bitarray = bitarray()
        merge_sort_offset = 0
        while len(tree_nodes) > 1:
            x = heapq.heappop(tree_nodes)
            y = heapq.heappop(tree_nodes)
            merged = HuffmanInnerNode(size=len(x) + len(y),
                                      left_child=x,
                                      right_child=y,
                                      merge_sort_offset=merge_sort_offset)

            # Populate the merge sort bitarray's values
            it_left = ((value, False)
                       for value in x.iterator(values, merge_sort_bitarray))
            it_right = ((value, True)
                        for value in y.iterator(values, merge_sort_bitarray))
            # Append bit by bit: the child iterators are lazy and may still
            # be reading earlier parts of the same bit array.
            for _, b in heapq.merge(it_left, it_right):
                merge_sort_bitarray.append(b)
                merge_sort_offset += 1

            heapq.heappush(tree_nodes, merged)

        root = tree_nodes[0]

        # Build a LOUDS representation of the tree topology
        louds = LoudsBinaryTree(
            root=root,
            get_left_child=lambda n: n.left_child
            if isinstance(n, HuffmanInnerNode) else None,
            get_right_child=lambda n: n.right_child
            if isinstance(n, HuffmanInnerNode) else None)

        # The data stored at each node of the tree is:
        # - The offset into a bitarray (if a node is an inner node)
        # - The offset into the original permutation (if a node is a leaf
        #   node)
        #
        # Nodes are visited breadth-first, left child before right child.
        # NOTE(review): node_data is later indexed by LOUDS id, so this
        # assumes LoudsBinaryTree numbers nodes in the same order -- confirm.
        node_data: List[int] = []
        queue = deque([root])
        while queue:
            tree_node = queue.popleft()
            if isinstance(tree_node, HuffmanInnerNode):
                node_data.append(tree_node.merge_sort_offset)
                queue.append(tree_node.left_child)
                queue.append(tree_node.right_child)
            elif isinstance(tree_node, Run):
                node_data.append(tree_node.from_)
            else:
                raise TypeError

        self._louds = louds
        self._node_data = node_data
        # TODO: The choice of "4" lower-order bits here is somewhat arbitrary.
        # Consider passing it in as a constructor parameter.
        self._merge_sort_poppy = CompressedRunsBitArray(merge_sort_bitarray,
                                                        num_lower_bits=4)

        # Map each run's rank (its index in self._run_starts) to the LOUDS
        # id of the leaf that represents it.
        self._run_rank_to_louds_id = [0] * len(self._run_starts)
        for louds_id, run_offset in enumerate(node_data):
            if louds.is_leaf(louds_id):
                run_rank = self._get_run_start_position(run_offset)
                self._run_rank_to_louds_id[run_rank] = louds_id

    def _get_run_start_position(self, run_id: int) -> int:
        """Return the rank of the run that contains position ``run_id``.

        Despite its name, ``run_id`` is compared against the run start
        positions in ``self._run_starts``.

        Returns:
            The index of the last entry of ``self._run_starts`` that is
            ``<= run_id``, or ``-1`` if there is none.
        """
        # Function-scope import keeps this block self-contained.
        from bisect import bisect_right

        return bisect_right(self._run_starts, run_id) - 1

    def __getitem__(self, key: int) -> int:
        """
        Retrieve the i'th element of this permutation.

        Start at the leaves. Find the index of the current key `k` in the
        current node. Now, consider the parent node:

        - If the current node is the left child, get the index of the `k`'th
          zero in the parent node via "select_zero".
        - If the current node is the right child, get the index of the `k`th
          one in the parent node via "select".

        Repeat until the current node is the root, and return `k`.

        Raises:
            IndexError: If ``key`` is not in ``range(len(values))``.
        """
        if not (0 <= key < self._size):
            raise IndexError(f"Index out of bounds: {key}")

        bits = self._merge_sort_poppy
        run_rank = self._get_run_start_position(key)
        current_node = self._run_rank_to_louds_id[run_rank]
        # Position of the element inside its own run.
        key = key - self._node_data[current_node]
        while True:
            parent = self._louds.get_parent(current_node)
            if parent is None:
                return key

            parent_offset = self._node_data[parent]
            if self._louds.get_left_child(parent) == current_node:
                # get the index of the `k`'th zero in the parent node;
                # zeros before the parent's region are skipped first.
                zeros_before = (0 if parent_offset == 0 else
                                bits.rank_zero(parent_offset - 1))
                key = bits.select_zero(key + zeros_before) - parent_offset
            else:
                # get the index of the `k`th one in the parent node;
                # ones before the parent's region are skipped first.
                ones_before = (0 if parent_offset == 0 else
                               bits.rank(parent_offset - 1))
                key = bits.select(key + ones_before) - parent_offset
            current_node = parent

    def index_of(self, value: int) -> int:
        """
        Retrieve the index of the given value within this permutation.
        (This is the inverse of the permutation.)

        Start at the root. Look up the bit at position `value`.
        Calculate either the rank_1 or rank_0 of that position, depending on
        whether it's 1 or 0. Recurse to either the left or right child,
        depending on that value.

        Raises:
            IndexError: If ``value`` is not in ``range(len(values))``.
        """
        # Without this guard an out-of-range value would read bits that
        # belong to another node (or wrap around for negatives) and return
        # a meaningless index.
        if not (0 <= value < self._size):
            raise IndexError(f"Value out of bounds: {value}")

        bits = self._merge_sort_poppy
        current_node = self._louds.get_root()
        while not self._louds.is_leaf(current_node):
            offset = self._node_data[current_node]
            position = offset + value
            if bits[position]:
                # Came from the right child: count ones inside this node's
                # region up to and including `position`, zero-based.
                before = 0 if offset == 0 else bits.rank(offset - 1)
                value = bits.rank(position) - before - 1
                child = self._louds.get_right_child(current_node)
            else:
                # Came from the left child: same, counting zeros.
                before = 0 if offset == 0 else bits.rank_zero(offset - 1)
                value = bits.rank_zero(position) - before - 1
                child = self._louds.get_left_child(current_node)
            assert child is not None
            current_node = child
        return self._node_data[current_node] + value
def test_compressed_runs_bit_array_rank_example_2b() -> None:
    """Check rank_zero against hand-computed answers for a fixed bit string."""
    crba = CompressedRunsBitArray(bitarray('011110000000011010000'))

    # expected_ranks[i] is the number of zero bits at positions 0..i.
    expected_ranks = [
        1, 1, 1, 1, 1,
        2, 3, 4, 5, 6, 7, 8, 9,
        9, 9,
        10, 10,
        11, 12, 13, 14,
    ]
    for index, zeros in enumerate(expected_ranks):
        assert crba.rank_zero(index) == zeros
def test_compressed_runs_bit_array_rank_example_1a() -> None:
    """Check rank against hand-computed answers for a fixed bit string."""
    crba = CompressedRunsBitArray(bitarray('00001111111100101111'))

    # expected_ranks[i] is the number of one bits at positions 0..i.
    expected_ranks = [
        0, 0, 0, 0,
        1, 2, 3, 4, 5, 6, 7, 8,
        8, 8,
        9, 9,
        10, 11, 12, 13,
    ]
    for index, ones in enumerate(expected_ranks):
        assert crba.rank(index) == ones