def __init__(
    self,
    values: Iterator[int],
    *,
    num_values: int,
    max_value: int,
    num_lower_bits: Optional[int] = None
) -> None:
    """
    Compressed representation of a monotonically-increasing sequence of
    nonnegative integers.

    Each value is split into `num_lower_bits` low-order bits, stored
    verbatim in a packed bit vector, and the remaining high-order bits,
    stored as unary-coded gaps in a second bit vector indexed by Poppy.

    :param values: the sequence to encode, in non-decreasing order.
    :param num_values: the number of items that `values` yields.
    :param max_value: an upper bound (inclusive) on every value.
    :param num_lower_bits: how many low-order bits of each value to store
        verbatim. When omitted, floor(log2(max_value / num_values)) is
        used (clamped at zero).
    :raises ValueError: if a value exceeds `max_value`, if the values are
        not non-decreasing, or if `num_lower_bits` is negative.
    """
    self._size = num_values

    # Number of bits needed to store the largest value. bit_length() is
    # exact, whereas ceil(log2(x)) under-counts when x is a power of two.
    w = max(0, max_value).bit_length()

    # Number of lower-order bits of each value to store in the lower
    # bit vector.
    if num_lower_bits is None:
        # Guard against an empty sequence (num_values == 0), which would
        # otherwise divide by zero.
        ratio = max(0, max_value) // num_values if num_values > 0 else 0
        # floor(log2(ratio)) for ratio >= 1, and 0 otherwise.
        num_lower_bits = max(0, ratio.bit_length() - 1)
    elif num_lower_bits < 0:
        raise ValueError(
            f"num_lower_bits must be nonnegative (got '{num_lower_bits}')"
        )
    self._num_lower_bits = num_lower_bits
    if num_lower_bits != 0:
        self._lower_bits = bitarray()
    else:
        self._lower_bits = None

    # Number of higher-order bits of each value to store in the upper bit
    # vector.
    self._num_upper_bits = w - num_lower_bits
    self._upper_bits = bitarray()

    lower_mask = (1 << num_lower_bits) - 1
    lower_format = f"0{num_lower_bits}b"

    previous_value = 0
    for value in values:
        if value > max_value:
            raise ValueError(
                f"The value '{value}' is larger than the max_value '{max_value}'"
            )
        if value < previous_value:
            raise ValueError(
                "Values must be non-decreasing. "
                f"(Found '{previous_value}' followed by '{value}')"
            )

        if self._lower_bits is not None:
            # Fixed-width, zero-padded binary rendering of the low bits.
            self._lower_bits.extend(format(value & lower_mask, lower_format))

        # Unary-code the gap between consecutive upper parts: one zero per
        # unit of increase, then a one marking this value. The gap is
        # never negative because the values are non-decreasing.
        upper_bits = value >> num_lower_bits
        previous_upper_bits = previous_value >> num_lower_bits
        self._upper_bits.extend([False] * (upper_bits - previous_upper_bits))
        self._upper_bits.append(True)

        previous_value = value

    # Trailing zero bit after the final value's marker.
    self._upper_bits.append(False)
    self._upper_poppy = Poppy(self._upper_bits)
def test_rank_zero(bb: bytes) -> None:
    """
    Check that `rank_zero(i)` equals the number of cleared bits in
    positions 0..i (inclusive) for every position of the input.
    """
    assume(len(bb) % 8 == 0)

    bit_vector = bitarray()
    bit_vector.frombytes(bb)
    poppy = Poppy(bit_vector)

    for index in range(len(bit_vector)):
        prefix = bit_vector[0:(index + 1)]
        expected_zeros = sum(1 - int(bit) for bit in prefix)
        assert poppy.rank_zero(index) == expected_zeros
def test_select_binary_search_bug_is_not_present_2020_09_06() -> None:
    """
    Regression check (per the test name, for a binary-search bug in
    `select`): with only the first and last of 3586 bits set, `select`
    must return exactly those two positions.
    """
    length = 3586
    last_position = length - 1

    bits = bitarray(length)
    bits.setall(False)
    bits[0] = True
    bits[last_position] = True

    poppy = Poppy(bits)
    assert poppy.select(0) == 0
    assert poppy.select(1) == last_position
def test_select_structure(byte_value: int, num_bytes: int) -> None:
    """
    Check every sampled position in Poppy's select structure: the i'th
    sample of a level-0 block must have rank `i * 8192 + 1` relative to
    the number of ones recorded for the start of that block.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)
    poppy = Poppy(bits)

    for block_idx, samples in enumerate(poppy._select_structure):
        # Each level-0 block spans 2**32 bit positions.
        block_bit_offset = (1 << 32) * block_idx
        for sample_idx, sampled_position in enumerate(samples):
            ones_before_block = poppy._level_0[block_idx]
            rank_in_block = (
                poppy.rank(sampled_position + block_bit_offset)
                - ones_before_block
            )
            assert rank_in_block == (sample_idx * 8192 + 1)
def test_select_poppy_big(byte_value: int, num_bytes: int, step_size: int) -> None:
    """
    Spot-check `select` on a large repeated-byte input: for set bits whose
    position is a multiple of `step_size`, `select(k)` must return that
    position, where `k` is the number of set bits before it.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)
    poppy = Poppy(bits)

    ones_seen = 0
    for position, bit in enumerate(bits):
        if not bit:
            continue
        if position % step_size == 0:
            assert poppy.select(ones_seen) == position
        ones_seen += 1
def test_rank_big(byte_value: int, num_bytes: int, step_size: int) -> None:
    """
    Spot-check `rank` on a large repeated-byte input at positions
    0, step_size, 2 * step_size, ..., keeping a running inclusive count of
    set bits as the expected answer.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)
    poppy = Poppy(bits)

    total_bits = len(bits)
    position = 0
    # Inclusive rank at position 0 is just the first bit.
    expected_rank = sum(bits[0:1])
    while position < total_bits:
        assert poppy.rank(position) == expected_rank

        following = position + step_size
        window_end = min(following + 1, total_bits)
        # Add the ones in (position, following], clipped to the input.
        expected_rank += sum(bits[(position + 1):window_end])
        position = following
def test_select_zero_poppy(bb: bytes) -> None:
    """
    Check that `select_zero(k)` returns the position of the k'th cleared
    bit (0-based) for every cleared bit of the input.
    """
    assume(len(bb) % 8 == 0)

    bits = bitarray()
    bits.frombytes(bb)
    poppy = Poppy(bits)

    zero_positions: List[int] = [
        idx for idx in range(len(bits)) if not bits[idx]
    ]
    for ordinal, expected_position in enumerate(zero_positions):
        assert poppy.select_zero(ordinal) == expected_position
def test_l1_layer(byte_value: int, num_bytes: int) -> None:
    """
    Compare the even-indexed entries of Poppy's `_level_1` array against
    popcount sums computed by hand over 2048-bit (256-byte) blocks.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)

    # Reference values: one slot per 2048 bits.
    expected: List[int] = [0] * math.ceil(len(bits) / 2048)
    raw = memoryview(bits)

    # Credit each 8-byte word's popcount to the slot *after* the 256-byte
    # block that contains it.
    for offset in range(0, len(raw), 8):
        slot = 1 + offset // 256
        if slot < len(expected):
            expected[slot] += popcount(raw[offset:offset + 8])

    # Zero the slot at every 2**29-byte boundary before accumulating.
    for offset in range(0, num_bytes, 1 << 29):
        expected[offset // 256] = 0

    # Turn the per-block counts into running totals.
    for slot in range(1, len(expected)):
        expected[slot] += expected[slot - 1]

    poppy = Poppy(bits)

    # Compare element by element rather than with one list equality
    # (the lists can be very large).
    for i, expected_count in enumerate(expected):
        assert poppy._level_1[2 * i] == expected_count, f"Failed at {i}"
def test_getitem(bb: bytes) -> None:
    """
    Check that indexing a Poppy returns the same bit as the underlying
    bitarray at every position.
    """
    assume(len(bb) % 8 == 0)

    source_bits = bitarray()
    source_bits.frombytes(bb)
    poppy = Poppy(source_bits)

    for position, expected_bit in enumerate(source_bits):
        assert poppy[position] == expected_bit
def __init__(
    self,
    *,
    root: A,
    get_left_child: Callable[[A], Optional[A]],
    get_right_child: Callable[[A], Optional[A]]
) -> None:
    """
    Encode the shape of a binary tree as a bit vector.

    The tree is walked breadth-first from `root`. Every visited node
    contributes two bits, left child then right child: True when that
    child exists, False when the accessor returns None.

    :param root: the root node of the tree to encode.
    :param get_left_child: returns a node's left child, or None.
    :param get_right_child: returns a node's right child, or None.
    """
    self._bits = bitarray()

    pending = deque([root])
    while pending:
        node = pending.popleft()
        left, right = get_left_child(node), get_right_child(node)
        for child in (left, right):
            present = child is not None
            self._bits.append(present)
            if present:
                pending.append(child)

    self._poppy = Poppy(self._bits)
def _extract_runs(self, values: IndexedIntSequence) -> List[HuffmanTreeNode]:
    """
    Split `values` into maximal non-decreasing runs.

    A new run begins at index 0 and at every index whose value is smaller
    than its predecessor. The run start positions are also recorded as
    set bits in `self._run_starts` (a Poppy over a bit per element).

    :param values: the indexable integer sequence to partition.
    :return: one `Run(from_, until)` per run, in order of appearance,
        covering the half-open index range of that run.
    """
    total = len(values)
    boundaries: List[int] = [0] + [
        idx for idx in range(1, total) if values[idx] < values[idx - 1]
    ]

    start_flags = bitarray(total)
    start_flags.setall(False)
    for boundary in boundaries:
        start_flags[boundary] = True
    self._run_starts = Poppy(start_flags)

    # Each run ends where the next begins; the last ends at the sequence end.
    run_ends = boundaries[1:] + [total]
    runs: List[HuffmanTreeNode] = []
    for begin, end in zip(boundaries, run_ends):
        runs.append(Run(from_=begin, until=end))
    return runs
class LoudsBinaryTree:
    """
    Bit-vector encoding of a binary tree's shape.

    Nodes are numbered in breadth-first order starting from 0 at the root.
    Node `i` owns bits `2 * i` (left child present?) and `2 * i + 1`
    (right child present?). Navigation uses rank/select on those bits.
    """

    def __init__(
        self,
        *,
        root: A,
        get_left_child: Callable[[A], Optional[A]],
        get_right_child: Callable[[A], Optional[A]]
    ) -> None:
        """
        Build the encoding by walking the tree breadth-first from `root`.

        :param root: the root node of the tree to encode.
        :param get_left_child: returns a node's left child, or None.
        :param get_right_child: returns a node's right child, or None.
        """
        queue = deque([root])
        self._bits = bitarray()
        while queue:
            tree_node = queue.popleft()
            for child in [get_left_child(tree_node), get_right_child(tree_node)]:
                if child is not None:
                    self._bits.append(True)
                    queue.append(child)
                else:
                    self._bits.append(False)
        self._poppy = Poppy(self._bits)

    def get_root(self) -> int:
        """Return the id of the root node (always 0)."""
        return 0

    def get_parent(self, i: int) -> Optional[int]:
        """Return the id of node `i`'s parent, or None for the root."""
        if i == 0:
            return None
        # Node i (i >= 1) is announced by the (i - 1)'th set bit, and each
        # node owns two consecutive bits. Integer division keeps the result
        # exact; float division would lose precision for large positions.
        return self._poppy.select(i - 1) // 2

    def get_left_child(self, i: int) -> Optional[int]:
        """Return the id of node `i`'s left child, or None if absent."""
        if not self._bits[2 * i]:
            return None
        # Inclusive rank of the child's bit is its breadth-first id.
        return self._poppy.rank(2 * i)

    def get_right_child(self, i: int) -> Optional[int]:
        """Return the id of node `i`'s right child, or None if absent."""
        if not self._bits[2 * i + 1]:
            return None
        return self._poppy.rank(2 * i + 1)

    def is_leaf(self, i: int) -> bool:
        """Return True when node `i` has neither a left nor a right child."""
        return not (self._bits[2 * i] or self._bits[2 * i + 1])
def test_relative_count(
    initial_value_block_0: int,
    initial_value_block_1: int,
    initial_value_block_2: int,
    add_0: int,
    add_1: int,
    add_2: int,
) -> None:
    """
    Check that Poppy's packed relative counts round-trip: after adding an
    initial count and then a second amount to each of three basic blocks,
    reading each block back yields the expected per-block total.
    """
    initial_values = [
        initial_value_block_0,
        initial_value_block_1,
        initial_value_block_2,
    ]
    adds = [add_0, add_1, add_2]

    # Only consider inputs whose per-block total is at most 512.
    for base, extra in zip(initial_values, adds):
        assume(base + extra <= 512)

    packed = 0
    for block, count in enumerate(initial_values):
        packed = Poppy._add_relative_count(
            basic_block_index=block,
            packed_relative_counts=packed,
            pop_count=count,
        )

    for block, count in enumerate(initial_values):
        stored = Poppy._get_relative_count(
            basic_block_index=block, packed_relative_counts=packed
        )
        assert stored == count

    for block, increment in enumerate(adds):
        packed = Poppy._add_relative_count(
            basic_block_index=block,
            packed_relative_counts=packed,
            pop_count=increment,
        )

    for block, (count, increment) in enumerate(zip(initial_values, adds)):
        stored = Poppy._get_relative_count(
            basic_block_index=block, packed_relative_counts=packed
        )
        assert stored == count + increment
def test_l2_layer(bb: bytes) -> None:
    """
    Check the packed per-basic-block counts in the odd-indexed entries of
    Poppy's `_level_1` array: for the first three 64-byte blocks of every
    256-byte group, the stored count must equal the block's popcount.
    """
    assume(len(bb) % 8 == 0)

    bits = bitarray()
    bits.frombytes(bb)
    poppy = Poppy(bits)

    for byte_offset in range(0, len(bb), 64):
        block_in_group = (byte_offset % 256) // 64
        # The fourth block of each group is not checked here.
        if block_in_group == 3:
            continue

        # Popcount of this 64-byte (512-bit) block, clipped to the input.
        first_bit = 8 * byte_offset
        last_bit = min(len(bits), first_bit + 512)
        expected_pop_count = sum(bits[first_bit:last_bit])

        packed = poppy._level_1[(byte_offset // 256) * 2 + 1]
        actual_pop_count = poppy._get_relative_count(
            basic_block_index=block_in_group,
            packed_relative_counts=packed,
        )
        assert expected_pop_count == actual_pop_count
def test_l0_layer(byte_value: int, num_bytes: int) -> None:
    """
    Compare Poppy's `_level_0` array against running popcount totals
    computed by hand over 2**32-bit (2**29-byte) blocks.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)

    # Reference values: one slot per 2**32 bits.
    expected: List[int] = [0] * math.ceil(len(bits) / (2**32))
    raw = memoryview(bits)

    # Credit each 8-byte word's popcount to the slot *after* the block
    # that contains it.
    for offset in range(0, len(raw), 8):
        slot = 1 + offset // (2**29)
        if slot < len(expected):
            expected[slot] += popcount(raw[offset:offset + 8])

    # Turn the per-block counts into running totals.
    for slot in range(1, len(expected)):
        expected[slot] += expected[slot - 1]

    poppy = Poppy(bits)
    assert list(poppy._level_0) == expected
class Permutation:
    """
    Run-based representation of a sequence of integers (a permutation)
    supporting element lookup (`perm[i]`) and inverse lookup (`index_of`).

    The input is split into non-decreasing runs. Runs are merged pairwise,
    smallest nodes first according to the nodes' heap ordering. Every
    merge records one bit per merged element: False if the element came
    from the left child, True if from the right. The merge tree's shape
    is kept in a LOUDS encoding and the merge bits in a Poppy.
    """

    def __init__(self, values: IndexedIntSequence) -> None:
        """Build the structure from the indexable integer sequence `values`."""
        runs = self._extract_runs(values)
        self._build_huffman_tree(values, runs)

    def _extract_runs(self, values: IndexedIntSequence) -> List[HuffmanTreeNode]:
        """
        Split `values` into maximal non-decreasing runs.

        Also records each run's first index as a set bit in
        `self._run_starts`.

        :return: one `Run(from_, until)` per run (half-open index range).
        """
        run_starts: List[int] = [0]
        for i in range(1, len(values)):
            if values[i] < values[i - 1]:
                run_starts.append(i)

        run_starts_bitarray = bitarray(len(values))
        run_starts_bitarray.setall(False)
        for start in run_starts:
            run_starts_bitarray[start] = True
        self._run_starts = Poppy(run_starts_bitarray)

        # Each run ends where the next begins; the last ends at len(values).
        run_ends = run_starts[1:] + [len(values)]
        runs: List[HuffmanTreeNode] = [
            Run(from_=from_index, until=until_index)
            for from_index, until_index in zip(run_starts, run_ends)
        ]
        return runs

    def _build_huffman_tree(self, values: IndexedIntSequence, tree_nodes: List[HuffmanTreeNode]) -> None:
        """
        Merge the runs into a single tree and store its succinct form.

        Sets `self._louds` (tree shape), `self._node_data` (per-node offset,
        in breadth-first order), `self._merge_sort_poppy` (merge bits) and
        `self._run_rank_to_louds_id` (run number -> leaf id).

        :param values: the original sequence the runs index into.
        :param tree_nodes: the runs from `_extract_runs`; consumed as a heap.
        """
        # Determine the tree topology.
        heapq.heapify(tree_nodes)
        merge_sort_bitarray = bitarray()
        merge_sort_offset = 0
        while len(tree_nodes) > 1:
            x = heapq.heappop(tree_nodes)
            y = heapq.heappop(tree_nodes)
            merged = HuffmanInnerNode(
                size=len(x) + len(y),
                left_child=x,
                right_child=y,
                merge_sort_offset=merge_sort_offset
            )

            # Populate the merge sort bitarray's values: one bit per merged
            # element, False when it came from the left child and True when
            # it came from the right.
            it_left = ((value, False) for value in x.iterator(values, merge_sort_bitarray))
            it_right = ((value, True) for value in y.iterator(values, merge_sort_bitarray))
            for _, b in heapq.merge(it_left, it_right):
                merge_sort_bitarray.append(b)
                merge_sort_offset += 1

            heapq.heappush(tree_nodes, merged)

        # Build a LOUDS representation of the tree topology
        louds = LoudsBinaryTree(
            root=tree_nodes[0],
            get_left_child=lambda n: n.left_child if isinstance(n, HuffmanInnerNode) else None,
            get_right_child=lambda n: n.right_child if isinstance(n, HuffmanInnerNode) else None
        )

        # The data stored at each node of the tree is:
        # - The offset into a bitarray (if a node is an inner node)
        # - The offset into the original permutation (if a node is a leaf node)
        # Nodes are visited breadth-first, left child before right child, so
        # list positions line up with the LOUDS node ids.
        node_data: List[int] = []
        queue = deque(tree_nodes)
        while queue:
            tree_node = queue.popleft()
            if isinstance(tree_node, HuffmanInnerNode):
                node_data.append(tree_node.merge_sort_offset)
                queue.append(tree_node.left_child)
                queue.append(tree_node.right_child)
            elif isinstance(tree_node, Run):
                node_data.append(tree_node.from_)
            else:
                raise TypeError

        self._louds = louds
        self._node_data = node_data
        self._merge_sort_poppy = Poppy(merge_sort_bitarray)

        # Map each run (numbered by its 1-based rank among the run starts)
        # to the LOUDS id of the leaf that represents it.
        number_of_runs = self._run_starts.rank(len(self._run_starts) - 1)
        self._run_rank_to_louds_id = [0] * number_of_runs
        for louds_id in range(len(self._node_data)):
            if self._louds.is_leaf(louds_id):
                run_offset = self._node_data[louds_id]
                run_rank = self._run_starts.rank(run_offset)
                self._run_rank_to_louds_id[run_rank - 1] = louds_id

    def __getitem__(self, key: int) -> int:
        """
        Retrieve the i'th element of this permutation.
        """
        # Start at the leaves. Find the index of the current key `k` in the
        # current node. Now, consider the parent node:
        # - If the current node is the left child, get the index of the
        #   `k`'th zero in the parent node via "select_zero".
        # - If the current node is the right child, get the index of the
        #   `k`th one in the parent node via "select".
        # Repeat until the current node is the root, and return `k`.
        run_index = self._run_starts.rank(key) - 1
        current_node = self._run_rank_to_louds_id[run_index]
        key = key - self._node_data[current_node]
        while True:
            parent = self._louds.get_parent(current_node)
            if parent is None:
                return key

            parent_offset = self._node_data[parent]
            if self._louds.get_left_child(parent) == current_node:
                # get the index of the `k`'th zero in the parent node
                parent_rank_zero = (
                    0 if parent_offset == 0
                    else self._merge_sort_poppy.rank_zero(parent_offset - 1)
                )
                key = self._merge_sort_poppy.select_zero(key + parent_rank_zero) - parent_offset
            else:
                # get the index of the `k`th one in the parent node
                parent_rank = (
                    0 if parent_offset == 0
                    else self._merge_sort_poppy.rank(parent_offset - 1)
                )
                key = self._merge_sort_poppy.select(key + parent_rank) - parent_offset
            current_node = parent

    def index_of(self, value: int) -> int:
        """
        Retrieve the index of the given value within this permutation.
        (This is the inverse of the permutation.)
        """
        # Start at the root. Look up the bit at position `value`. Calculate
        # either the rank_1 or rank_0 of that position, depending on whether
        # it's 1 or 0. Descend to either the left or right child, depending
        # on that value.
        current_node = self._louds.get_root()
        while not self._louds.is_leaf(current_node):
            offset = self._node_data[current_node]
            position = offset + value
            if self._merge_sort_poppy[position]:
                ones_before_node = (
                    0 if offset == 0
                    else self._merge_sort_poppy.rank(offset - 1)
                )
                value = self._merge_sort_poppy.rank(position) - ones_before_node - 1
                child = self._louds.get_right_child(current_node)
            else:
                zeros_before_node = (
                    0 if offset == 0
                    else self._merge_sort_poppy.rank_zero(offset - 1)
                )
                value = self._merge_sort_poppy.rank_zero(position) - zeros_before_node - 1
                child = self._louds.get_left_child(current_node)
            assert child is not None
            current_node = child
        # At a leaf, node data is the run's start index in the original input.
        return self._node_data[current_node] + value
class EliasFano:
    """
    Elias-Fano style encoding of a non-decreasing sequence of nonnegative
    integers with random access via `ef[i]`.

    Each value is split into low-order bits, stored verbatim in a packed
    bit vector, and high-order bits, stored as unary-coded gaps in a
    second bit vector indexed by Poppy.
    """

    def __init__(self, values: Iterator[int], *, num_values: int, max_value: int) -> None:
        """
        Compressed representation of a monotonically-increasing sequence of
        nonnegative integers.

        :param values: the sequence to encode, in non-decreasing order.
        :param num_values: the number of items that `values` yields.
        :param max_value: an upper bound (inclusive) on every value.
        :raises ValueError: if a value exceeds `max_value` or the values
            are not non-decreasing.
        """
        # Number of bits needed to store the largest value. bit_length() is
        # exact, whereas ceil(log2(x)) under-counts when x is a power of two.
        w = max(0, max_value).bit_length()

        # Number of lower-order bits of each value to store in the lower
        # bit vector: floor(log2(max_value / num_values)), clamped at zero.
        # The guard keeps an empty sequence (num_values == 0) from dividing
        # by zero.
        ratio = max(0, max_value) // num_values if num_values > 0 else 0
        num_lower_bits = max(0, ratio.bit_length() - 1)
        self._num_lower_bits = num_lower_bits
        if num_lower_bits != 0:
            self._lower_bits = bitarray()
        else:
            self._lower_bits = None

        # Number of higher-order bits of each value to store in the upper bit
        # vector.
        self._num_upper_bits = w - num_lower_bits
        self._upper_bits = bitarray()

        lower_mask = (1 << num_lower_bits) - 1
        lower_format = f"0{num_lower_bits}b"

        previous_value = 0
        for value in values:
            if value > max_value:
                raise ValueError(
                    f"The value '{value}' is larger than the max_value '{max_value}'"
                )
            if value < previous_value:
                raise ValueError(
                    "Values must be non-decreasing. "
                    f"(Found '{previous_value}' followed by '{value}')")

            if self._lower_bits is not None:
                # Fixed-width, zero-padded binary rendering of the low bits.
                self._lower_bits.extend(format(value & lower_mask, lower_format))

            # Unary-code the gap between consecutive upper parts: one zero
            # per unit of increase, then a one marking this value. The gap
            # is never negative because the values are non-decreasing.
            upper_bits = value >> num_lower_bits
            previous_upper_bits = previous_value >> num_lower_bits
            self._upper_bits.extend([False] * (upper_bits - previous_upper_bits))
            self._upper_bits.append(True)

            previous_value = value

        # Trailing zero bit after the final value's marker.
        self._upper_bits.append(False)
        self._upper_poppy = Poppy(self._upper_bits)

    def __getitem__(self, key: int) -> int:
        """
        Return the `key`'th value (0-based) of the encoded sequence.
        """
        if self._lower_bits is not None:
            lower_offset = key * self._num_lower_bits
            lower = int(
                self._lower_bits[lower_offset:lower_offset +
                                 self._num_lower_bits].to01(), 2)
        else:
            lower = 0

        # The key'th set bit is preceded by `key` other set bits, so what
        # remains of its position is the number of zeros before it, which
        # is the value's upper part.
        upper = self._upper_poppy.select(key) - key
        return (upper << self._num_lower_bits) | lower
def test_rank_boundary() -> None:
    """
    Check `rank` exactly at bit position 2**32 on an all-ones input that
    extends 32 bytes past that position.
    """
    boundary = 1 << 32

    all_ones = bitarray()
    all_ones.frombytes(bytes([255]) * ((1 << 29) + 32))
    poppy = Poppy(all_ones)

    # rank is inclusive, so positions 0..boundary contribute boundary + 1 ones.
    assert poppy.rank(boundary) == boundary + 1
def _build_huffman_tree(self, values: IndexedIntSequence, tree_nodes: List[HuffmanTreeNode]) -> None:
    """
    Merge the runs into a single tree and store its succinct form.

    Repeatedly pops the two smallest nodes from a heap (ordering is defined
    by the node types), merges them, and records one bit per merged
    element: False if it came from the left child, True if from the right.

    Sets `self._louds` (tree shape), `self._node_data` (per-node offset,
    in breadth-first order), `self._merge_sort_poppy` (merge bits) and
    `self._run_rank_to_louds_id` (run number -> leaf id). Reads
    `self._run_starts`, which must already be set.

    :param values: the original sequence the runs index into.
    :param tree_nodes: the initial run nodes; consumed as a heap.
    :raises TypeError: if the tree contains a node that is neither a
        `HuffmanInnerNode` nor a `Run`.
    """
    # Determine the tree topology.
    heapq.heapify(tree_nodes)
    merge_sort_bitarray = bitarray()
    merge_sort_offset = 0
    while len(tree_nodes) > 1:
        x = heapq.heappop(tree_nodes)
        y = heapq.heappop(tree_nodes)
        merged = HuffmanInnerNode(
            size=len(x) + len(y),
            left_child=x,
            right_child=y,
            merge_sort_offset=merge_sort_offset
        )

        # Populate the merge sort bitarray's values
        it_left = ((value, False) for value in x.iterator(values, merge_sort_bitarray))
        it_right = ((value, True) for value in y.iterator(values, merge_sort_bitarray))
        for _, b in heapq.merge(it_left, it_right):
            merge_sort_bitarray.append(b)
            merge_sort_offset += 1

        heapq.heappush(tree_nodes, merged)

    # Build a LOUDS representation of the tree topology
    louds = LoudsBinaryTree(
        root=tree_nodes[0],
        get_left_child=lambda n: n.left_child if isinstance(n, HuffmanInnerNode) else None,
        get_right_child=lambda n: n.right_child if isinstance(n, HuffmanInnerNode) else None
    )

    # The data stored at each node of the tree is:
    # - The offset into a bitarray (if a node is an inner node)
    # - The offset into the original permutation (if a node is a leaf node)
    # Nodes are visited breadth-first, left child before right child, so
    # list positions line up with the LOUDS node ids.
    node_data: List[int] = []
    queue = deque(tree_nodes)
    while queue:
        tree_node = queue.popleft()
        if isinstance(tree_node, HuffmanInnerNode):
            node_data.append(tree_node.merge_sort_offset)
            queue.append(tree_node.left_child)
            queue.append(tree_node.right_child)
        elif isinstance(tree_node, Run):
            node_data.append(tree_node.from_)
        else:
            raise TypeError

    self._louds = louds
    self._node_data = node_data
    self._merge_sort_poppy = Poppy(merge_sort_bitarray)

    # Map each run (numbered by its 1-based rank among the run starts) to
    # the LOUDS id of the leaf that represents it.
    number_of_runs = self._run_starts.rank(len(self._run_starts) - 1)
    self._run_rank_to_louds_id = [0] * number_of_runs
    for louds_id in range(len(self._node_data)):
        if self._louds.is_leaf(louds_id):
            run_offset = self._node_data[louds_id]
            run_rank = self._run_starts.rank(run_offset)
            self._run_rank_to_louds_id[run_rank - 1] = louds_id